PAT support

Aggelos Economopoulos aoiko at cc.ece.ntua.gr
Mon Apr 19 10:14:49 PDT 2010


Here's a patch for Page Attribute Table support that I ported from FreeBSD
quite a while back. I'm sending it to submit@ in case someone gets
interested and deals with the cpu identification part (probably
integrating the FreeBSD code for that too) so we can integrate it. PAT
will get used more and more in new code and will significantly help
porting.

As you'll notice in pmap_init_pat()

+#ifdef notyet
+       if (cpu_vendor_id != CPU_VENDOR_INTEL ||
+           (I386_CPU_FAMILY(cpu_id) == 6 && I386_CPU_MODEL(cpu_id) >= 0xe)) {
+#else
+       if (!0) {
+#endif

so we need at least I386_CPU_{FAMILY,MODEL} (to be added to
i386/include/specialreg.h), and having the companion cpu identification
code would be nice too (ISTR it was straightforward but not trivial; not
sure if I remember correctly now. Looks trivial enough :/)
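
As a starting point, here is a sketch of what those macros might look
like, assuming the standard CPUID leaf-1 signature layout (FreeBSD's
CPUID_TO_FAMILY/CPUID_TO_MODEL are the obvious template; treat the exact
names and placement as my guess, not final code):

/*
 * Hypothetical sketch for i386/include/specialreg.h -- NOT the final code.
 * cpu_id is the CPUID leaf 1 signature: model in bits 4-7 (extended
 * model in bits 16-19), family in bits 8-11 (extended family in bits
 * 20-27).  The extended family field only applies to family 0xf; the
 * extended model field to families 6 and up.
 */
#define	I386_CPU_FAMILY(id)						\
	((((id) & 0xf00) >> 8) +					\
	 ((((id) & 0xf00) == 0xf00) ? (((id) >> 20) & 0xff) : 0))
#define	I386_CPU_MODEL(id)						\
	((((id) & 0xf0) >> 4) |						\
	 ((((id) & 0xf00) >= 0x600) ? (((id) >> 12) & 0xf0) : 0))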

I *have* exercised the PAT support through mxge(4), so I don't think
you'll get any panics ;) If you want to sanity-check how a caching mode
ends up in the PTE bits, there's a worked example after the patch.
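
For reference, a minimal usage sketch of the new API (everything here
other than pmap_change_attr() and PAT_WRITE_COMBINING is made up for
illustration):

/*
 * Hypothetical driver fragment: flip an already-mapped kernel VA
 * range (e.g. device memory from pmap_mapdev()) to write-combining.
 * On CPUs without PAT, pmap_cache_bits() silently downgrades WC to
 * uncacheable, so this is safe to call unconditionally.
 */
static int
map_region_wc(vm_offset_t va, vm_size_t size)
{
	int error;

	error = pmap_change_attr(va, size, PAT_WRITE_COMBINING);
	if (error != 0)
		kprintf("pmap_change_attr: WC failed (%d)\n", error);
	return (error);
}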

Aggelos
diff --git a/sys/cpu/i386/include/cpufunc.h b/sys/cpu/i386/include/cpufunc.h
index 9b7e80e..9c57446 100644
--- a/sys/cpu/i386/include/cpufunc.h
+++ b/sys/cpu/i386/include/cpufunc.h
@@ -396,6 +396,14 @@ cpu_invltlb(void)
 
 #endif
 
+#ifndef _CPU_WBINVL_DEFINED
+static __inline void
+cpu_wbinvl(void)
+{
+	__asm __volatile("wbinvd");
+}
+#endif
+
 static __inline void
 cpu_nop(void)
 {
diff --git a/sys/cpu/i386/include/pmap.h b/sys/cpu/i386/include/pmap.h
index f7f0def..ccaef4e 100644
--- a/sys/cpu/i386/include/pmap.h
+++ b/sys/cpu/i386/include/pmap.h
@@ -62,10 +62,12 @@
 #define PG_A		0x020	/* A	Accessed		*/
 #define	PG_M		0x040	/* D	Dirty			*/
 #define	PG_PS		0x080	/* PS	Page size (0=4k,1=4M)	*/
+#define PG_PTE_PAT	0x080	/* PAT	PAT index		*/
 #define	PG_G		0x100	/* G	Global			*/
 #define	PG_AVAIL1	0x200	/*    /	Available for system	*/
 #define	PG_AVAIL2	0x400	/*   <	programmers use		*/
 #define	PG_AVAIL3	0x800	/*    \				*/
+#define PG_PDE_PAT	0x1000	/* PAT	PAT index		*/
 
 
 /* Our various interpretations of the above */
diff --git a/sys/cpu/i386/include/specialreg.h b/sys/cpu/i386/include/specialreg.h
index e0207ec..36a9811 100644
--- a/sys/cpu/i386/include/specialreg.h
+++ b/sys/cpu/i386/include/specialreg.h
@@ -159,6 +159,7 @@
 #define MSR_MTRR64kBase		0x250
 #define MSR_MTRR16kBase		0x258
 #define MSR_MTRR4kBase		0x268
+#define	MSR_PAT			0x277
 #define MSR_MTRRdefType		0x2ff
 #define MSR_MC0_CTL		0x400
 #define MSR_MC0_STATUS		0x401
@@ -184,6 +185,17 @@
 #define MSR_THERM_INTERRUPT	0x19b
 #define MSR_THERM_STATUS	0x19c
 
+/*
+ * PAT modes.
+ */
+#define	PAT_UNCACHEABLE		0x00
+#define	PAT_WRITE_COMBINING	0x01
+#define	PAT_WRITE_THROUGH	0x04
+#define	PAT_WRITE_PROTECTED	0x05
+#define	PAT_WRITE_BACK		0x06
+#define	PAT_UNCACHED		0x07
+#define	PAT_VALUE(i, m)		((long long)(m) << (8 * (i)))
+#define	PAT_MASK(i)		PAT_VALUE(i, 0xff)
 
 /*
  * Constants related to MTRRs
diff --git a/sys/platform/pc32/i386/mp_machdep.c b/sys/platform/pc32/i386/mp_machdep.c
index eba146f..5e82c3c 100644
--- a/sys/platform/pc32/i386/mp_machdep.c
+++ b/sys/platform/pc32/i386/mp_machdep.c
@@ -624,6 +624,8 @@ init_secondary(void)
 	load_cr0(cr0);
 	pmap_set_opt();		/* PSE/4MB pages, etc */
 
+	pmap_init_pat();	/* Page Attribute Table */
+
 	/* set up CPU registers and state */
 	cpu_setregs();
 
diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c
index 0504439..bf04044 100644
--- a/sys/platform/pc32/i386/pmap.c
+++ b/sys/platform/pc32/i386/pmap.c
@@ -158,6 +158,7 @@ vm_offset_t KvaSize;		/* max size of kernel virtual address space */
 static boolean_t pmap_initialized = FALSE;	/* Has pmap_init completed? */
 static int pgeflag;		/* PG_G or-in */
 static int pseflag;		/* PG_PS or-in */
+static int pat_works;		/* Is page attribute table sane? */
 
 static vm_object_t kptobj;
 
@@ -216,6 +217,7 @@ static unsigned * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
 static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
 static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
+static int pmap_cache_bits(int, boolean_t);
 
 static unsigned pdir4mb;
 
@@ -295,6 +297,55 @@ pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 	return (0);
 }
 
+/*
+ * Setup the PAT MSR.
+ */
+void
+pmap_init_pat(void)
+{
+	uint64_t pat_msr;
+
+	/* Bail if this CPU doesn't implement PAT. */
+	if (!(cpu_feature & CPUID_PAT))
+		return;
+
+#ifdef notyet
+	if (cpu_vendor_id != CPU_VENDOR_INTEL ||
+	    (I386_CPU_FAMILY(cpu_id) == 6 && I386_CPU_MODEL(cpu_id) >= 0xe)) {
+#else
+	if (!0) {
+#endif
+		/*
+		 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
+		 * Program 4 and 5 as WP and WC.
+		 * Leave 6 and 7 as UC and UC-.
+		 */
+		pat_msr = rdmsr(MSR_PAT);
+		pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
+		pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
+		    PAT_VALUE(5, PAT_WRITE_COMBINING);
+		pat_works = 1;
+	} else {
+		/*
+		 * Due to some Intel errata, we can only safely use the lower 4
+		 * PAT entries.  Thus, just replace PAT Index 2 with WC instead
+		 * of UC-.
+		 *
+		 *   Intel Pentium III Processor Specification Update
+		 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
+		 * or Mode C Paging)
+		 *
+		 *   Intel Pentium IV  Processor Specification Update
+		 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
+		 */
+		pat_msr = rdmsr(MSR_PAT);
+		pat_msr &= ~PAT_MASK(2);
+		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
+		pat_works = 0;
+	}
+	wrmsr(MSR_PAT, pat_msr);
+}
+
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
@@ -446,6 +497,7 @@ pmap_bootstrap(vm_paddr_t firstaddr, vm_paddr_t loadaddr)
 	}
 #endif
 
+	pmap_init_pat();
 	/*
 	 * We need to finish setting up the globaldata page for the BSP.
 	 * locore has already populated the page table for the mdglobaldata
@@ -554,6 +606,89 @@ pmap_init2(void)
  * Low level helper routines.....
  ***************************************************/
 
+/*
+ * Determine the appropriate bits to set in a PTE or PDE for a specified
+ * caching mode.
+ */
+static int
+pmap_cache_bits(int mode, boolean_t is_pde)
+{
+	int pat_flag, pat_index, cache_bits;
+
+	/* The PAT bit is different for PTE's and PDE's. */
+	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
+
+	/* If we don't support PAT, map extended modes to older ones. */
+	if (!(cpu_feature & CPUID_PAT)) {
+		switch (mode) {
+		case PAT_UNCACHEABLE:
+		case PAT_WRITE_THROUGH:
+		case PAT_WRITE_BACK:
+			break;
+		case PAT_UNCACHED:
+		case PAT_WRITE_COMBINING:
+		case PAT_WRITE_PROTECTED:
+			mode = PAT_UNCACHEABLE;
+			break;
+		}
+	}
+
+	/* Map the caching mode to a PAT index. */
+	if (pat_works) {
+		switch (mode) {
+		case PAT_UNCACHEABLE:
+			pat_index = 3;
+			break;
+		case PAT_WRITE_THROUGH:
+			pat_index = 1;
+			break;
+		case PAT_WRITE_BACK:
+			pat_index = 0;
+			break;
+		case PAT_UNCACHED:
+			pat_index = 2;
+			break;
+		case PAT_WRITE_COMBINING:
+			pat_index = 5;
+			break;
+		case PAT_WRITE_PROTECTED:
+			pat_index = 4;
+			break;
+		default:
+			panic("Unknown caching mode %d\n", mode);
+		}
+	} else {
+		switch (mode) {
+		case PAT_UNCACHED:
+		case PAT_UNCACHEABLE:
+		case PAT_WRITE_PROTECTED:
+			pat_index = 3;
+			break;
+		case PAT_WRITE_THROUGH:
+			pat_index = 1;
+			break;
+		case PAT_WRITE_BACK:
+			pat_index = 0;
+			break;
+		case PAT_WRITE_COMBINING:
+			pat_index = 2;
+			break;
+		default:
+			panic("Unknown caching mode %d\n", mode);
+		}
+	}
+
+	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+	cache_bits = 0;
+	if (pat_index & 0x4)
+		cache_bits |= pat_flag;
+	if (pat_index & 0x2)
+		cache_bits |= PG_NC_PCD;
+	if (pat_index & 0x1)
+		cache_bits |= PG_NC_PWT;
+	return (cache_bits);
+}
+
 #if defined(PMAP_DIAGNOSTIC)
 
 /*
@@ -3210,6 +3345,70 @@ pmap_unmapdev(vm_offset_t va, vm_size_t size)
 	kmem_free(&kernel_map, base, size);
 }
 
+int
+pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
+{
+	vm_offset_t base, offset, tmpva;
+	pt_entry_t *pte;
+	u_int opte, npte;
+	pd_entry_t *pde;
+	pmap_inval_info info;
+
+	base = trunc_page(va);
+	offset = va & PAGE_MASK;
+	size = roundup(offset + size, PAGE_SIZE);
+
+	/*
+	 * Only supported on kernel virtual addresses
+	 */
+	if (base < KvaStart)
+		return (EINVAL);
+
+	/* 4MB pages and pages that aren't mapped aren't supported. */
+	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
+		pde = pmap_pde(&kernel_pmap, tmpva);
+		if (*pde & PG_PS)
+			return (EINVAL);
+		if (*pde == 0)
+			return (EINVAL);
+		pte = vtopte(tmpva);
+		if (*pte == 0)
+			return (EINVAL);
+	}
+
+	pmap_inval_init(&info);
+	/*
+	 * Ok, all the pages exist and are 4k, so run through them updating
+	 * their cache mode.
+	 */
+	for (tmpva = base; size > 0; ) {
+		pte = vtopte(tmpva);
+
+		/*
+		 * The cache mode bits are all in the low 32-bits of the
+		 * PTE, so we can just spin on updating the low 32-bits.
+		 */
+		do {
+			opte = *(u_int *)pte;
+			npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
+			npte |= pmap_cache_bits(mode, 0);
+		} while (npte != opte &&
+		    !atomic_cmpset_int((u_int *)pte, opte, npte));
+		pmap_inval_add(&info, &kernel_pmap, tmpva);
+		tmpva += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+
+	/*
+	 * Flush the CPU caches so that no stale data remains cached
+	 * with the old attributes.
+	 */
+	pmap_inval_cache_add(&info, &kernel_pmap, -1);
+	pmap_inval_flush(&info);
+	return (0);
+}
+
+
 /*
  * perform the pmap work for mincore
  */
diff --git a/sys/platform/pc32/i386/pmap_inval.c b/sys/platform/pc32/i386/pmap_inval.c
index b6b68f3..35999ba 100644
--- a/sys/platform/pc32/i386/pmap_inval.c
+++ b/sys/platform/pc32/i386/pmap_inval.c
@@ -67,7 +67,7 @@
 #ifdef SMP
 
 static void
-_cpu_invltlb(void *dummy)
+_cpu_invltlb(void *dummy __unused)
 {
     cpu_invltlb();
 }
@@ -78,6 +78,12 @@ _cpu_invl1pg(void *data)
     cpu_invlpg(data);
 }
 
+static void
+_cpu_wbinvl(void *dummy __unused)
+{
+	cpu_wbinvl();
+}
+
 #endif
 
 /*
@@ -89,6 +95,7 @@ pmap_inval_init(pmap_inval_info_t info)
     info->pir_flags = 0;
 }
 
+#ifdef SMP
 /*
  * Add a (pmap, va) pair to the invalidation list and protect access
  * as appropriate.
@@ -96,7 +103,6 @@ pmap_inval_init(pmap_inval_info_t info)
 void
 pmap_inval_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
 {
-#ifdef SMP
     if ((info->pir_flags & PIRF_CPUSYNC) == 0) {
 	info->pir_flags |= PIRF_CPUSYNC;
 	info->pir_cpusync.cs_run_func = NULL;
@@ -106,46 +112,86 @@ pmap_inval_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
     } else if (pmap->pm_active & ~info->pir_cpusync.cs_mask) {
 	lwkt_cpusync_add(pmap->pm_active, &info->pir_cpusync);
     }
-#else
-    if (pmap->pm_active == 0)
-	return;
-#endif
     if ((info->pir_flags & (PIRF_INVLTLB|PIRF_INVL1PG)) == 0) {
 	if (va == (vm_offset_t)-1) {
 	    info->pir_flags |= PIRF_INVLTLB;
-#ifdef SMP
 	    info->pir_cpusync.cs_fin2_func = _cpu_invltlb;
-#endif
 	} else {
 	    info->pir_flags |= PIRF_INVL1PG;
 	    info->pir_cpusync.cs_data = (void *)va;
-#ifdef SMP
 	    info->pir_cpusync.cs_fin2_func = _cpu_invl1pg;
-#endif
 	}
     } else {
 	info->pir_flags |= PIRF_INVLTLB;
-#ifdef SMP
 	info->pir_cpusync.cs_fin2_func = _cpu_invltlb;
-#endif
     }
 }
 
+void
+pmap_inval_cache_add(pmap_inval_info_t info, pmap_t pmap,
+		     vm_offset_t va __unused)
+{
+    if ((info->pir_flags & PIRF_CPUSYNC) == 0) {
+	info->pir_flags |= PIRF_CPUSYNC;
+	info->pir_cpusync.cs_run_func = NULL;
+	info->pir_cpusync.cs_fin1_func = NULL;
+	info->pir_cpusync.cs_fin2_func = NULL;
+	lwkt_cpusync_start(pmap->pm_active, &info->pir_cpusync);
+    } else if (pmap->pm_active & ~info->pir_cpusync.cs_mask) {
+	lwkt_cpusync_add(pmap->pm_active, &info->pir_cpusync);
+    }
+    info->pir_flags |= PIRF_WBINVL;
+    info->pir_cpusync.cs_fin2_func = _cpu_wbinvl;
+}
+
 /*
  * Synchronize changes with target cpus.
  */
 void
 pmap_inval_flush(pmap_inval_info_t info)
 {
-#ifdef SMP
     if (info->pir_flags & PIRF_CPUSYNC)
 	lwkt_cpusync_finish(&info->pir_cpusync);
-#else
+    info->pir_flags = 0;
+}
+
+#else	/* !SMP */
+
+void
+pmap_inval_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
+{
+    if (pmap->pm_active == 0)
+	return;
+    if ((info->pir_flags & (PIRF_INVLTLB|PIRF_INVL1PG)) == 0) {
+	if (va == (vm_offset_t)-1) {
+	    info->pir_flags |= PIRF_INVLTLB;
+	} else {
+	    info->pir_flags |= PIRF_INVL1PG;
+	    info->pir_cpusync.cs_data = (void *)va;
+	}
+    } else {
+	info->pir_flags |= PIRF_INVLTLB;
+    }
+}
+
+void
+pmap_inval_cache_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
+{
+    if (pmap->pm_active == 0)
+	return;
+    info->pir_flags |= PIRF_WBINVL;
+}
+
+void
+pmap_inval_flush(pmap_inval_info_t info)
+{
     if (info->pir_flags & PIRF_INVLTLB)
 	cpu_invltlb();
     else if (info->pir_flags & PIRF_INVL1PG)
 	cpu_invlpg(info->pir_cpusync.cs_data);
-#endif
+    if (info->pir_flags & PIRF_WBINVL)
+	    cpu_wbinvl();
     info->pir_flags = 0;
 }
 
+#endif	/* SMP */
diff --git a/sys/platform/pc32/include/pmap.h b/sys/platform/pc32/include/pmap.h
index 9d7dda8..8efc52e 100644
--- a/sys/platform/pc32/include/pmap.h
+++ b/sys/platform/pc32/include/pmap.h
@@ -249,6 +249,8 @@ int	pmap_get_pgeflag(void);
 #ifdef SMP
 void	pmap_set_opt (void);
 #endif
+void	pmap_init_pat(void);
+int	pmap_change_attr(vm_offset_t, vm_size_t, int);
 
 #endif /* _KERNEL */
 
diff --git a/sys/platform/pc32/include/pmap_inval.h b/sys/platform/pc32/include/pmap_inval.h
index e8cd668..23e149f 100644
--- a/sys/platform/pc32/include/pmap_inval.h
+++ b/sys/platform/pc32/include/pmap_inval.h
@@ -51,6 +51,7 @@ typedef pmap_inval_info *pmap_inval_info_t;
 #define PIRF_INVLTLB	0x0001	/* request invalidation of whole table */
 #define PIRF_INVL1PG	0x0002	/* else request invalidation of one page */
 #define PIRF_CPUSYNC	0x0004	/* cpusync is currently active */
+#define PIRF_WBINVL	0x0008	/* request cache invalidation */
 
 #ifdef _KERNEL
 
@@ -60,6 +61,7 @@ typedef pmap_inval_info *pmap_inval_info_t;
 
 void pmap_inval_init(pmap_inval_info_t);
 void pmap_inval_add(pmap_inval_info_t, pmap_t, vm_offset_t);
+void pmap_inval_cache_add(pmap_inval_info_t, pmap_t, vm_offset_t);
 void pmap_inval_flush(pmap_inval_info_t);
 
 #endif
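
And the promised worked example of the caching-mode encoding in
pmap_cache_bits(). This is a standalone userland toy; the PG_* values
are copied from the patch, everything else is made up:

#include <stdio.h>

#define PG_NC_PWT	0x008	/* PWT: write-through */
#define PG_NC_PCD	0x010	/* PCD: cache disable */
#define PG_PTE_PAT	0x080	/* PAT bit in a 4K PTE */

/* Mirrors the tail of pmap_cache_bits() for the PTE case. */
static int
index_to_bits(int pat_index)
{
	int bits = 0;

	if (pat_index & 0x4)
		bits |= PG_PTE_PAT;
	if (pat_index & 0x2)
		bits |= PG_NC_PCD;
	if (pat_index & 0x1)
		bits |= PG_NC_PWT;
	return (bits);
}

int
main(void)
{
	/*
	 * With pat_works, pmap_init_pat() put WC at PAT index 5
	 * (binary 101), so a WC PTE gets PG_PTE_PAT | PG_NC_PWT:
	 */
	printf("WC PTE bits: %#x\n", index_to_bits(5));	/* 0x88 */
	/*
	 * On the errata CPUs (pat_works == 0), WC lives at index 2
	 * instead, i.e. just PG_NC_PCD -- the slot pmap_init_pat()
	 * reprogrammed from UC- to WC.
	 */
	printf("WC PTE bits (errata CPUs): %#x\n", index_to_bits(2)); /* 0x10 */
	return (0);
}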