malloc(M_NOWAIT) issues.

Matthew Dillon dillon at apollo.backplane.com
Sun Jan 18 20:15:49 PST 2004


    Here's what I have so far.  This does not represent any major operational
    changes yet, but I'm gearing up toward some sort of solution for CAM and
    for general interrupt-time allocations.

    What this patch does is change the VM_ALLOC_* state into a set of flags
    and augment the M_ malloc flags.  It seems to do a fair job when I
    drop hw.physmem to 64m and run buildworld -j 20.  I'm trying to make the
    flags more flexible to better cover the situations that come up.
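
    For example (illustrative only, not part of the patch; the real
    definitions are in the vm_page.h and malloc.h hunks below), a caller
    that used to pass the single page_req state VM_ALLOC_SYSTEM now passes
    a combination of flags, with at least one of NORMAL/SYSTEM/INTERRUPT
    required:

	/* old: page_req was a single state value */
	m = vm_page_alloc(obj, idx, VM_ALLOC_SYSTEM);

	/* new: page_req is a flag set; adding NORMAL also allows cache pages */
	m = vm_page_alloc(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);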

    My intention is to find a solution that takes advantage of the fact that
    interrupt threads are threads.  One advantage DFly already has is that
    pages from the 'cache' queue can be reused for allocations made from
    interrupts, simply by calling lwkt_yield() when curthread->td_preempted
    is non-NULL.  This causes the interrupt preemption to return to the
    original thread and the interrupt thread to be rescheduled as a
    non-preemptive thread.  Once non-preemptive, an interrupt thread should
    be able to make use of additional memory resources or even block (if we
    are very careful, anyway).
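
    To make that concrete, here is a rough sketch (not part of the patch) of
    the recovery path an interrupt-time allocation could take.  lwkt_yield()
    and td_preempted are real; the surrounding code is illustrative only:

	/*
	 * While preempting another thread we may not touch the cache
	 * queue, so allocate from the free list only.  If that fails,
	 * yield (which reschedules the interrupt thread non-preemptively)
	 * and retry with cache pages allowed.
	 */
	m = vm_page_alloc(object, pindex, VM_ALLOC_SYSTEM);
	if (m == NULL && curthread->td_preempted) {
		lwkt_yield();
		m = vm_page_alloc(object, pindex,
				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
	}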

						-Matt


Index: dev/agp/agp.c
===================================================================
RCS file: /cvs/src/sys/dev/agp/agp.c,v
retrieving revision 1.8
diff -u -r1.8 agp.c
--- dev/agp/agp.c	9 Dec 2003 19:40:56 -0000	1.8
+++ dev/agp/agp.c	19 Jan 2004 03:03:15 -0000
@@ -523,7 +523,7 @@
 		 * the pages will be allocated and zeroed.
 		 */
 		m = vm_page_grab(mem->am_obj, OFF_TO_IDX(i),
-				 VM_ALLOC_ZERO | VM_ALLOC_RETRY);
+			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
 		if ((m->flags & PG_ZERO) == 0)
 			vm_page_zero_fill(m);
 		AGP_DPF("found page pa=%#x\n", VM_PAGE_TO_PHYS(m));
Index: dev/agp/agp_i810.c
===================================================================
RCS file: /cvs/src/sys/dev/agp/agp_i810.c,v
retrieving revision 1.4
diff -u -r1.4 agp_i810.c
--- dev/agp/agp_i810.c	9 Dec 2003 19:40:56 -0000	1.4
+++ dev/agp/agp_i810.c	19 Jan 2004 03:03:27 -0000
@@ -519,7 +519,8 @@
 		 * get its physical address.
 		 */
 		vm_page_t m;
-		m = vm_page_grab(mem->am_obj, 0, VM_ALLOC_ZERO|VM_ALLOC_RETRY);
+		m = vm_page_grab(mem->am_obj, 0, 
+			VM_ALLOC_NORMAL|VM_ALLOC_ZERO|VM_ALLOC_RETRY);
 		if ((m->flags & PG_ZERO) == 0)
 			vm_page_zero_fill(m);
 		vm_page_wire(m);
Index: i386/i386/pmap.c
===================================================================
RCS file: /cvs/src/sys/i386/i386/pmap.c,v
retrieving revision 1.27
diff -u -r1.27 pmap.c
--- i386/i386/pmap.c	18 Jan 2004 12:29:47 -0000	1.27
+++ i386/i386/pmap.c	19 Jan 2004 03:42:40 -0000
@@ -1178,7 +1178,7 @@
 	 * Find or fabricate a new pagetable page
 	 */
 	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
-			VM_ALLOC_ZERO | VM_ALLOC_RETRY);
+			VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
 
 	KASSERT(m->queue == PQ_NONE,
 		("_pmap_allocpte: %p->queue != PQ_NONE", m));
@@ -1377,8 +1377,9 @@
 		/*
 		 * This index is bogus, but out of the way
 		 */
-		nkpg = vm_page_alloc(kptobj, nkpt, VM_ALLOC_SYSTEM);
-		if (!nkpg)
+		nkpg = vm_page_alloc(kptobj, nkpt, 
+			VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT);
+		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 
 		nkpt++;
Index: kern/kern_conf.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_conf.c,v
retrieving revision 1.5
diff -u -r1.5 kern_conf.c
--- kern/kern_conf.c	9 Nov 2003 02:22:36 -0000	1.5
+++ kern/kern_conf.c	19 Jan 2004 03:35:00 -0000
@@ -118,7 +118,7 @@
 	}
 	if (stashed >= DEVT_STASH) {
 		MALLOC(si, struct specinfo *, sizeof(*si), M_DEVT,
-		    M_USE_RESERVE);
+		    M_WAITOK|M_USE_RESERVE);
 		bzero(si, sizeof(*si));
 	} else if (LIST_FIRST(&dev_free)) {
 		si = LIST_FIRST(&dev_free);
Index: kern/kern_mpipe.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_mpipe.c,v
retrieving revision 1.1
diff -u -r1.1 kern_mpipe.c
--- kern/kern_mpipe.c	30 Nov 2003 20:13:54 -0000	1.1
+++ kern/kern_mpipe.c	19 Jan 2004 02:38:53 -0000
@@ -69,7 +69,7 @@
 	++mpipe->total_count;
 	mpipe_free(mpipe, buf);
 	while (--nnow > 0) {
-	    buf = malloc(bytes, mpipe->type, M_NOWAIT);
+	    buf = malloc(bytes, mpipe->type, M_SYSNOWAIT);
 	    if (buf == NULL)
 		break;
 	    ++mpipe->total_count;
@@ -98,7 +98,7 @@
 }
 
 /*
- * Allocate an entry.  flags can be M_NOWAIT which tells us not to block.
+ * Allocate an entry.  flags can be M_RNOWAIT which tells us not to block.
  * Unlike a normal malloc, if we block in mpipe_alloc() no deadlock will occur
  * because it will unblock the moment an existing in-use buffer is freed.
  */
@@ -116,7 +116,7 @@
 		return(buf);
 	    }
 	    --mpipe->total_count;
-	} else if (flags & M_NOWAIT) {
+	} else if (flags & M_RNOWAIT) {
 	    crit_exit();
 	    return(NULL);
 	} else {
Index: kern/kern_slaballoc.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_slaballoc.c,v
retrieving revision 1.14
diff -u -r1.14 kern_slaballoc.c
--- kern/kern_slaballoc.c	25 Oct 2003 00:48:03 -0000	1.14
+++ kern/kern_slaballoc.c	19 Jan 2004 03:30:35 -0000
@@ -190,7 +190,7 @@
     ZonePageCount = ZoneSize / PAGE_SIZE;
 
     npg = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE;
-    kmemusage = kmem_slab_alloc(npg * sizeof(struct kmemusage), PAGE_SIZE, M_ZERO);
+    kmemusage = kmem_slab_alloc(npg * sizeof(struct kmemusage), PAGE_SIZE, M_WAITOK|M_ZERO);
 
     for (i = 0; i < arysize(weirdary); ++i)
 	weirdary[i] = WEIRD_ADDR;
@@ -332,9 +332,15 @@
  *	KMEM subsystem.  A SLAB tracking descriptor must be specified, use
  *	&SlabMisc if you don't care.
  *
- *	M_NOWAIT	- return NULL instead of blocking.
+ *	M_RNOWAIT	- return NULL instead of blocking.
  *	M_ZERO		- zero the returned memory.
- *	M_USE_RESERVE	- allocate out of the system reserve if necessary
+ *	M_USE_RESERVE	- allow greater drawdown of the free list
+ *	M_USE_INTERRUPT_RESERVE - allow the freelist to be exhausted
+ *
+ *	M_FAILSAFE	- Failsafe allocation, when the allocation must
+ *			  succeed, attempt to get out of any preemption context
+ *			  and allocate from the cache, else block (even though
+ *			  we might be blocking from an interrupt), or panic.
  */
 void *
 malloc(unsigned long size, struct malloc_type *type, int flags)
@@ -372,7 +378,7 @@
 	    ttl += type->ks_memuse[i];
 	type->ks_loosememuse = ttl;
 	if (ttl >= type->ks_limit) {
-	    if (flags & (M_NOWAIT|M_NULLOK))
+	    if (flags & (M_RNOWAIT|M_NULLOK))
 		return(NULL);
 	    panic("%s: malloc limit exceeded", type->ks_shortdesc);
 	}
@@ -393,7 +399,7 @@
      * safely manipulate the kernel_map in free() due to free() possibly
      * being called via an IPI message or from sensitive interrupt code.
      */
-    while (slgd->NFreeZones > ZONE_RELS_THRESH && (flags & M_NOWAIT) == 0) {
+    while (slgd->NFreeZones > ZONE_RELS_THRESH && (flags & M_RNOWAIT) == 0) {
 	crit_enter();
 	if (slgd->NFreeZones > ZONE_RELS_THRESH) {	/* crit sect race */
 	    z = slgd->FreeZones;
@@ -406,7 +412,7 @@
     /*
      * XXX handle oversized frees that were queued from free().
      */
-    while (slgd->FreeOvZones && (flags & M_NOWAIT) == 0) {
+    while (slgd->FreeOvZones && (flags & M_RNOWAIT) == 0) {
 	crit_enter();
 	if ((z = slgd->FreeOvZones) != NULL) {
 	    KKASSERT(z->z_Magic == ZALLOC_OVSZ_MAGIC);
@@ -835,6 +841,11 @@
  *	but when we move zalloc() over to use this function as its backend
  *	we will have to switch to kreserve/krelease and call reserve(0)
  *	after the new space is made available.
+ *
+ *	Interrupt code which has preempted other code is not allowed to
+ *	mess with CACHE pages, but if M_FAILSAFE is set we can do a
+ *	yield to become non-preempting and try again inclusive of
+ *	cache pages.
  */
 static void *
 kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags)
@@ -843,6 +854,8 @@
     vm_offset_t addr;
     vm_offset_t offset;
     int count;
+    int wanted_reserve;
+    thread_t td;
     vm_map_t map = kernel_map;
 
     size = round_page(size);
@@ -856,10 +869,12 @@
     vm_map_lock(map);
     if (vm_map_findspace(map, vm_map_min(map), size, align, &addr)) {
 	vm_map_unlock(map);
-	if ((flags & (M_NOWAIT|M_NULLOK)) == 0)
+	if ((flags & (M_RNOWAIT|M_NULLOK)) == 0)
 	    panic("kmem_slab_alloc(): kernel_map ran out of space!");
 	crit_exit();
 	vm_map_entry_release(count);
+	if ((flags & (M_FAILSAFE|M_NULLOK)) == M_FAILSAFE)
+	    panic("kmem_slab_alloc(): kernel_map ran out of space!");
 	return(NULL);
     }
     offset = addr - VM_MIN_KERNEL_ADDRESS;
@@ -868,26 +883,65 @@
 		    kernel_object, offset, addr, addr + size,
 		    VM_PROT_ALL, VM_PROT_ALL, 0);
 
+    td = curthread;
+    wanted_reserve = 0;	/* non-zero = tried but unable to use system reserve */
+
     /*
      * Allocate the pages.  Do not mess with the PG_ZERO flag yet.
      */
     for (i = 0; i < size; i += PAGE_SIZE) {
 	vm_page_t m;
 	vm_pindex_t idx = OFF_TO_IDX(offset + i);
-	int zero = (flags & M_ZERO) ? VM_ALLOC_ZERO : 0;
+	int vmflags = 0;
 
-	if ((flags & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
-	    m = vm_page_alloc(kernel_object, idx, VM_ALLOC_INTERRUPT|zero);
-	else
-	    m = vm_page_alloc(kernel_object, idx, VM_ALLOC_SYSTEM|zero);
+	if (flags & M_ZERO)
+	    vmflags |= VM_ALLOC_ZERO;
+	if (flags & M_USE_RESERVE)
+	    vmflags |= VM_ALLOC_SYSTEM;
+	if (flags & M_USE_INTERRUPT_RESERVE)
+	    vmflags |= VM_ALLOC_INTERRUPT;
+	if ((flags & (M_RNOWAIT|M_WAITOK)) == 0)
+		printf("kmem_slab_alloc: bad flags %08x (%p)\n", flags, ((int **)&size)[-1]);
+	if (flags & (M_FAILSAFE|M_WAITOK)) {
+	    if (td->td_preempted) {
+		wanted_reserve = 1;
+	    } else {
+		vmflags |= VM_ALLOC_NORMAL;
+		wanted_reserve = 0;
+	    }
+	}
+
+	m = vm_page_alloc(kernel_object, idx, vmflags);
+
+	/*
+	 * If the allocation failed we either return NULL or we retry.
+	 *
+	 * If M_WAITOK or M_FAILSAFE is set we retry.  Note that M_WAITOK
+	 * (and M_FAILSAFE) can be specified from an interrupt.  M_FAILSAFE
+	 * generates a warning or a panic.
+	 */
 	if (m == NULL) {
-	    if ((flags & M_NOWAIT) == 0) {
-		vm_map_unlock(map);
-		vm_wait();
-		vm_map_lock(map);
+	    if (flags & (M_FAILSAFE|M_WAITOK)) {
+		if (wanted_reserve) {
+		    if (flags & M_FAILSAFE)
+			printf("malloc: no memory, try failsafe\n");
+		    vm_map_unlock(map);
+		    lwkt_yield();
+		    vm_map_lock(map);
+		} else {
+		    if (flags & M_FAILSAFE)
+			printf("malloc: no memory, block even though we shouldn't\n");
+		    vm_map_unlock(map);
+		    vm_wait();
+		    vm_map_lock(map);
+		}
 		i -= PAGE_SIZE;	/* retry */
 		continue;
 	    }
+
+	    /*
+	     * We were unable to recover, cleanup and return NULL
+	     */
 	    while (i != 0) {
 		i -= PAGE_SIZE;
 		m = vm_page_lookup(kernel_object, OFF_TO_IDX(offset + i));
@@ -902,6 +956,8 @@
     }
 
     /*
+     * Success!
+     *
      * Mark the map entry as non-pageable using a routine that allows us to
      * populate the underlying pages.
      */
Index: kern/kern_varsym.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_varsym.c,v
retrieving revision 1.3
diff -u -r1.3 kern_varsym.c
--- kern/kern_varsym.c	10 Nov 2003 23:58:57 -0000	1.3
+++ kern/kern_varsym.c	19 Jan 2004 03:29:01 -0000
@@ -128,7 +128,7 @@
 
     if ((error = copyinstr(uap->name, name, sizeof(name), NULL)) != 0)
 	goto done2;
-    buf = malloc(MAXVARSYM_DATA, M_TEMP, 0);
+    buf = malloc(MAXVARSYM_DATA, M_TEMP, M_WAITOK);
     if (uap->data && 
 	(error = copyinstr(uap->data, buf, MAXVARSYM_DATA, NULL)) != 0)
     {
@@ -377,8 +377,8 @@
 	error = E2BIG;
     } else if (data) {
 	datalen = strlen(data);
-	ve = malloc(sizeof(struct varsyment), M_VARSYM, M_ZERO);
-	sym = malloc(sizeof(struct varsym) + namelen + datalen + 2, M_VARSYM, 0);
+	ve = malloc(sizeof(struct varsyment), M_VARSYM, M_WAITOK|M_ZERO);
+	sym = malloc(sizeof(struct varsym) + namelen + datalen + 2, M_VARSYM, M_WAITOK);
 	ve->ve_sym = sym;
 	sym->vs_refs = 1;
 	sym->vs_namelen = namelen;
@@ -417,7 +417,7 @@
 {
     struct varsyment *nve;
 
-    nve = malloc(sizeof(struct varsyment), M_VARSYM, M_ZERO);
+    nve = malloc(sizeof(struct varsyment), M_VARSYM, M_WAITOK|M_ZERO);
     nve->ve_sym = ve->ve_sym;
     ++nve->ve_sym->vs_refs;
     TAILQ_INSERT_TAIL(&vss->vx_queue, nve, ve_entry);
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.16
diff -u -r1.16 vfs_bio.c
--- kern/vfs_bio.c	3 Nov 2003 17:11:21 -0000	1.16
+++ kern/vfs_bio.c	19 Jan 2004 01:37:26 -0000
@@ -2503,7 +2503,7 @@
 					 * with paging I/O, no matter which
 					 * process we are.
 					 */
-					m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
+					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 					if (m == NULL) {
 						VM_WAIT;
 						vm_pageout_deficit += desiredpages - bp->b_npages;
@@ -3157,7 +3157,7 @@
 		 */
 		p = vm_page_alloc(kernel_object,
 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
-		    VM_ALLOC_SYSTEM);
+			VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
 		if (!p) {
 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
 			VM_WAIT;
Index: sys/malloc.h
===================================================================
RCS file: /cvs/src/sys/sys/malloc.h,v
retrieving revision 1.16
diff -u -r1.16 malloc.h
--- sys/malloc.h	30 Nov 2003 20:13:53 -0000	1.16
+++ sys/malloc.h	19 Jan 2004 02:25:10 -0000
@@ -59,12 +59,50 @@
 /*
  * flags to malloc.
  */
-#define	M_NOWAIT     	0x0001	/* do not block */
-#define	M_WAITOK     	0x0002	/* wait for resources */
+#define	M_RNOWAIT     	0x0001	/* do not block */
+#define	M_WAITOK     	0x0002	/* wait for resources / alloc from cache */
 #define	M_ZERO       	0x0100	/* bzero() the allocation */
-#define	M_USE_RESERVE	0x0200	/* can alloc out of reserve memory */
+#define	M_USE_RESERVE	0x0200	/* can eat into free list reserve */
 #define	M_NULLOK	0x0400	/* ok to return NULL in M_WAITOK case */
 #define M_PASSIVE_ZERO	0x0800	/* (internal to the slab code only) */
+#define M_USE_INTERRUPT_RESERVE \
+			0x1000	/* can exhaust free list entirely */
+#define	M_FAILSAFE    	0x2000	/* failsafe allocation attempt */
+
+/*
+ * M_NOWAIT has to be a set of flags for equivalence to prior use. 
+ *
+ * M_INTALLOC should be used for any critical infrastructure allocations
+ * made from interrupts.
+ *
+ * M_SYSALLOC should be used for any critical infrastructure allocations
+ * made by the kernel proper.
+ *
+ * NOTE ON DRAGONFLY USE OF M_NOWAIT.  M_NOWAIT has traditionally been used
+ * when we did not wish to break spl protections or when we allocate memory
+ * from interrupts.  For the spl protection case we intend to move all
+ * such allocations outside of the spl blocks.  
+ *
+ * For the interrupt case the issue comes down to whether it is possible
+ * to allocate out of the VM page cache.  Since interrupts are threads it
+ * is theoretically possible to allocate out of the VM page cache as long
+ * as we determine that we are not preempting another thread.  This is a
+ * simple td->td_preempted check.  In DFly we can also theoretically do
+ * an lwkt_yield() to force the interrupt thread to be rescheduled (so it
+ * is no longer preempting a thread) and then allocate out of the cache.
+ * This is what the M_FAILSAFE flag does in M_INTALLOC and this is why
+ * M_INTALLOC should be used in interrupt-related situations where the
+ * allocation must absolutely succeed for the health of the machine.
+ */
+
+#define M_INTNOWAIT	(M_RNOWAIT|M_USE_RESERVE|M_USE_INTERRUPT_RESERVE)
+#define M_SYSNOWAIT	(M_RNOWAIT|M_USE_RESERVE)
+#define M_INTWAIT	(M_WAITOK|M_USE_RESERVE|M_USE_INTERRUPT_RESERVE)
+#define M_SYSWAIT	(M_WAITOK|M_USE_RESERVE)
+
+#define M_NOWAIT	M_INTNOWAIT
+#define M_INTALLOC	(M_INTNOWAIT|M_FAILSAFE)
+#define M_SYSALLOC	M_SYSWAIT
 
 #define	M_MAGIC		877983977	/* time when first defined :-) */
 
Index: vfs/nwfs/nwfs_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/nwfs/nwfs_vfsops.c,v
retrieving revision 1.7
diff -u -r1.7 nwfs_vfsops.c
--- vfs/nwfs/nwfs_vfsops.c	7 Aug 2003 21:54:36 -0000	1.7
+++ vfs/nwfs/nwfs_vfsops.c	19 Jan 2004 03:35:32 -0000
@@ -184,7 +184,7 @@
 	ncp_conn_unlock(conn,td);	/* we keep the ref */
 	mp->mnt_stat.f_iosize = conn->buffer_size;
         /* We must malloc our own mount info */
-        MALLOC(nmp,struct nwmount *,sizeof(struct nwmount),M_NWFSDATA,M_USE_RESERVE | M_ZERO);
+        MALLOC(nmp,struct nwmount *,sizeof(struct nwmount),M_NWFSDATA, M_WAITOK|M_USE_RESERVE|M_ZERO);
         if (nmp == NULL) {
                 nwfs_printf("could not alloc nwmount\n");
                 error = ENOMEM;
Index: vfs/smbfs/smbfs_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/smbfs/smbfs_vfsops.c,v
retrieving revision 1.7
diff -u -r1.7 smbfs_vfsops.c
--- vfs/smbfs/smbfs_vfsops.c	7 Aug 2003 21:54:36 -0000	1.7
+++ vfs/smbfs/smbfs_vfsops.c	19 Jan 2004 03:35:43 -0000
@@ -177,7 +177,7 @@
 #ifdef SMBFS_USEZONE
 	smp = zalloc(smbfsmount_zone);
 #else
-        MALLOC(smp, struct smbmount*, sizeof(*smp), M_SMBFSDATA, M_USE_RESERVE);
+        MALLOC(smp, struct smbmount*, sizeof(*smp), M_SMBFSDATA, M_WAITOK|M_USE_RESERVE);
 #endif
         if (smp == NULL) {
                 printf("could not alloc smbmount\n");
Index: vm/vm_fault.c
===================================================================
RCS file: /cvs/src/sys/vm/vm_fault.c,v
retrieving revision 1.9
diff -u -r1.9 vm_fault.c
--- vm/vm_fault.c	3 Nov 2003 17:11:23 -0000	1.9
+++ vm/vm_fault.c	19 Jan 2004 01:37:26 -0000
@@ -363,7 +363,7 @@
 			fs.m = NULL;
 			if (!vm_page_count_severe()) {
 				fs.m = vm_page_alloc(fs.object, fs.pindex,
-				    (fs.vp || fs.object->backing_object)? VM_ALLOC_NORMAL: VM_ALLOC_ZERO);
+				    (fs.vp || fs.object->backing_object)? VM_ALLOC_NORMAL: VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
 			}
 			if (fs.m == NULL) {
 				unlock_and_deallocate(&fs);
Index: vm/vm_kern.c
===================================================================
RCS file: /cvs/src/sys/vm/vm_kern.c,v
retrieving revision 1.13
diff -u -r1.13 vm_kern.c
--- vm/vm_kern.c	14 Jan 2004 23:26:14 -0000	1.13
+++ vm/vm_kern.c	19 Jan 2004 03:14:51 -0000
@@ -204,7 +204,7 @@
 		vm_page_t mem;
 
 		mem = vm_page_grab(kernel_object, OFF_TO_IDX(offset + i),
-				VM_ALLOC_ZERO | VM_ALLOC_RETRY);
+			    VM_ALLOC_ZERO | VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 		if ((mem->flags & PG_ZERO) == 0)
 			vm_page_zero_fill(mem);
 		mem->valid = VM_PAGE_BITS_ALL;
@@ -285,14 +285,11 @@
  * 	kmem_alloc() because we may need to allocate memory at interrupt
  * 	level where we cannot block (canwait == FALSE).
  *
- * 	Note that this still only works in a uni-processor environment and
- * 	when called at splhigh().
- *
  * 	We don't worry about expanding the map (adding entries) since entries
  * 	for wired maps are statically allocated.
  *
- *	NOTE:  This routine is not supposed to block if M_NOWAIT is set, but
- *	I have not verified that it actually does not block.
+ *	NOTE:  Please see kmem_slab_alloc() for a better explanation of the
+ *	M_* flags.
  */
 vm_offset_t
 kmem_malloc(vm_map_t map, vm_size_t size, int flags)
@@ -302,6 +299,8 @@
 	vm_offset_t addr;
 	vm_page_t m;
 	int count;
+	thread_t td;
+	int wanted_reserve;
 
 	if (map != kernel_map && map != mb_map)
 		panic("kmem_malloc: map != {kmem,mb}_map");
@@ -324,9 +323,13 @@
 			printf("Out of mbuf clusters - adjust NMBCLUSTERS or increase maxusers!\n");
 			return (0);
 		}
-		if ((flags & M_NOWAIT) == 0)
-			panic("kmem_malloc(%ld): kernel_map too small: %ld total allocated",
+		if ((flags & (M_RNOWAIT|M_NULLOK)) == 0 ||
+		    (flags & (M_FAILSAFE|M_NULLOK)) == M_FAILSAFE
+		) {
+			panic("kmem_malloc(%ld): kernel_map too small: "
+				"%ld total allocated",
 				(long)size, (long)map->size);
+		}
 		return (0);
 	}
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
@@ -335,32 +338,52 @@
 		kmem_object, offset, addr, addr + size,
 		VM_PROT_ALL, VM_PROT_ALL, 0);
 
+	td = curthread;
+	wanted_reserve = 0;
+
 	for (i = 0; i < size; i += PAGE_SIZE) {
-		/*
-		 * Note: if M_NOWAIT specified alone, allocate from 
-		 * interrupt-safe queues only (just the free list).  If 
-		 * M_USE_RESERVE is also specified, we can also
-		 * allocate from the cache.  Neither of the latter two
-		 * flags may be specified from an interrupt since interrupts
-		 * are not allowed to mess with the cache queue.
-		 */
-retry:
-		m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i),
-		    ((flags & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT) ?
-			VM_ALLOC_INTERRUPT : 
-			VM_ALLOC_SYSTEM);
+		int vmflags;
+
+		vmflags = VM_ALLOC_SYSTEM;	/* XXX M_USE_RESERVE? */
+		if ((flags & (M_WAITOK|M_RNOWAIT)) == 0)
+			printf("kmem_malloc: bad flags %08x (%p)\n", flags, ((int **)&map)[-1]);
+		if (flags & M_USE_INTERRUPT_RESERVE)
+			vmflags |= VM_ALLOC_INTERRUPT;
+		if (flags & (M_FAILSAFE|M_WAITOK)) {
+			if (td->td_preempted) {
+				wanted_reserve = 1;
+			} else {
+				vmflags |= VM_ALLOC_NORMAL;
+				wanted_reserve = 0;
+			}
+		}
+
+		m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), vmflags);
 
 		/*
 		 * Ran out of space, free everything up and return. Don't need
 		 * to lock page queues here as we know that the pages we got
 		 * aren't on any queues.
+		 *
+		 * If M_WAITOK or M_FAILSAFE is set we can yield or block.
 		 */
 		if (m == NULL) {
-			if ((flags & M_NOWAIT) == 0) {
-				vm_map_unlock(map);
-				VM_WAIT;
-				vm_map_lock(map);
-				goto retry;
+			if (flags & (M_FAILSAFE|M_WAITOK)) {
+				if (wanted_reserve) {
+					if (flags & M_FAILSAFE)
+						printf("kmem_malloc: no memory, try failsafe\n");
+					vm_map_unlock(map);
+					lwkt_yield();
+					vm_map_lock(map);
+				} else {
+					if (flags & M_FAILSAFE)
+						printf("kmem_malloc: no memory, block even though we shouldn't\n");
+					vm_map_unlock(map);
+					VM_WAIT;
+					vm_map_lock(map);
+				}
+				i -= PAGE_SIZE;	/* retry */
+				continue;
 			}
 			/* 
 			 * Free the pages before removing the map entry.
Index: vm/vm_map.c
===================================================================
RCS file: /cvs/src/sys/vm/vm_map.c,v
retrieving revision 1.19
diff -u -r1.19 vm_map.c
--- vm/vm_map.c	18 Jan 2004 12:32:04 -0000	1.19
+++ vm/vm_map.c	19 Jan 2004 03:04:31 -0000
@@ -3513,11 +3513,11 @@
 		for (idx = 0; idx < robject->size; idx++) {
 
 			m_out = vm_page_grab(robject, idx,
-						VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+					    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 
 			if (m_out->valid == 0) {
 				m_in = vm_page_grab(object, bo_pindex + idx,
-						VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+					    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 				if (m_in->valid == 0) {
 					rv = vm_pager_get_pages(object, &m_in, 1, 0);
 					if (rv != VM_PAGER_OK) {
Index: vm/vm_page.c
===================================================================
RCS file: /cvs/src/sys/vm/vm_page.c,v
retrieving revision 1.15
diff -u -r1.15 vm_page.c
--- vm/vm_page.c	3 Nov 2003 17:11:23 -0000	1.15
+++ vm/vm_page.c	19 Jan 2004 03:49:00 -0000
@@ -731,10 +731,10 @@
  *	with this VM object/offset pair.
  *
  *	page_req classes:
- *	VM_ALLOC_NORMAL		normal process request
- *	VM_ALLOC_SYSTEM		system *really* needs a page
- *	VM_ALLOC_INTERRUPT	interrupt time request
- *	VM_ALLOC_ZERO		zero page
+ *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
+ *	VM_ALLOC_SYSTEM		greater free drain
+ *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
+ *	VM_ALLOC_ZERO		advisory request for pre-zero'd page
  *
  *	Object must be locked.
  *	This routine may not block.
@@ -752,62 +752,72 @@
 
 	KASSERT(!vm_page_lookup(object, pindex),
 		("vm_page_alloc: page already allocated"));
+	KKASSERT(page_req & 
+		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
 
 	/*
 	 * The pager is allowed to eat deeper into the free page list.
 	 */
-
-	if ((curthread == pagethread) && (page_req != VM_ALLOC_INTERRUPT)) {
-		page_req = VM_ALLOC_SYSTEM;
-	};
+	if (curthread == pagethread)
+		page_req |= VM_ALLOC_SYSTEM;
 
 	s = splvm();
-
 loop:
-	if (vmstats.v_free_count > vmstats.v_free_reserved) {
+	if (vmstats.v_free_count > vmstats.v_free_reserved ||
+	    ((page_req & VM_ALLOC_INTERRUPT) && vmstats.v_free_count > 0) ||
+	    ((page_req & VM_ALLOC_SYSTEM) && vmstats.v_cache_count == 0 &&
+		vmstats.v_free_count > vmstats.v_interrupt_free_min)
+	) {
 		/*
-		 * Allocate from the free queue if there are plenty of pages
-		 * in it.
+		 * The free queue has sufficient free pages to take one out.
 		 */
-		if (page_req == VM_ALLOC_ZERO)
+		if (page_req & VM_ALLOC_ZERO)
 			m = vm_page_select_free(object, pindex, TRUE);
 		else
 			m = vm_page_select_free(object, pindex, FALSE);
-	} else if (
-	    (page_req == VM_ALLOC_SYSTEM && 
-	     vmstats.v_cache_count == 0 && 
-	     vmstats.v_free_count > vmstats.v_interrupt_free_min) ||
-	    (page_req == VM_ALLOC_INTERRUPT && vmstats.v_free_count > 0)
-	) {
+	} else if (page_req & VM_ALLOC_NORMAL) {
+		/*
+		 * Allocatable from the cache (non-interrupt only).  On
+		 * success, we must free the page and try again, thus
+		 * ensuring that vmstats.v_*_free_min counters are replenished.
+		 */
+#ifdef INVARIANTS
+		if (curthread->td_preempted) {
+			printf("vm_page_alloc(): warning, attempt to allocate"
+				" cache page from preempting interrupt\n");
+			m = NULL;
+		} else {
+			m = vm_page_select_cache(object, pindex);
+		}
+#else
+		m = vm_page_select_cache(object, pindex);
+#endif
 		/*
-		 * Interrupt or system, dig deeper into the free list.
+		 * On success, move the page into the free queue and loop.
 		 */
-		m = vm_page_select_free(object, pindex, FALSE);
-	} else if (page_req != VM_ALLOC_INTERRUPT) {
+		if (m != NULL) {
+			KASSERT(m->dirty == 0,
+			    ("Found dirty cache page %p", m));
+			vm_page_busy(m);
+			vm_page_protect(m, VM_PROT_NONE);
+			vm_page_free(m);
+			goto loop;
+		}
+
 		/*
-		 * Allocatable from cache (non-interrupt only).  On success,
-		 * we must free the page and try again, thus ensuring that
-		 * vmstats.v_*_free_min counters are replenished.
+		 * On failure return NULL
 		 */
-		m = vm_page_select_cache(object, pindex);
-		if (m == NULL) {
-			splx(s);
+		splx(s);
 #if defined(DIAGNOSTIC)
-			if (vmstats.v_cache_count > 0)
-				printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", vmstats.v_cache_count);
+		if (vmstats.v_cache_count > 0)
+			printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", vmstats.v_cache_count);
 #endif
-			vm_pageout_deficit++;
-			pagedaemon_wakeup();
-			return (NULL);
-		}
-		KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
-		vm_page_busy(m);
-		vm_page_protect(m, VM_PROT_NONE);
-		vm_page_free(m);
-		goto loop;
+		vm_pageout_deficit++;
+		pagedaemon_wakeup();
+		return (NULL);
 	} else {
 		/*
-		 * Not allocatable from cache from interrupt, give up.
+		 * No pages available, wakeup the pageout daemon and give up.
 		 */
 		splx(s);
 		vm_pageout_deficit++;
@@ -816,24 +826,18 @@
 	}
 
 	/*
-	 *  At this point we had better have found a good page.
+	 * Good page found.
 	 */
-
-	KASSERT(
-	    m != NULL,
-	    ("vm_page_alloc(): missing page on free queue\n")
-	);
+	KASSERT(m != NULL, ("vm_page_alloc(): missing page on free queue\n"));
 
 	/*
 	 * Remove from free queue
 	 */
-
 	vm_page_unqueue_nowakeup(m);
 
 	/*
 	 * Initialize structure.  Only the PG_ZERO flag is inherited.
 	 */
-
 	if (m->flags & PG_ZERO) {
 		vm_page_zero_count--;
 		m->flags = PG_ZERO | PG_BUSY;
@@ -845,7 +849,8 @@
 	m->act_count = 0;
 	m->busy = 0;
 	m->valid = 0;
-	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
+	KASSERT(m->dirty == 0, 
+		("vm_page_alloc: free/cache page %p was dirty", m));
 
 	/*
 	 * vm_page_insert() is safe prior to the splx().  Note also that
@@ -853,7 +858,6 @@
 	 * could cause us to block allocating memory).  We cannot block 
 	 * anywhere.
 	 */
-
 	vm_page_insert(m, object, pindex);
 
 	/*
@@ -864,7 +868,6 @@
 		pagedaemon_wakeup();
 
 	splx(s);
-
 	return (m);
 }
 
@@ -1428,15 +1431,18 @@
  * changing state.  We keep on waiting, if the page continues
  * to be in the object.  If the page doesn't exist, allocate it.
  *
+ * If VM_ALLOC_RETRY is specified VM_ALLOC_NORMAL must also be specified.
+ *
  * This routine may block.
  */
 vm_page_t
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
-
 	vm_page_t m;
 	int s, generation;
 
+	KKASSERT(allocflags &
+		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
 		if (m->busy || (m->flags & PG_BUSY)) {
Index: vm/vm_page.h
===================================================================
RCS file: /cvs/src/sys/vm/vm_page.h,v
retrieving revision 1.8
diff -u -r1.8 vm_page.h
--- vm/vm_page.h	3 Nov 2003 17:11:23 -0000	1.8
+++ vm/vm_page.h	19 Jan 2004 03:47:09 -0000
@@ -389,11 +389,19 @@
 #define VM_PAGE_BITS_ALL 0xffff
 #endif
 
-#define VM_ALLOC_NORMAL		0
-#define VM_ALLOC_INTERRUPT	1
-#define VM_ALLOC_SYSTEM		2
-#define	VM_ALLOC_ZERO		3
-#define	VM_ALLOC_RETRY		0x80
+/*
+ * Note: the code will always use nominally free pages from the free list
+ * before trying other flag-specified sources. 
+ *
+ * At least one of VM_ALLOC_NORMAL|VM_ALLOC_SYSTEM|VM_ALLOC_INTERRUPT 
+ * must be specified.  VM_ALLOC_RETRY may only be specified if VM_ALLOC_NORMAL
+ * is also specified.
+ */
+#define VM_ALLOC_NORMAL		0x01	/* ok to use cache pages */
+#define VM_ALLOC_SYSTEM		0x02	/* ok to exhaust most of free list */
+#define VM_ALLOC_INTERRUPT	0x04	/* ok to exhaust entire free list */
+#define	VM_ALLOC_ZERO		0x08	/* req pre-zero'd memory if avail */
+#define	VM_ALLOC_RETRY		0x80	/* indefinite block (vm_page_grab()) */
 
 void vm_page_unhold(vm_page_t mem);
 
Index: vm/vm_zone.c
===================================================================
RCS file: /cvs/src/sys/vm/vm_zone.c,v
retrieving revision 1.12
diff -u -r1.12 vm_zone.c
--- vm/vm_zone.c	14 Jan 2004 23:26:14 -0000	1.12
+++ vm/vm_zone.c	19 Jan 2004 01:37:26 -0000
@@ -190,10 +190,10 @@
 			z->zobj = obj;
 			_vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj);
 		}
-		z->zallocflag = VM_ALLOC_INTERRUPT;
+		z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT;
 		z->zmax += nentries;
 	} else {
-		z->zallocflag = VM_ALLOC_SYSTEM;
+		z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
 		z->zmax = 0;
 	}
 
