BUF/BIO stage 3 patch #1

Matthew Dillon dillon at apollo.backplane.com
Fri Mar 3 12:47:28 PST 2006


    Ok, here is the next patch. I am treading carefully, doing as much
    precursor work as I can before the massive block number -> 64 bit offset
    conversion patch.

    This is a smaller patch which cleans up the buffer cache hash table.
    The global hash table has been replaced by a per-vnode red-black tree
    and the buffer's vnode assignment, bp->b_vp, is now completely integrated
    with the block lookup code (prior patches removed all the bp->b_vp hacks
    so bp->b_vp is now an ultra-stable field).  This in turn allowed the
    reassignbuf() routine to be greatly simplified.
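
    The shape of the change, in miniature: a lookup no longer hashes
    (vp, blkno) into a global table, it descends a red-black tree hanging
    off the vnode itself.  Below is a minimal user-space sketch of that
    structure built on the stock <sys/tree.h> macros (available in user
    space on the BSDs).  The xbuf/xvnode names are hypothetical stand-ins
    for illustration, not the kernel's own structures:

        #include <sys/tree.h>
        #include <stddef.h>

        struct xbuf {
                RB_ENTRY(xbuf) b_rbhash;  /* per-vnode tree linkage */
                long           b_lblkno;  /* logical block number (the key) */
        };

        RB_HEAD(xbuf_tree, xbuf);

        static int
        xbuf_cmp(struct xbuf *b1, struct xbuf *b2)
        {
                if (b1->b_lblkno < b2->b_lblkno)
                        return(-1);
                if (b1->b_lblkno > b2->b_lblkno)
                        return(1);
                return(0);
        }

        RB_GENERATE(xbuf_tree, xbuf, b_rbhash, xbuf_cmp);

        struct xvnode {
                struct xbuf_tree v_rbhash_tree; /* one lookup tree per vnode */
        };

        /* What findblk() amounts to: an O(log n) walk of the vnode's tree. */
        struct xbuf *
        xfindblk(struct xvnode *vp, long blkno)
        {
                struct xbuf key;

                key.b_lblkno = blkno;
                return(RB_FIND(xbuf_tree, &vp->v_rbhash_tree, &key));
        }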

					-Matt
					Matthew Dillon 
					<dillon at xxxxxxxxxxxxx>

Index: i386/i386/machdep.c
===================================================================
RCS file: /cvs/src/sys/i386/i386/machdep.c,v
retrieving revision 1.86
diff -u -r1.86 machdep.c
--- i386/i386/machdep.c	17 Feb 2006 19:18:06 -0000	1.86
+++ i386/i386/machdep.c	3 Mar 2006 18:28:07 -0000
@@ -353,7 +353,6 @@
 
 	valloc(swbuf, struct buf, nswbuf);
 	valloc(buf, struct buf, nbuf);
-	v = bufhashinit(v);
 
 	/*
 	 * End of first pass, size has been calculated so allocate memory
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.57
diff -u -r1.57 vfs_bio.c
--- kern/vfs_bio.c	2 Mar 2006 20:28:49 -0000	1.57
+++ kern/vfs_bio.c	3 Mar 2006 19:30:37 -0000
@@ -164,9 +164,6 @@
 SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf),
 	"sizeof(struct buf)");
 
-static int bufhashmask;
-static int bufhashshift;
-static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 char *buf_wmesg = BUF_WMESG;
 
 extern int vm_swap_size;
@@ -177,51 +174,6 @@
 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
 
 /*
- * Buffer hash table code.  Note that the logical block scans linearly, which
- * gives us some L1 cache locality.
- */
-
-static __inline 
-struct bufhashhdr *
-bufhash(struct vnode *vnp, daddr_t bn)
-{
-	u_int64_t hashkey64;
-	int hashkey; 
-	
-	/*
-	 * A variation on the Fibonacci hash that Knuth credits to
-	 * R. W. Floyd, see Knuth's _Art of Computer Programming,
-	 * Volume 3 / Sorting and Searching_
-	 *
-         * We reduce the argument to 32 bits before doing the hash to
-	 * avoid the need for a slow 64x64 multiply on 32 bit platforms.
-	 *
-	 * sizeof(struct vnode) is 168 on i386, so toss some of the lower
-	 * bits of the vnode address to reduce the key range, which
-	 * improves the distribution of keys across buckets.
-	 *
-	 * The file system cylinder group blocks are very heavily
-	 * used.  They are located at invervals of fbg, which is
-	 * on the order of 89 to 94 * 2^10, depending on other
-	 * filesystem parameters, for a 16k block size.  Smaller block
-	 * sizes will reduce fpg approximately proportionally.  This
-	 * will cause the cylinder group index to be hashed using the
-	 * lower bits of the hash multiplier, which will not distribute
-	 * the keys as uniformly in a classic Fibonacci hash where a
-	 * relatively small number of the upper bits of the result
-	 * are used.  Using 2^16 as a close-enough approximation to
-	 * fpg, split the hash multiplier in half, with the upper 16
-	 * bits being the inverse of the golden ratio, and the lower
-	 * 16 bits being a fraction between 1/3 and 3/7 (closer to
-	 * 3/7 in this case), that gives good experimental results.
-	 */
-	hashkey64 = ((u_int64_t)(uintptr_t)vnp >> 3) + (u_int64_t)bn;
-	hashkey = (((u_int32_t)(hashkey64 + (hashkey64 >> 32)) * 0x9E376DB1u) >>
-	    bufhashshift) & bufhashmask;
-	return(&bufhashtbl[hashkey]);
-}
-
-/*
  * numdirtywakeup:
  *
  *	If someone is blocked due to there being too many dirty buffers,
@@ -384,25 +336,6 @@
 }
 
 /*
- * bufhashinit:
- *
- *	Initialize buffer headers and related structures. 
- */
-
-caddr_t
-bufhashinit(caddr_t vaddr)
-{
-	/* first, make a null hash table */
-	bufhashshift = 29;
-	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
-		bufhashshift--;
-	bufhashtbl = (void *)vaddr;
-	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
-	--bufhashmask;
-	return(vaddr);
-}
-
-/*
  * bufinit:
  *
  *	Load time initialisation of the buffer cache, called from machine
@@ -415,11 +348,6 @@
 	vm_offset_t bogus_offset;
 	int i;
 
-	LIST_INIT(&invalhash);
-
-	for (i = 0; i <= bufhashmask; i++)
-		LIST_INIT(&bufhashtbl[i]);
-
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
 		TAILQ_INIT(&bufqueues[i]);
@@ -435,7 +363,6 @@
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist);
-		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 	}
 
 	/*
@@ -923,7 +850,7 @@
 
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= B_DONE | B_DELWRI;
-		reassignbuf(bp, bp->b_vp);
+		reassignbuf(bp);
 		++numdirtybuffers;
 		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
 	}
@@ -949,7 +876,7 @@
 {
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
-		reassignbuf(bp, bp->b_vp);
+		reassignbuf(bp);
 		--numdirtybuffers;
 		numdirtywakeup(lodirtybuffers);
 	}
@@ -1274,6 +1201,7 @@
 		bp->b_flags |= B_INVAL;
 		bp->b_xflags &= ~BX_BKGRDWRITE;
 		KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08lx vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
+		KKASSERT((bp->b_flags & B_HASHED) == 0);
 		if (bp->b_xflags & BX_BKGRDINPROG)
 			panic("losing buffer 1");
 		if (bp->b_kvasize) {
@@ -1282,22 +1210,19 @@
 			bp->b_qindex = BQUEUE_EMPTY;
 		}
 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
-		LIST_REMOVE(bp, b_hash);
-		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
 		/*
 		 * Buffers with junk contents.   Again these buffers had better
 		 * already be disassociated from their vnode.
 		 */
 		KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08lx vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
+		KKASSERT((bp->b_flags & B_HASHED) == 0);
 		bp->b_flags |= B_INVAL;
 		bp->b_xflags &= ~BX_BKGRDWRITE;
 		if (bp->b_xflags & BX_BKGRDINPROG)
 			panic("losing buffer 2");
 		bp->b_qindex = BQUEUE_CLEAN;
 		TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
-		LIST_REMOVE(bp, b_hash);
-		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 	} else if (bp->b_flags & B_LOCKED) {
 		/*
 		 * Buffers that are locked.
@@ -1497,25 +1422,6 @@
 }
 
 /*
- * gbincore:
- *
- *	Check to see if a block is currently memory resident.
- */
-struct buf *
-gbincore(struct vnode * vp, daddr_t blkno)
-{
-	struct buf *bp;
-	struct bufhashhdr *bh;
-
-	bh = bufhash(vp, blkno);
-	LIST_FOREACH(bp, bh, b_hash) {
-		if (bp->b_vp == vp && bp->b_lblkno == blkno)
-			break;
-	}
-	return (bp);
-}
-
-/*
  * vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
@@ -1556,7 +1462,7 @@
 		maxcl = MAXPHYS / size;
 
 		for (i = 1; i < maxcl; i++) {
-			if ((bpa = gbincore(vp, lblkno + i)) &&
+			if ((bpa = findblk(vp, lblkno + i)) &&
 			    BUF_REFCNT(bpa) == 0 &&
 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 			    (B_DELWRI | B_CLUSTEROK)) &&
@@ -1570,7 +1476,7 @@
 			}
 		}
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
-			if ((bpa = gbincore(vp, lblkno - j)) &&
+			if ((bpa = findblk(vp, lblkno - j)) &&
 			    BUF_REFCNT(bpa) == 0 &&
 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 			    (B_DELWRI | B_CLUSTEROK)) &&
@@ -1785,12 +1691,11 @@
 		 */
 
 		KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08lx vnode %p qindex %d unexpectededly still associated!", bp, bp->b_flags, bp->b_vp, qindex));
+		KKASSERT((bp->b_flags & B_HASHED) == 0);
 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
 			(*bioops.io_deallocate)(bp);
 		if (bp->b_xflags & BX_BKGRDINPROG)
 			panic("losing buffer 3");
-		LIST_REMOVE(bp, b_hash);
-		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
 
 		/*
 		 * critical section protection is not required when
@@ -2055,26 +1960,10 @@
 }
 
 /*
- * incore:
- *
- *	Check to see if a block is currently resident in memory.
- */
-struct buf *
-incore(struct vnode * vp, daddr_t blkno)
-{
-	struct buf *bp;
-
-	crit_enter();
-	bp = gbincore(vp, blkno);
-	crit_exit();
-	return (bp);
-}
-
-/*
  * inmem:
  *
  *	Returns true if no I/O is needed to access the associated VM object.
- *	This is like incore except it also hunts around in the VM system for
+ *	This is like findblk except it also hunts around in the VM system for
  *	the data.
  *
  *	Note that we ignore vm_page_free() races from interrupts against our
@@ -2089,7 +1978,7 @@
 	vm_page_t m;
 	vm_ooffset_t off;
 
-	if (incore(vp, blkno))
+	if (findblk(vp, blkno))
 		return 1;
 	if (vp->v_mount == NULL)
 		return 0;
@@ -2208,6 +2097,25 @@
 }
 
 /*
+ * findblk:
+ *
+ *	Locate and return the specified buffer, or NULL if the buffer does
+ *	not exist.  Do not attempt to lock the buffer or manipulate it in
+ *	any way.  The caller must validate that the correct buffer has been
+ *	obtained after locking it.
+ */
+struct buf *
+findblk(struct vnode *vp, daddr_t blkno)
+{
+	struct buf *bp;
+
+	crit_enter();
+	bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, blkno);
+	crit_exit();
+	return(bp);
+}
+
+/*
  * getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
@@ -2251,10 +2159,9 @@
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  */
 struct buf *
-getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
+getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 {
 	struct buf *bp;
-	struct bufhashhdr *bh;
 
 	if (size > MAXBSIZE)
 		panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
@@ -2266,7 +2173,7 @@
 	 * to completely exhaust the buffer cache.
          *
          * If this check ever becomes a bottleneck it may be better to
-         * move it into the else, when gbincore() fails.  At the moment
+         * move it into the else, when findblk() fails.  At the moment
          * it isn't a problem.
 	 *
 	 * XXX remove, we cannot afford to block anywhere if holding a vnode
@@ -2279,7 +2186,7 @@
 		tsleep(&needsbuffer, slpflag, "newbuf", slptimeo);
 	}
 
-	if ((bp = gbincore(vp, blkno))) {
+	if ((bp = findblk(vp, blkno))) {
 		/*
 		 * The buffer was found in the cache, but we need to lock it.
 		 * Even with LK_NOWAIT the lockmgr may break our critical
@@ -2458,7 +2365,7 @@
 		 * from the point of the duplicate buffer creation through
 		 * to here, and we've locked the buffer.
 		 */
-		if (gbincore(vp, blkno)) {
+		if (findblk(vp, blkno)) {
 			bp->b_flags |= B_INVAL;
 			brelse(bp);
 			goto loop;
@@ -2466,18 +2373,16 @@
 
 		/*
 		 * Insert the buffer into the hash, so that it can
-		 * be found by incore.  bgetvp() and bufhash()
-		 * must be synchronized with each other.  Make sure the
-		 * translation layer has been cleared.
+		 * be found by findblk(). 
+		 *
+		 * Make sure the translation layer has been cleared.
 		 */
 		bp->b_lblkno = blkno;
 		bp->b_loffset = offset;
 		bp->b_bio2.bio_blkno = (daddr_t)-1;
+		/* bp->b_bio2.bio_next = NULL; */
 
 		bgetvp(vp, bp);
-		LIST_REMOVE(bp, b_hash);
-		bh = bufhash(vp, blkno);
-		LIST_INSERT_HEAD(bh, bp, b_hash);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
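
    A note on the lookup above: RB_GENERATE2() is DragonFly's keyed
    variant of the stock RB_GENERATE().  Besides the usual functions it
    emits a buf_rb_hash_RB_LOOKUP(head, key) that compares the search
    key directly against the b_lblkno field, so findblk() needs no dummy
    struct buf on the stack.  Roughly (this is an illustration of the
    idea, not the macro's literal expansion) the generated lookup is:

        static struct buf *
        buf_rb_hash_lookup_sketch(struct buf_rb_hash *head, daddr_t blkno)
        {
                struct buf *bp = RB_ROOT(head);

                while (bp != NULL) {
                        if (blkno < bp->b_lblkno)
                                bp = RB_LEFT(bp, b_rbhash);
                        else if (blkno > bp->b_lblkno)
                                bp = RB_RIGHT(bp, b_rbhash);
                        else
                                break;          /* exact match */
                }
                return(bp);
        }
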
Index: kern/vfs_cluster.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_cluster.c,v
retrieving revision 1.16
diff -u -r1.16 vfs_cluster.c
--- kern/vfs_cluster.c	21 Feb 2006 18:46:56 -0000	1.16
+++ kern/vfs_cluster.c	3 Mar 2006 19:31:25 -0000
@@ -133,14 +133,14 @@
 			bp->b_flags &= ~B_RAM;
 			/*
 			 * We do the crit here so that there is no window
-			 * between the incore and the b_usecount increment
+			 * between the findblk and the b_usecount increment
 			 * below.  We opt to keep the crit out of the loop
 			 * for efficiency.
 			 */
 			crit_enter();
 			for (i = 1; i < maxra; i++) {
 
-				if (!(tbp = incore(vp, lblkno+i))) {
+				if (!(tbp = findblk(vp, lblkno+i))) {
 					break;
 				}
 
@@ -371,7 +371,7 @@
 			 * would block in the lock.  The same checks have to
 			 * be made again after we officially get the buffer.
 			 */
-			if ((tbp = incore(vp, lbn + i)) != NULL) {
+			if ((tbp = findblk(vp, lbn + i)) != NULL) {
 				if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
 					break;
 				BUF_UNLOCK(tbp);
@@ -749,7 +749,7 @@
 		 * is delayed-write but either locked or inval, it cannot 
 		 * partake in the clustered write.
 		 */
-		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
+		if (((tbp = findblk(vp, start_lbn)) == NULL) ||
 		  ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) ||
 		  BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
 			++start_lbn;
@@ -818,7 +818,7 @@
 				 * If the adjacent data is not even in core it
 				 * can't need to be written.
 				 */
-				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
+				if ((tbp = findblk(vp, start_lbn)) == NULL) {
 					crit_exit();
 					break;
 				}
@@ -903,7 +903,6 @@
 			bundirty(tbp);
 			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
 			tbp->b_flags |= B_ASYNC;
-			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
 			crit_exit();
 			BUF_KERNPROC(tbp);
 			cluster_append(&bp->b_bio1, tbp);
Index: kern/vfs_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_lock.c,v
retrieving revision 1.9
diff -u -r1.9 vfs_lock.c
--- kern/vfs_lock.c	2 Mar 2006 19:07:59 -0000	1.9
+++ kern/vfs_lock.c	3 Mar 2006 18:26:04 -0000
@@ -494,6 +494,7 @@
 
 	RB_INIT(&vp->v_rbclean_tree);
 	RB_INIT(&vp->v_rbdirty_tree);
+	RB_INIT(&vp->v_rbhash_tree);
 	vp->v_type = VNON;
 	vp->v_tag = 0;
 	vp->v_ops = NULL;
Index: kern/vfs_subr.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.69
diff -u -r1.69 vfs_subr.c
--- kern/vfs_subr.c	2 Mar 2006 20:28:49 -0000	1.69
+++ kern/vfs_subr.c	3 Mar 2006 19:58:30 -0000
@@ -136,7 +136,8 @@
  * Red black tree functions
  */
 static int rb_buf_compare(struct buf *b1, struct buf *b2);
-RB_GENERATE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);
+RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, daddr_t, b_lblkno);
+RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, daddr_t, b_lblkno);
 
 static int
 rb_buf_compare(struct buf *b1, struct buf *b2)
@@ -356,6 +357,8 @@
 
 	if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
 		panic("vinvalbuf: flush failed");
+	if (!RB_EMPTY(&vp->v_rbhash_tree))
+		panic("vinvalbuf: flush failed, buffers still present");
 	return (0);
 }
 
@@ -785,17 +788,22 @@
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
+	KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI)) == 0);
+	KKASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0);
 
 	vhold(vp);
-	bp->b_vp = vp;
 	/*
 	 * Insert onto list for new vnode.
 	 */
 	crit_enter();
+	bp->b_vp = vp;
+	bp->b_flags |= B_HASHED;
+	if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp))
+		panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
+
 	bp->b_xflags |= BX_VNCLEAN;
-	bp->b_xflags &= ~BX_VNDIRTY;
 	if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
-		panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
+		panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp);
 	crit_exit();
 }
 
@@ -821,6 +829,10 @@
 			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 	}
+	if (bp->b_flags & B_HASHED) {
+		buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp);
+		bp->b_flags &= ~B_HASHED;
+	}
 	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
 		vp->v_flag &= ~VONWORKLST;
 		LIST_REMOVE(vp, v_synclist);
@@ -841,6 +853,7 @@
 pbgetvp(struct vnode *vp, struct buf *bp)
 {
 	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
+	KKASSERT((bp->b_flags & B_HASHED) == 0);
 
 	bp->b_vp = vp;
 	bp->b_flags |= B_PAGING;
@@ -853,37 +866,23 @@
 pbrelvp(struct buf *bp)
 {
 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
+	KKASSERT((bp->b_flags & B_HASHED) == 0);
 
 	bp->b_vp = NULL;
 	bp->b_flags &= ~B_PAGING;
 }
 
-void
-pbreassignbuf(struct buf *bp, struct vnode *newvp)
-{
-	if ((bp->b_flags & B_PAGING) == 0) {
-		panic(
-		    "pbreassignbuf() on non phys bp %p", 
-		    bp
-		);
-	}
-	bp->b_vp = newvp;
-}
-
 /*
- * Reassign a buffer from one vnode to another.
- * Used to assign file specific control information
- * (indirect blocks) to the vnode to which they belong.
+ * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.
+ * This routine is called when the state of the B_DELWRI bit is changed.
  */
 void
-reassignbuf(struct buf *bp, struct vnode *newvp)
+reassignbuf(struct buf *bp)
 {
+	struct vnode *vp = bp->b_vp;
 	int delay;
 
-	if (newvp == NULL) {
-		printf("reassignbuf: NULL");
-		return;
-	}
+	KKASSERT(vp != NULL);
 	++reassignbufcalls;
 
 	/*
@@ -894,34 +893,30 @@
 		panic("cannot reassign paging buffer");
 
 	crit_enter();
-	/*
-	 * Delete from old vnode list, if on one.
-	 */
-	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
-		if (bp->b_xflags & BX_VNDIRTY)
-			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbdirty_tree, bp);
-		else 
-			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbclean_tree, bp);
-		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
-		if (bp->b_vp != newvp) {
-			vdrop(bp->b_vp);
-			bp->b_vp = NULL;	/* for clarification */
-		}
-	}
-	/*
-	 * If dirty, put on list of dirty buffers; otherwise insert onto list
-	 * of clean buffers.
-	 */
 	if (bp->b_flags & B_DELWRI) {
-		if ((newvp->v_flag & VONWORKLST) == 0) {
-			switch (newvp->v_type) {
+		/*
+		 * Move to the dirty list, add the vnode to the worklist
+		 */
+		if (bp->b_xflags & BX_VNCLEAN) {
+			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
+			bp->b_xflags &= ~BX_VNCLEAN;
+		}
+		if ((bp->b_xflags & BX_VNDIRTY) == 0) {
+			if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) {
+				panic("reassignbuf: dup lblk vp %p bp %p",
+				      vp, bp);
+			}
+			bp->b_xflags |= BX_VNDIRTY;
+		}
+		if ((vp->v_flag & VONWORKLST) == 0) {
+			switch (vp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 			case VBLK:
-				if (newvp->v_rdev && 
-				    newvp->v_rdev->si_mountpoint != NULL) {
+				if (vp->v_rdev && 
+				    vp->v_rdev->si_mountpoint != NULL) {
 					delay = metadelay;
 					break;
 				}
@@ -929,24 +924,29 @@
 			default:
 				delay = filedelay;
 			}
-			vn_syncer_add_to_worklist(newvp, delay);
+			vn_syncer_add_to_worklist(vp, delay);
 		}
-		bp->b_xflags |= BX_VNDIRTY;
-		if (buf_rb_tree_RB_INSERT(&newvp->v_rbdirty_tree, bp))
-			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
 	} else {
-		bp->b_xflags |= BX_VNCLEAN;
-		if (buf_rb_tree_RB_INSERT(&newvp->v_rbclean_tree, bp))
-			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
-		if ((newvp->v_flag & VONWORKLST) &&
-		    RB_EMPTY(&newvp->v_rbdirty_tree)) {
-			newvp->v_flag &= ~VONWORKLST;
-			LIST_REMOVE(newvp, v_synclist);
+		/*
+		 * Move to the clean list, remove the vnode from the worklist
+		 * if no dirty blocks remain.
+		 */
+		if (bp->b_xflags & BX_VNDIRTY) {
+			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
+			bp->b_xflags &= ~BX_VNDIRTY;
+		}
+		if ((bp->b_xflags & BX_VNCLEAN) == 0) {
+			if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) {
+				panic("reassignbuf: dup lblk vp %p bp %p",
+				      vp, bp);
+			}
+			bp->b_xflags |= BX_VNCLEAN;
+		}
+		if ((vp->v_flag & VONWORKLST) &&
+		    RB_EMPTY(&vp->v_rbdirty_tree)) {
+			vp->v_flag &= ~VONWORKLST;
+			LIST_REMOVE(vp, v_synclist);
 		}
-	}
-	if (bp->b_vp != newvp) {
-		bp->b_vp = newvp;
-		vhold(bp->b_vp);
 	}
 	crit_exit();
 }
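
    To recap the lifecycle the vfs_subr.c changes above implement: an
    associated buffer always sits on the vnode's hash tree (B_HASHED)
    and on exactly one of the clean or dirty trees, and reassignbuf()
    now does nothing but move it between those two when B_DELWRI
    changes.  A sketch of the intended call sequence (illustrative
    only, this driver function exists nowhere in the tree):

        void
        buffer_lifecycle_sketch(struct vnode *vp, struct buf *bp)
        {
                bgetvp(vp, bp);   /* sets B_HASHED and BX_VNCLEAN; bp
                                   * enters v_rbhash_tree and
                                   * v_rbclean_tree */
                bdirty(bp);       /* sets B_DELWRI, calls reassignbuf():
                                   * clean tree -> dirty tree, vnode
                                   * goes on the syncer worklist */
                bundirty(bp);     /* clears B_DELWRI, calls reassignbuf():
                                   * dirty tree -> clean tree */
                brelvp(bp);       /* removes bp from all trees, clears
                                   * B_HASHED and BX_VNCLEAN, drops the
                                   * vnode hold */
        }
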
Index: sys/buf.h
===================================================================
RCS file: /cvs/src/sys/sys/buf.h,v
retrieving revision 1.23
diff -u -r1.23 buf.h
--- sys/buf.h	2 Mar 2006 19:26:17 -0000	1.23
+++ sys/buf.h	3 Mar 2006 19:38:29 -0000
@@ -75,7 +75,9 @@
 #define NBUF_BIO	4
 
 struct buf_rb_tree;
-RB_PROTOTYPE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);
+struct buf_rb_hash;
+RB_PROTOTYPE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, daddr_t, b_lblkno);
+RB_PROTOTYPE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, daddr_t, b_lblkno);
 
 /*
  * To avoid including <ufs/ffs/softdep.h> 
@@ -138,8 +140,8 @@
  *	unrelated to the vnode/device whos strategy routine was called.
  */
 struct buf {
-	LIST_ENTRY(buf) b_hash;		/* Hash chain. */
-	RB_ENTRY(buf) b_rbnode;		/* Red-Black node in vnode RB tree */
+	RB_ENTRY(buf) b_rbnode;		/* RB node in vnode clean/dirty tree */
+	RB_ENTRY(buf) b_rbhash;		/* RB node in vnode hash tree */
 	TAILQ_ENTRY(buf) b_freelist;	/* Free list position if not active. */
 	struct buf *b_cluster_next;	/* Next buffer (cluster code) */
 	struct vnode *b_vp;		/* (vp, lblkno) index */
@@ -229,7 +231,7 @@
 #define	B_DIRECT	0x00000008	/* direct I/O flag (pls free vmio) */
 #define	B_DEFERRED	0x00000010	/* Skipped over for cleaning */
 #define	B_CACHE		0x00000020	/* Bread found us in the cache. */
-#define	B_UNUSED40 	0x00000040 	/* Unused */
+#define	B_HASHED 	0x00000040 	/* Indexed via v_rbhash_tree */
 #define	B_DELWRI	0x00000080	/* Delay I/O until buffer reused. */
 #define	B_FREEBUF	0x00000100	/* Instruct driver: free blocks */
 #define	B_DONE		0x00000200	/* I/O completed. */
@@ -339,7 +341,6 @@
 
 struct uio;
 
-caddr_t bufhashinit (caddr_t);
 void	bufinit (void);
 void	bwillwrite (void);
 int	buf_dirty_count_severe (void);
@@ -360,9 +361,8 @@
 void	bqrelse (struct buf *);
 int	vfs_bio_awrite (struct buf *);
 struct buf *getpbuf (int *);
-struct buf *incore (struct vnode *, daddr_t);
-struct buf *gbincore (struct vnode *, daddr_t);
 int	inmem (struct vnode *, daddr_t);
+struct buf *findblk (struct vnode *, daddr_t);
 struct buf *getblk (struct vnode *, daddr_t, int, int, int);
 struct buf *geteblk (int);
 struct bio *push_bio(struct bio *);
@@ -390,8 +390,7 @@
 void	pbgetvp (struct vnode *, struct buf *);
 void	pbrelvp (struct buf *);
 int	allocbuf (struct buf *bp, int size);
-void	reassignbuf (struct buf *, struct vnode *);
-void	pbreassignbuf (struct buf *, struct vnode *);
+void	reassignbuf (struct buf *);
 struct	buf *trypbuf (int *);
 
 #endif /* _KERNEL */
Index: sys/vnode.h
===================================================================
RCS file: /cvs/src/sys/sys/vnode.h,v
retrieving revision 1.40
diff -u -r1.40 vnode.h
--- sys/vnode.h	17 Feb 2006 19:18:07 -0000	1.40
+++ sys/vnode.h	3 Mar 2006 18:25:23 -0000
@@ -155,6 +155,7 @@
  *	 journaling is turned on or off.
  */
 RB_HEAD(buf_rb_tree, buf);
+RB_HEAD(buf_rb_hash, buf);
 
 struct vnode {
 	u_long	v_flag;				/* vnode flags (see below) */
@@ -171,6 +172,7 @@
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* vnodes for mount point */
 	struct buf_rb_tree v_rbclean_tree;	/* RB tree of clean bufs */
 	struct buf_rb_tree v_rbdirty_tree;	/* RB tree of dirty bufs */
+	struct buf_rb_hash v_rbhash_tree;	/* RB tree general lookup */
 	LIST_ENTRY(vnode) v_synclist;		/* vnodes with dirty buffers */
 	enum	vtype v_type;			/* vnode type */
 	union {
Index: vfs/nfs/nfs_bio.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_bio.c,v
retrieving revision 1.26
diff -u -r1.26 nfs_bio.c
--- vfs/nfs/nfs_bio.c	17 Feb 2006 19:18:07 -0000	1.26
+++ vfs/nfs/nfs_bio.c	3 Mar 2006 19:12:19 -0000
@@ -458,7 +458,7 @@
 		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
 			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
 			rabn = lbn + 1 + nra;
-			if (!incore(vp, rabn)) {
+			if (!findblk(vp, rabn)) {
 			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
 			    if (!rabp)
 				return (EINTR);
@@ -642,7 +642,7 @@
 		    (np->n_direofoffset == 0 ||
 		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
 		    !(np->n_flag & NQNFSNONCACHE) &&
-		    !incore(vp, lbn + 1)) {
+		    !findblk(vp, lbn + 1)) {
 			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
 			if (rabp) {
 			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
Index: vfs/nfs/nfs_serv.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_serv.c,v
retrieving revision 1.26
diff -u -r1.26 nfs_serv.c
--- vfs/nfs/nfs_serv.c	1 Mar 2006 00:21:58 -0000	1.26
+++ vfs/nfs/nfs_serv.c	3 Mar 2006 19:05:43 -0000
@@ -2057,6 +2057,17 @@
 		if (!error) {
 			nqsrv_getl(dvp, ND_WRITE);
 			nqsrv_getl(vp, ND_WRITE);
+			if (dvp) {
+				if (dvp == vp)
+					vrele(dvp);
+				else
+					vput(dvp);
+				dvp = NULL;
+			}
+			if (vp) {
+				vput(vp);
+				vp = NULL;
+			}
 			error = VOP_NREMOVE(nd.nl_ncp, nd.nl_cred);
 		}
 	}
@@ -3531,9 +3542,10 @@
 			 * have to lock and write it.  Otherwise the prior
 			 * write is assumed to have already been committed.
 			 */
-			if ((bp = gbincore(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) {
+			if ((bp = findblk(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-					BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL);
+					if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
+						BUF_UNLOCK(bp);
 					continue; /* retry */
 				}
 				bremfree(bp);
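
    An aside on the BUF_LOCK change just above: the old code slept for
    the lock with LK_SLEEPFAIL but discarded the return value, so on the
    rare path where that attempt actually succeeded the buffer stayed
    locked forever.  A sketch of the corrected retry pattern, reduced to
    its essentials:

        for (;;) {
                if ((bp = findblk(vp, lblkno)) == NULL ||
                    (bp->b_flags & B_DELWRI) == 0)
                        break;          /* nothing dirty to commit */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0)
                        break;          /* locked it without sleeping */
                /*
                 * Contended.  Sleep until the buffer changes identity;
                 * if the LK_SLEEPFAIL attempt succeeds anyway, release
                 * the lock before looking the buffer up again.
                 */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
        }
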
Index: vfs/nfs/nfs_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_vnops.c,v
retrieving revision 1.48
diff -u -r1.48 nfs_vnops.c
--- vfs/nfs/nfs_vnops.c	2 Mar 2006 19:26:19 -0000	1.48
+++ vfs/nfs/nfs_vnops.c	3 Mar 2006 19:43:55 -0000
@@ -3316,7 +3316,6 @@
 	/*
 	 * Undirty the bp.  We will redirty it later if the I/O fails.
 	 */
-
 	crit_enter();
 	bundirty(bp);
 	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
@@ -3331,15 +3330,13 @@
 	BUF_KERNPROC(bp);
 	vn_strategy(bp->b_vp, &bp->b_bio1);
 
-	if((oldflags & B_ASYNC) == 0) {
+	if ((oldflags & B_ASYNC) == 0) {
 		int rtval = biowait(bp);
 
 		if (oldflags & B_DELWRI) {
-			crit_enter();
-			reassignbuf(bp, bp->b_vp);
-			crit_exit();
+			KKASSERT(bp->b_xflags & BX_VNDIRTY);
+			/*reassignbuf(bp);*/
 		}
-
 		brelse(bp);
 		return (rtval);
 	} 
Index: vfs/ufs/ffs_softdep.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_softdep.c,v
retrieving revision 1.34
diff -u -r1.34 ffs_softdep.c
--- vfs/ufs/ffs_softdep.c	17 Feb 2006 19:18:08 -0000	1.34
+++ vfs/ufs/ffs_softdep.c	3 Mar 2006 19:27:38 -0000
@@ -2266,8 +2266,11 @@
 	 * Otherwise we have to read the blocks in from the disk.
 	 */
 	ACQUIRE_LOCK(&lk);
-	if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
+	if ((bp = findblk(ip->i_devvp, dbn)) != NULL &&
 	    (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
+		/*
+		 * bp must be ir_savebp, which is held locked for our use.
+		 */
 		if (wk->wk_type != D_INDIRDEP ||
 		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
 		    (indirdep->ir_state & GOINGAWAY) == 0) {
Index: vfs/ufs/ufs_bmap.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ufs_bmap.c,v
retrieving revision 1.7
diff -u -r1.7 ufs_bmap.c
--- vfs/ufs/ufs_bmap.c	17 Feb 2006 19:18:08 -0000	1.7
+++ vfs/ufs/ufs_bmap.c	3 Mar 2006 19:14:52 -0000
@@ -165,7 +165,7 @@
 		 */
 
 		metalbn = xap->in_lbn;
-		if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn)
+		if ((daddr == 0 && !findblk(vp, metalbn)) || metalbn == bn)
 			break;
 		/*
 		 * If we get here, we've either got the block in the cache
