VFS/BIO RB patch #1 clean/dirty buffer cache patch to convert to red-black trees

Matthew Dillon dillon at apollo.backplane.com
Sat Apr 9 13:57:31 PDT 2005


    Here is a preliminary patch set that converts VFS/BIO's clean and dirty
    buffer lists to red-black trees.  This is a fairly serious patch set
    as it involves essentially rewriting all the various filesystems' fsync
    procedures.  It also introduces a new RB_SCAN() function which performs
    a ranged scan with a per-node callback, and which fully supports the
    callback blocking and the tree changing during the callback or blocked
    callback without blowing up the scan.  (Maybe I should patent the
    algorithm! <grin>).
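
    As an illustration, here is a condensed sketch of the RB_SCAN() calling
    convention the patch uses everywhere (modeled on the vtruncbuf() hunks
    below; it is not standalone compilable code and the example_* names are
    made up for this sketch).  The compare callback classifies a node
    relative to the desired range: < 0 below the range, 0 inside it, and
    > 0 beyond it.  The per-node callback returns >= 0 to keep scanning
    (the values are summed into RB_SCAN()'s return value) or < 0 to abort,
    and it may block or delete nodes without corrupting the scan.

	/*
	 * Select all buffers at or beyond the truncation point.
	 */
	static int
	example_range_cmp(struct buf *bp, void *data)
	{
		if (bp->b_lblkno >= *(daddr_t *)data)
			return(0);
		return(-1);
	}

	/*
	 * Invalidate one buffer.  Returning 1 bumps RB_SCAN()'s count so
	 * the caller knows work was done (BUF_LOCK handling elided here
	 * for brevity).
	 */
	static int
	example_invalidate(struct buf *bp, void *data)
	{
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		return(1);
	}

	/*
	 * Rescan until a pass finds nothing left to do.
	 */
	static void
	example_truncate(struct vnode *vp, daddr_t trunclbn)
	{
		int count;

		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					example_range_cmp, example_invalidate,
					&trunclbn);
		} while (count);
	}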

    This patch also changes the periodic filesystem syncer to take advantage
    of the new topology and do partial fsyncs when more than 1MB of dirty
    buffers are present for any given file.  The feature can be tested by
    setting vfs.write_behind to 0, using 'dd' to copy a 4MB file, and then
    observing disk I/O over the next minute or so.
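
    For reference, here is roughly what the simpler filesystems' fsync
    routines reduce to with this patch (it mirrors the hpfs/msdosfs hunks
    below; example_fsync and xxx_update are placeholder names, the latter
    standing in for the filesystem's own inode/denode update call):

	static int
	example_fsync(struct vop_fsync_args *ap)
	{
		struct vnode *vp = ap->a_vp;

		/*
		 * vfsync() walks the vnode's dirty buffer RB tree: it does
		 * asynchronous data-only and meta-data-only passes for
		 * MNT_NOWAIT, repeated data + meta-data passes for MNT_WAIT,
		 * and a partial flush limited to roughly 1MB of data for
		 * the syncer's MNT_LAZY case.
		 */
		vfsync(vp, ap->a_waitfor, 0, (daddr_t)-1, NULL, NULL);
		return (xxx_update(vp, ap->a_waitfor == MNT_WAIT));
	}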

    --

    A beefed-up version of the new partial-fsync feature will soon replace
    the current write_behind heuristic (not part of this patch), allowing
    larger amounts of dirty data to be buffered without initiating disk I/O
    and allowing far larger linear data aggregations under load.

    I've decided that the patch is sophisticated enough that I will not
    be committing it this weekend.  It needs more testing.  I might commit
    it during a reckless moment at USENIX :-)

					-Matt
					Matthew Dillon 
					<dillon at xxxxxxxxxxxxx>

Index: kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.34
diff -u -r1.34 vfs_bio.c
--- kern/vfs_bio.c	23 Mar 2005 20:37:03 -0000	1.34
+++ kern/vfs_bio.c	9 Apr 2005 02:51:17 -0000
@@ -147,15 +147,20 @@
 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW,
 	&bufreusecnt, 0, "");
 
+#if 0
 /*
  * Disable background writes for now.  There appear to be races in the 
  * flags tests and locking operations as well as races in the completion
  * code modifying the original bp (origbp) without holding a lock, assuming
  * splbio protection when there might not be splbio protection.
+ *
+ * XXX disable also because the RB tree can't handle multiple blocks with
+ * the same lblkno.
  */
 static int dobkgrdwrite = 0;
 SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
 	"Do background writes (honoring the BV_BKGRDWRITE flag)?");
+#endif
 
 static int bufhashmask;
 static int bufhashshift;
@@ -673,6 +678,7 @@
 	/* Mark the buffer clean */
 	bundirty(bp);
 
+#if 0
 	/*
 	 * If this buffer is marked for background writing and we
 	 * do not have to wait for it, make a copy and write the
@@ -694,13 +700,13 @@
 
 		/* set it to be identical to the old block */
 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
-		bgetvp(bp->b_vp, newbp);
 		newbp->b_lblkno = bp->b_lblkno;
 		newbp->b_blkno = bp->b_blkno;
 		newbp->b_offset = bp->b_offset;
 		newbp->b_iodone = vfs_backgroundwritedone;
 		newbp->b_flags |= B_ASYNC | B_CALL;
 		newbp->b_flags &= ~B_INVAL;
+		bgetvp(bp->b_vp, newbp);
 
 		/* move over the dependencies */
 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_movedeps)
@@ -719,9 +725,10 @@
 		bqrelse(bp);
 		bp = newbp;
 	}
+#endif
 
 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
-	bp->b_flags |= B_WRITEINPROG | B_CACHE;
+	bp->b_flags |= B_CACHE;
 
 	bp->b_vp->v_numoutput++;
 	vfs_busy_pages(bp, 1);
Index: kern/vfs_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_lock.c,v
retrieving revision 1.5
diff -u -r1.5 vfs_lock.c
--- kern/vfs_lock.c	19 Jan 2005 18:57:00 -0000	1.5
+++ kern/vfs_lock.c	9 Apr 2005 02:51:17 -0000
@@ -492,8 +492,8 @@
 		numvnodes++;
 	}
 
-	TAILQ_INIT(&vp->v_cleanblkhd);
-	TAILQ_INIT(&vp->v_dirtyblkhd);
+	RB_INIT(&vp->v_rbclean_tree);
+	RB_INIT(&vp->v_rbdirty_tree);
 	vp->v_type = VNON;
 	vp->v_tag = 0;
 	vp->v_ops = NULL;
Index: kern/vfs_subr.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.53
diff -u -r1.53 vfs_subr.c
--- kern/vfs_subr.c	4 Mar 2005 02:21:48 -0000	1.53
+++ kern/vfs_subr.c	9 Apr 2005 02:51:17 -0000
@@ -132,6 +132,22 @@
 extern struct vnodeopv_entry_desc spec_vnodeop_entries[];
 
 /*
+ * Red black tree functions
+ */
+static int rb_buf_compare(struct buf *b1, struct buf *b2);
+RB_GENERATE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);
+
+static int
+rb_buf_compare(struct buf *b1, struct buf *b2)
+{
+	if (b1->b_lblkno < b2->b_lblkno)
+		return(-1);
+	if (b1->b_lblkno > b2->b_lblkno)
+		return(1);
+	return(0);
+}
+
+/*
  * Return 0 if the vnode is already on the free list or cannot be placed
  * on the free list.  Return 1 if the vnode can be placed on the free list.
  */
@@ -249,7 +265,6 @@
 {
 	struct vnode *vp;
 
-	bp->b_flags &= ~B_WRITEINPROG;
 	if ((vp = bp->b_vp)) {
 		vp->v_numoutput--;
 		if (vp->v_numoutput < 0)
@@ -266,15 +281,27 @@
  *
  * vp must be locked.
  */
+static int vinvalbuf_bp(struct buf *bp, void *data);
+
+struct vinvalbuf_bp_info {
+	struct vnode *vp;
+	int slptimeo;
+	int slpflag;
+	int flags;
+};
+
 int
 vinvalbuf(struct vnode *vp, int flags, struct thread *td,
 	int slpflag, int slptimeo)
 {
-	struct buf *bp;
-	struct buf *nbp, *blist;
+	struct vinvalbuf_bp_info info;
 	int s, error;
 	vm_object_t object;
 
+	/*
+	 * If we are being asked to save, call fsync to ensure that the inode
+	 * is updated.
+	 */
 	if (flags & V_SAVE) {
 		s = splbio();
 		while (vp->v_numoutput) {
@@ -286,66 +313,33 @@
 				return (error);
 			}
 		}
-		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
 			splx(s);
 			if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
 				return (error);
 			s = splbio();
 			if (vp->v_numoutput > 0 ||
-			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
+			    !RB_EMPTY(&vp->v_rbdirty_tree))
 				panic("vinvalbuf: dirty bufs");
 		}
 		splx(s);
   	}
 	s = splbio();
-	for (;;) {
-		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
-		if (!blist)
-			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
-		if (!blist)
-			break;
-
-		for (bp = blist; bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-				error = BUF_TIMELOCK(bp,
-				    LK_EXCLUSIVE | LK_SLEEPFAIL,
-				    "vinvalbuf", slpflag, slptimeo);
-				if (error == ENOLCK)
-					break;
-				splx(s);
-				return (error);
-			}
-			/*
-			 * XXX Since there are no node locks for NFS, I
-			 * believe there is a slight chance that a delayed
-			 * write will occur while sleeping just above, so
-			 * check for it.  Note that vfs_bio_awrite expects
-			 * buffers to reside on a queue, while VOP_BWRITE and
-			 * brelse do not.
-			 */
-			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
-				(flags & V_SAVE)) {
+	info.slptimeo = slptimeo;
+	info.slpflag = slpflag;
+	info.flags = flags;
+	info.vp = vp;
 
-				if (bp->b_vp == vp) {
-					if (bp->b_flags & B_CLUSTEROK) {
-						BUF_UNLOCK(bp);
-						vfs_bio_awrite(bp);
-					} else {
-						bremfree(bp);
-						bp->b_flags |= B_ASYNC;
-						VOP_BWRITE(bp->b_vp, bp);
-					}
-				} else {
-					bremfree(bp);
-					(void) VOP_BWRITE(bp->b_vp, bp);
-				}
-				break;
-			}
-			bremfree(bp);
-			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
-			bp->b_flags &= ~B_ASYNC;
-			brelse(bp);
+	/*
+	 * Flush the buffer cache until nothing is left.
+	 */
+	while (!RB_EMPTY(&vp->v_rbclean_tree) || 
+	    !RB_EMPTY(&vp->v_rbdirty_tree)) {
+		error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
+			vinvalbuf_bp, &info);
+		if (error == 0) {
+			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
+					vinvalbuf_bp, &info);
 		}
 	}
 
@@ -375,11 +369,61 @@
 			(flags & V_SAVE) ? TRUE : FALSE);
 	}
 
-	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
+	if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
 		panic("vinvalbuf: flush failed");
 	return (0);
 }
 
+static int
+vinvalbuf_bp(struct buf *bp, void *data)
+{
+	struct vinvalbuf_bp_info *info = data;
+	int error;
+
+	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+		error = BUF_TIMELOCK(bp,
+		    LK_EXCLUSIVE | LK_SLEEPFAIL,
+		    "vinvalbuf", info->slpflag, info->slptimeo);
+		if (error == 0) {
+			BUF_UNLOCK(bp);
+			error = ENOLCK;
+		}
+		if (error == ENOLCK)
+			return(0);
+		return (-error);
+	}
+	/*
+	 * XXX Since there are no node locks for NFS, I
+	 * believe there is a slight chance that a delayed
+	 * write will occur while sleeping just above, so
+	 * check for it.  Note that vfs_bio_awrite expects
+	 * buffers to reside on a queue, while VOP_BWRITE and
+	 * brelse do not.
+	 */
+	if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
+	    (info->flags & V_SAVE)) {
+		if (bp->b_vp == info->vp) {
+			if (bp->b_flags & B_CLUSTEROK) {
+				BUF_UNLOCK(bp);
+				vfs_bio_awrite(bp);
+			} else {
+				bremfree(bp);
+				bp->b_flags |= B_ASYNC;
+				VOP_BWRITE(bp->b_vp, bp);
+			}
+		} else {
+			bremfree(bp);
+			VOP_BWRITE(bp->b_vp, bp);
+		}
+	} else {
+		bremfree(bp);
+		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
+		bp->b_flags &= ~B_ASYNC;
+		brelse(bp);
+	}
+	return(0);
+}
+
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
@@ -387,92 +431,52 @@
  *
  * The vnode must be locked.
  */
+static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
+static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
+static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
+static int vtruncbuf_bp_metasync(struct buf *bp, void *data);
+
 int
 vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize)
 {
-	struct buf *bp;
-	struct buf *nbp;
-	int s, anyfreed;
-	int trunclbn;
+	daddr_t trunclbn;
+	int count;
+	int s;
 
 	/*
-	 * Round up to the *next* lbn.
+	 * Round up to the *next* lbn, then destroy the buffers in question.  
+	 * Since we are only removing some of the buffers we must rely on the
+	 * scan count to determine whether a loop is necessary.
 	 */
 	trunclbn = (length + blksize - 1) / blksize;
 
 	s = splbio();
-restart:
-	anyfreed = 1;
-	for (;anyfreed;) {
-		anyfreed = 0;
-		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-			if (bp->b_lblkno >= trunclbn) {
-				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
-					goto restart;
-				} else {
-					bremfree(bp);
-					bp->b_flags |= (B_INVAL | B_RELBUF);
-					bp->b_flags &= ~B_ASYNC;
-					brelse(bp);
-					anyfreed = 1;
-				}
-				if (nbp &&
-				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
-				    (nbp->b_vp != vp) ||
-				    (nbp->b_flags & B_DELWRI))) {
-					goto restart;
-				}
-			}
-		}
-
-		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-			if (bp->b_lblkno >= trunclbn) {
-				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
-					goto restart;
-				} else {
-					bremfree(bp);
-					bp->b_flags |= (B_INVAL | B_RELBUF);
-					bp->b_flags &= ~B_ASYNC;
-					brelse(bp);
-					anyfreed = 1;
-				}
-				if (nbp &&
-				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
-				    (nbp->b_vp != vp) ||
-				    (nbp->b_flags & B_DELWRI) == 0)) {
-					goto restart;
-				}
-			}
-		}
-	}
+	do {
+		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, 
+				vtruncbuf_bp_trunc_cmp,
+				vtruncbuf_bp_trunc, &trunclbn);
+		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
+				vtruncbuf_bp_trunc_cmp,
+				vtruncbuf_bp_trunc, &trunclbn);
+	} while(count);
 
+	/*
+	 * For safety, fsync any remaining metadata if the file is not being
+	 * truncated to 0.  Since the metadata does not represent the entire
+	 * dirty list we have to rely on the hit count to ensure that we get
+	 * all of it.
+	 */
 	if (length > 0) {
-restartsync:
-		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
-				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
-					goto restart;
-				} else {
-					bremfree(bp);
-					if (bp->b_vp == vp) {
-						bp->b_flags |= B_ASYNC;
-					} else {
-						bp->b_flags &= ~B_ASYNC;
-					}
-					VOP_BWRITE(bp->b_vp, bp);
-				}
-				goto restartsync;
-			}
-
-		}
+		do {
+			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
+					vtruncbuf_bp_metasync_cmp,
+					vtruncbuf_bp_metasync, vp);
+		} while (count);
 	}
 
+	/*
+	 * Wait for any in-progress I/O to complete before returning (why?)
+	 */
 	while (vp->v_numoutput > 0) {
 		vp->v_flag |= VBWAIT;
 		tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
@@ -486,6 +490,311 @@
 }
 
 /*
+ * The callback buffer is beyond the new file EOF and must be destroyed.
+ * Note that the compare function must conform to the RB_SCAN's requirements.
+ */
+static
+int
+vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
+{
+	if (bp->b_lblkno >= *(daddr_t *)data)
+		return(0);
+	return(-1);
+}
+
+static 
+int 
+vtruncbuf_bp_trunc(struct buf *bp, void *data)
+{
+	/*
+	 * Do not try to use a buffer we cannot immediately lock, but sleep
+	 * anyway to prevent a livelock.  The code will loop until all buffers
+	 * can be acted upon.
+	 */
+	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+		if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
+			BUF_UNLOCK(bp);
+	} else {
+		bremfree(bp);
+		bp->b_flags |= (B_INVAL | B_RELBUF);
+		bp->b_flags &= ~B_ASYNC;
+		brelse(bp);
+	}
+	return(1);
+}
+
+/*
+ * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
+ * blocks (with a negative lblkno) are scanned.
+ * Note that the compare function must conform to the RB_SCAN's requirements.
+ */
+static int
+vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
+{
+	if (bp->b_lblkno < 0)
+		return(0);
+	return(1);
+}
+
+static int
+vtruncbuf_bp_metasync(struct buf *bp, void *data)
+{
+	struct vnode *vp = data;
+
+	if (bp->b_flags & B_DELWRI) {
+		/*
+		 * Do not try to use a buffer we cannot immediately lock,
+		 * but sleep anyway to prevent a livelock.  The code will
+		 * loop until all buffers can be acted upon.
+		 */
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
+			if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
+				BUF_UNLOCK(bp);
+		} else {
+			bremfree(bp);
+			if (bp->b_vp == vp) {
+				bp->b_flags |= B_ASYNC;
+			} else {
+				bp->b_flags &= ~B_ASYNC;
+			}
+			VOP_BWRITE(bp->b_vp, bp);
+		}
+		return(1);
+	} else {
+		return(0);
+	}
+}
+
+/*
+ * vfsync - implements a multipass fsync on a file which understands
+ * dependencies and meta-data.  The passed vnode must be locked.  The
+ * waitfor argument may be MNT_WAIT, MNT_NOWAIT, or MNT_LAZY.
+ *
+ * When fsyncing data asynchronously just do one consolidated pass starting
+ * with the most negative block number.  This may not get all the data due
+ * to dependencies.
+ *
+ * When fsyncing data synchronously do a data pass, then a metadata pass,
+ * then do additional data+metadata passes to try to get all the data out.
+ */
+static int vfsync_wait_output(struct vnode *vp, 
+			    int (*waitoutput)(struct vnode *, struct thread *));
+static int vfsync_data_only_cmp(struct buf *bp, void *data);
+static int vfsync_meta_only_cmp(struct buf *bp, void *data);
+static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
+static int vfsync_bp(struct buf *bp, void *data);
+
+struct vfsync_info {
+	struct vnode *vp;
+	int synchronous;
+	int syncdeps;
+	int lazycount;
+	int lazylimit;
+	daddr_t lbn;
+	int s;
+	int (*checkdef)(struct buf *);
+};
+
+int
+vfsync(struct vnode *vp, int waitfor, int passes, daddr_t lbn,
+	int (*checkdef)(struct buf *),
+	int (*waitoutput)(struct vnode *, struct thread *))
+{
+	struct vfsync_info info;
+	int error;
+
+	bzero(&info, sizeof(info));
+	info.vp = vp;
+	info.s = splbio();
+	info.lbn = lbn;
+	if ((info.checkdef = checkdef) == NULL)
+		info.syncdeps = 1;
+
+	switch(waitfor) {
+	case MNT_LAZY:
+		/*
+		 * Lazy (typically the filesystem syncer).  Asynchronous, plus
+		 * limit the number of data (not meta) pages we try to flush
+		 * to 1MB.  A non-zero return means the lazy limit was reached.
+		 */
+		info.lazylimit = 1024 * 1024;
+		info.syncdeps = 1;
+		error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 
+				vfsync_lazy_range_cmp, vfsync_bp, &info);
+		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, 
+				vfsync_meta_only_cmp, vfsync_bp, &info);
+		if (error == 0)
+			vp->v_lazyw = 0;
+		else if (!RB_EMPTY(&vp->v_rbdirty_tree))
+			vn_syncer_add_to_worklist(vp, 1);
+		error = 0;
+		break;
+	case MNT_NOWAIT:
+		/*
+		 * Asynchronous.  Do a data-only pass and a meta-only pass.
+		 */
+		info.syncdeps = 1;
+		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp, 
+			vfsync_bp, &info);
+		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp, 
+			vfsync_bp, &info);
+		error = 0;
+		break;
+	default:
+		/*
+		 * Synchronous.  Do a data-only pass, then a meta-data+data
+		 * pass, then additional integrated passes to try to get
+		 * all the dependencies flushed.
+		 */
+		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
+			vfsync_bp, &info);
+		error = vfsync_wait_output(vp, waitoutput);
+		if (error == 0) {
+			RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
+				vfsync_bp, &info);
+			error = vfsync_wait_output(vp, waitoutput);
+		}
+		while (error == 0 && passes > 0 &&
+		    !RB_EMPTY(&vp->v_rbdirty_tree)) {
+			if (--passes == 0) {
+				info.synchronous = 1;
+				info.syncdeps = 1;
+			}
+			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
+				vfsync_bp, &info);
+			if (error < 0)
+				error = -error;
+			info.syncdeps = 1;
+			if (error == 0)
+				error = vfsync_wait_output(vp, waitoutput);
+		}
+		break;
+	}
+	splx(info.s);
+	return(error);
+}
+
+static int
+vfsync_wait_output(struct vnode *vp, int (*waitoutput)(struct vnode *, struct thread *))
+{
+	int error = 0;
+
+	while (vp->v_numoutput) {
+		vp->v_flag |= VBWAIT;
+		tsleep(&vp->v_numoutput, 0, "fsfsn", 0);
+	}
+	if (waitoutput)
+		error = waitoutput(vp, curthread);
+	return(error);
+}
+
+static int
+vfsync_data_only_cmp(struct buf *bp, void *data)
+{
+	if (bp->b_lblkno < 0)
+		return(-1);
+	return(0);
+}
+
+static int
+vfsync_meta_only_cmp(struct buf *bp, void *data)
+{
+	if (bp->b_lblkno < 0)
+		return(0);
+	return(1);
+}
+
+static int
+vfsync_lazy_range_cmp(struct buf *bp, void *data)
+{
+	struct vfsync_info *info = data;
+	if (bp->b_lblkno < info->vp->v_lazyw)
+		return(-1);
+	return(0);
+}
+
+static int
+vfsync_bp(struct buf *bp, void *data)
+{
+	struct vfsync_info *info = data;
+	struct vnode *vp = info->vp;
+	int error;
+
+	/*
+	 * If syncdeps is not set we do not try to write buffers which have
+	 * dependencies.
+	 */
+	if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
+		return(0);
+
+	/*
+	 * Ignore buffers that we cannot immediately lock.  XXX
+	 */
+	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
+		return(0);
+	if ((bp->b_flags & B_DELWRI) == 0)
+		panic("vfsync_bp: buffer not dirty");
+	if (vp != bp->b_vp)
+		panic("vfsync_bp: buffer vp mismatch");
+
+	/*
+	 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
+	 * has been written but an additional handshake with the device
+	 * is required before we can dispose of the buffer.  We have no idea
+	 * how to do this so we have to skip these buffers.
+	 */
+	if (bp->b_flags & B_NEEDCOMMIT) {
+		BUF_UNLOCK(bp);
+		return(0);
+	}
+
+	/*
+	 * (LEGACY FROM UFS, REMOVE WHEN POSSIBLE) - invalidate any dirty
+	 * buffers beyond the file EOF. 
+	 */
+	if (info->lbn != (daddr_t)-1 && vp->v_type == VREG && 
+	    bp->b_lblkno >= info->lbn) {
+		bremfree(bp);
+		bp->b_flags |= B_INVAL | B_NOCACHE;
+		splx(info->s);
+		brelse(bp);
+		info->s = splbio();
+	}
+
+	if (info->synchronous) {
+		/*
+		 * Synchronous flushing.  An error may be returned.
+		 */
+		bremfree(bp);
+		splx(info->s);
+		error = bwrite(bp);
+		info->s = splbio();
+	} else { 
+		/*
+		 * Asynchronous flushing.  A negative return value simply
+		 * stops the scan and is not considered an error.  We use
+		 * this to support limited MNT_LAZY flushes.
+		 */
+		vp->v_lazyw = bp->b_lblkno;
+		if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
+			BUF_UNLOCK(bp);
+			info->lazycount += vfs_bio_awrite(bp);
+		} else {
+			info->lazycount += bp->b_bufsize;
+			bremfree(bp);
+			splx(info->s);
+			bawrite(bp);
+			info->s = splbio();
+		}
+		if (info->lazylimit && info->lazycount >= info->lazylimit)
+			error = 1;
+		else
+			error = 0;
+	}
+	return(-error);
+}
+
+/*
  * Associate a buffer with a vnode.
  */
 void
@@ -502,7 +811,8 @@
 	crit_enter();
 	bp->b_xflags |= BX_VNCLEAN;
 	bp->b_xflags &= ~BX_VNDIRTY;
-	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+	if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
+		panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
 	crit_exit();
 }
 
@@ -513,7 +823,6 @@
 brelvp(struct buf *bp)
 {
 	struct vnode *vp;
-	struct buflists *listheadp;
 
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
@@ -524,13 +833,12 @@
 	crit_enter();
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
 		if (bp->b_xflags & BX_VNDIRTY)
-			listheadp = &vp->v_dirtyblkhd;
-		else 
-			listheadp = &vp->v_cleanblkhd;
-		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
+		else
+			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 	}
-	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
 		vp->v_flag &= ~VONWORKLST;
 		LIST_REMOVE(vp, v_synclist);
 	}
@@ -564,15 +872,7 @@
 {
 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
 
-	/* XXX REMOVE ME */
-	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
-		panic(
-		    "relpbuf(): b_vp was probably reassignbuf()d %p %x", 
-		    bp,
-		    (int)bp->b_flags
-		);
-	}
-	bp->b_vp = (struct vnode *) 0;
+	bp->b_vp = NULL;
 	bp->b_flags &= ~B_PAGING;
 }
 
@@ -596,7 +896,6 @@
 void
 reassignbuf(struct buf *bp, struct vnode *newvp)
 {
-	struct buflists *listheadp;
 	int delay;
 
 	if (newvp == NULL) {
@@ -618,10 +917,9 @@
 	 */
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
 		if (bp->b_xflags & BX_VNDIRTY)
-			listheadp = &bp->b_vp->v_dirtyblkhd;
+			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbdirty_tree, bp);
 		else 
-			listheadp = &bp->b_vp->v_cleanblkhd;
-		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
+			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbclean_tree, bp);
 		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 		if (bp->b_vp != newvp) {
 			vdrop(bp->b_vp);
@@ -633,9 +931,6 @@
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
-		struct buf *tbp;
-
-		listheadp = &newvp->v_dirtyblkhd;
 		if ((newvp->v_flag & VONWORKLST) == 0) {
 			switch (newvp->v_type) {
 			case VDIR:
@@ -655,62 +950,14 @@
 			vn_syncer_add_to_worklist(newvp, delay);
 		}
 		bp->b_xflags |= BX_VNDIRTY;
-		tbp = TAILQ_FIRST(listheadp);
-		if (tbp == NULL ||
-		    bp->b_lblkno == 0 ||
-		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
-		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
-			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
-			++reassignbufsortgood;
-		} else if (bp->b_lblkno < 0) {
-			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
-			++reassignbufsortgood;
-		} else if (reassignbufmethod == 1) {
-			/*
-			 * New sorting algorithm, only handle sequential case,
-			 * otherwise append to end (but before metadata)
-			 */
-			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
-			    (tbp->b_xflags & BX_VNDIRTY)) {
-				/*
-				 * Found the best place to insert the buffer
-				 */
-				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
-				++reassignbufsortgood;
-			} else {
-				/*
-				 * Missed, append to end, but before meta-data.
-				 * We know that the head buffer in the list is
-				 * not meta-data due to prior conditionals.
-				 *
-				 * Indirect effects:  NFS second stage write
-				 * tends to wind up here, giving maximum 
-				 * distance between the unstable write and the
-				 * commit rpc.
-				 */
-				tbp = TAILQ_LAST(listheadp, buflists);
-				while (tbp && tbp->b_lblkno < 0)
-					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
-				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
-				++reassignbufsortbad;
-			}
-		} else {
-			/*
-			 * Old sorting algorithm, scan queue and insert
-			 */
-			struct buf *ttbp;
-			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
-			    (ttbp->b_lblkno < bp->b_lblkno)) {
-				++reassignbufloops;
-				tbp = ttbp;
-			}
-			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
-		}
+		if (buf_rb_tree_RB_INSERT(&newvp->v_rbdirty_tree, bp))
+			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
 	} else {
 		bp->b_xflags |= BX_VNCLEAN;
-		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
+		if (buf_rb_tree_RB_INSERT(&newvp->v_rbclean_tree, bp))
+			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
 		if ((newvp->v_flag & VONWORKLST) &&
-		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
+		    RB_EMPTY(&newvp->v_rbdirty_tree)) {
 			newvp->v_flag &= ~VONWORKLST;
 			LIST_REMOVE(newvp, v_synclist);
 		}
Index: kern/vfs_sync.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_sync.c,v
retrieving revision 1.3
diff -u -r1.3 vfs_sync.c
--- kern/vfs_sync.c	20 Jan 2005 17:52:03 -0000	1.3
+++ kern/vfs_sync.c	9 Apr 2005 02:51:17 -0000
@@ -216,22 +216,23 @@
 				vput(vp);
 			}
 			s = splbio();
+
+			/*
+			 * If the vnode is still at the head of the list
+			 * we were not able to completely flush it.  To
+			 * give other vnodes a fair shake we move it to
+			 * a later slot.
+			 *
+			 * Note that v_tag VT_VFS vnodes can remain on the
+			 * worklist with no dirty blocks, but sync_fsync()
+			 * moves it to a later slot so we will never see it
+			 * here.
+			 */
 			if (LIST_FIRST(slp) == vp) {
-				/*
-				 * Note: v_tag VT_VFS vps can remain on the
-				 * worklist too with no dirty blocks, but 
-				 * since sync_fsync() moves it to a different 
-				 * slot we are safe.
-				 */
-				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
-				    !vn_isdisk(vp, NULL))
+				if (RB_EMPTY(&vp->v_rbdirty_tree) &&
+				    !vn_isdisk(vp, NULL)) {
 					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
-				/*
-				 * Put us back on the worklist.  The worklist
-				 * routine will remove us from our current
-				 * position and then add us back in at a later
-				 * position.
-				 */
+				}
 				vn_syncer_add_to_worklist(vp, syncdelay);
 			}
 			splx(s);
Index: sys/buf.h
===================================================================
RCS file: /cvs/src/sys/sys/buf.h,v
diff -u -r1.11 buf.h
--- sys/buf.h	17 Jul 2004 01:45:37 -0000	1.11
+++ sys/buf.h	9 Apr 2005 02:51:17 -0000
@@ -56,12 +56,18 @@
 #ifndef _SYS_XIO_H_
 #include <sys/xio.h>
 #endif
+#ifndef _SYS_TREE_H_
+#include <sys/tree.h>
+#endif
 
 struct buf;
 struct mount;
 struct vnode;
 struct xio;
 
+struct buf_rb_tree;
+RB_PROTOTYPE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);
+
 /*
  * To avoid including <ufs/ffs/softdep.h> 
  */   
@@ -110,7 +116,7 @@
  */
 struct buf {
 	LIST_ENTRY(buf) b_hash;		/* Hash chain. */
-	TAILQ_ENTRY(buf) b_vnbufs;	/* Buffer's associated vnode. */
+	RB_ENTRY(buf) b_rbnode;		/* Red-Black node in vnode RB tree */
 	TAILQ_ENTRY(buf) b_freelist;	/* Free list position if not active. */
 	TAILQ_ENTRY(buf) b_act;		/* Device driver queue when active. *new* */
 	long	b_flags;		/* B_* flags. */
@@ -222,7 +228,7 @@
 #define	B_DONE		0x00000200	/* I/O completed. */
 #define	B_EINTR		0x00000400	/* I/O was interrupted */
 #define	B_ERROR		0x00000800	/* I/O error occurred. */
-#define	B_SCANNED	0x00001000	/* VOP_FSYNC funcs mark written bufs */
+#define	B_UNUSED1000	0x00001000
 #define	B_INVAL		0x00002000	/* Does not contain valid info. */
 #define	B_LOCKED	0x00004000	/* Locked in core (not reusable). */
 #define	B_NOCACHE	0x00008000	/* Do not cache block after use. */
@@ -235,7 +241,7 @@
 #define	B_RELBUF	0x00400000	/* Release VMIO buffer. */
 #define	B_WANT		0x00800000	/* Used by vm_pager.c */
 #define	B_WRITE		0x00000000	/* Write buffer (pseudo flag). */
-#define	B_WRITEINPROG	0x01000000	/* Write in progress. */
+#define	B_UNUSED1000000	0x01000000
 #define	B_XXX		0x02000000	/* Debugging flag. */
 #define	B_PAGING	0x04000000	/* volatile paging I/O -- bypass VMIO */
 #define	B_ORDERED	0x08000000	/* Must guarantee I/O ordering */
Index: sys/mount.h
===================================================================
RCS file: /cvs/src/sys/sys/mount.h,v
retrieving revision 1.17
diff -u -r1.17 mount.h
--- sys/mount.h	22 Mar 2005 22:13:33 -0000	1.17
+++ sys/mount.h	9 Apr 2005 02:51:17 -0000
@@ -273,7 +273,7 @@
  */
 #define MNT_WAIT	1	/* synchronously wait for I/O to complete */
 #define MNT_NOWAIT	2	/* start all I/O, but do not wait for it */
-#define MNT_LAZY	3	/* push data not written by filesystem syncer */
+#define MNT_LAZY	4	/* be lazy and do not necessarily push it all */
 
 /*
  * Generic file handle
Index: sys/tree.h
===================================================================
RCS file: /cvs/src/sys/sys/tree.h,v
retrieving revision 1.1
diff -u -r1.1 tree.h
--- sys/tree.h	19 Aug 2004 20:38:33 -0000	1.1
+++ sys/tree.h	9 Apr 2005 03:24:48 -0000
@@ -290,16 +290,25 @@
 	     (x) = SPLAY_NEXT(name, head, x))
 
 /* Macros that define a red-black tree */
+
+#define RB_SCAN_INFO(name, type)					\
+struct name##_scan_info {						\
+	struct name##_scan_info *link;					\
+	struct type	*node;						\
+}
+
 #define RB_HEAD(name, type)						\
 struct name {								\
-	struct type *rbh_root; /* root of the tree */			\
+	struct type *rbh_root; 		     /* root of the tree */	\
+	struct name##_scan_info *rbh_inprog; /* scans in progress */	\
 }
 
 #define RB_INITIALIZER(root)						\
-	{ NULL }
+	{ NULL, NULL }
 
 #define RB_INIT(root) do {						\
 	(root)->rbh_root = NULL;					\
+	(root)->rbh_inprog = NULL;					\
 } while (/*CONSTCOND*/ 0)
 
 #define RB_BLACK	0
@@ -317,6 +326,7 @@
 #define RB_PARENT(elm, field)		(elm)->field.rbe_parent
 #define RB_COLOR(elm, field)		(elm)->field.rbe_color
 #define RB_ROOT(head)			(head)->rbh_root
+#define RB_INPROG(head)			(head)->rbh_inprog
 #define RB_EMPTY(head)			(RB_ROOT(head) == NULL)
 
 #define RB_SET(elm, parent, field) do {					\
@@ -376,20 +386,20 @@
 
 /* Generates prototypes and inline functions */
 #define RB_PROTOTYPE(name, type, field, cmp)				\
-void name##_RB_INSERT_COLOR(struct name *, struct type *);	\
-void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\
 struct type *name##_RB_REMOVE(struct name *, struct type *);		\
 struct type *name##_RB_INSERT(struct name *, struct type *);		\
 struct type *name##_RB_FIND(struct name *, struct type *);		\
+int name##_RB_SCAN(struct name *, int (*)(struct type *, void *),	\
+			int (*)(struct type *, void *), void *);	\
 struct type *name##_RB_NEXT(struct type *);				\
 struct type *name##_RB_MINMAX(struct name *, int);			\
-									\
+RB_SCAN_INFO(name, type)						\
 
 /* Main rb operation.
  * Moves node close to the key of elm to top
  */
 #define RB_GENERATE(name, type, field, cmp)				\
-void									\
+static void								\
 name##_RB_INSERT_COLOR(struct name *head, struct type *elm)		\
 {									\
 	struct type *parent, *gparent, *tmp;				\
@@ -433,8 +443,9 @@
 	RB_COLOR(head->rbh_root, field) = RB_BLACK;			\
 }									\
 									\
-void									\
-name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \
+static void								\
+name##_RB_REMOVE_COLOR(struct name *head, struct type *parent,		\
+			struct type *elm) 				\
 {									\
 	struct type *tmp;						\
 	while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) &&	\
@@ -514,8 +525,16 @@
 struct type *								\
 name##_RB_REMOVE(struct name *head, struct type *elm)			\
 {									\
-	struct type *child, *parent, *old = elm;			\
+	struct type *child, *parent, *old;				\
+	struct name##_scan_info *inprog;				\
 	int color;							\
+									\
+	for (inprog = RB_INPROG(head); inprog; inprog = inprog->link) { \
+		if (inprog->node == elm) 				\
+			inprog->node = RB_NEXT(name, head, elm);	\
+	}								\
+									\
+	old = elm;							\
 	if (RB_LEFT(elm, field) == NULL)				\
 		child = RB_RIGHT(elm, field);				\
 	else if (RB_RIGHT(elm, field) == NULL)				\
@@ -594,7 +613,7 @@
 		else if (comp > 0)					\
 			tmp = RB_RIGHT(tmp, field);			\
 		else							\
-			return (tmp);					\
+			return(tmp);					\
 	}								\
 	RB_SET(elm, parent, field);					\
 	if (parent != NULL) {						\
@@ -627,6 +646,74 @@
 	return (NULL);							\
 }									\
 									\
+/*									\
+ * Issue a callback for all matching items.  The scan function must	\
+ * return < 0 for items below the desired range, 0 for items within	\
+ * the range, and > 0 for items beyond the range.   Any item may be	\
+ * deleted while the scan is in progress.				\
+ */									\
+static int								\
+name##_SCANCMP_ALL(struct type *type, void *data)			\
+{									\
+	return(0);							\
+}									\
+									\
+int									\
+name##_RB_SCAN(struct name *head,					\
+		int (*scancmp)(struct type *, void *),			\
+		int (*callback)(struct type *, void *),			\
+		void *data)						\
+{									\
+	struct name##_scan_info info;					\
+	struct name##_scan_info **infopp;				\
+	struct type *best;						\
+	struct type *tmp;						\
+	int count;							\
+	int comp;							\
+									\
+	if (scancmp == NULL)						\
+		scancmp = name##_SCANCMP_ALL;				\
+									\
+	/*								\
+	 * Locate the first element.					\
+	 */								\
+	tmp = RB_ROOT(head);						\
+	best = NULL;							\
+	while (tmp) {							\
+		comp = scancmp(tmp, data);				\
+		if (comp < 0) {						\
+			tmp = RB_RIGHT(tmp, field);			\
+		} else if (comp > 0) {					\
+			tmp = RB_LEFT(tmp, field);			\
+		} else {						\
+			best = tmp;					\
+			if (RB_LEFT(tmp, field) == NULL)		\
+				break;					\
+			tmp = RB_LEFT(tmp, field);			\
+		}							\
+	}								\
+	count = 0;							\
+	if (best) {							\
+		info.node = RB_NEXT(name, head, best);			\
+		info.link = RB_INPROG(head);				\
+		RB_INPROG(head) = &info;				\
+		while ((comp = callback(best, data)) >= 0) {		\
+			count += comp;					\
+			best = info.node;				\
+			if (best == NULL || scancmp(best, data) != 0)	\
+				break;					\
+			info.node = RB_NEXT(name, head, best);		\
+		}							\
+		if (comp < 0)	/* error or termination */		\
+			count = comp;					\
+		infopp = &RB_INPROG(head);				\
+		while (*infopp != &info) 				\
+			infopp = &(*infopp)->link;			\
+		*infopp = info.link;					\
+	}								\
+	return(count);							\
+}									\
+									\
 /* ARGSUSED */								\
 struct type *								\
 name##_RB_NEXT(struct type *elm)					\
@@ -670,6 +757,8 @@
 #define RB_INSERT(name, x, y)	name##_RB_INSERT(x, y)
 #define RB_REMOVE(name, x, y)	name##_RB_REMOVE(x, y)
 #define RB_FIND(name, x, y)	name##_RB_FIND(x, y)
+#define RB_SCAN(name, root, cmp, callback, data) 		\
+				name##_RB_SCAN(root, cmp, callback, data)
 #define RB_NEXT(name, x, y)	name##_RB_NEXT(y)
 #define RB_MIN(name, x)		name##_RB_MINMAX(x, RB_NEGINF)
 #define RB_MAX(name, x)		name##_RB_MINMAX(x, RB_INF)
Index: sys/vnode.h
===================================================================
RCS file: /cvs/src/sys/sys/vnode.h,v
retrieving revision 1.31
diff -u -r1.31 vnode.h
--- sys/vnode.h	15 Feb 2005 08:32:18 -0000	1.31
+++ sys/vnode.h	9 Apr 2005 02:51:17 -0000
@@ -49,6 +49,7 @@
 #endif
 #include <sys/vfsops.h>
 #include <sys/vfscache.h>
+#include <sys/tree.h>
 
 #include <machine/lock.h>
 
@@ -152,6 +153,8 @@
  *	 pointer because mnt_vn_use_ops may change dynamically when e.g.
  *	 journaling is turned on or off.
  */
+RB_HEAD(buf_rb_tree, buf);
+
 struct vnode {
 	u_long	v_flag;				/* vnode flags (see below) */
 	int	v_usecount;			/* reference count of users */
@@ -163,8 +166,8 @@
 	struct  vop_ops **v_ops;		/* vnode operations vector */
 	TAILQ_ENTRY(vnode) v_freelist;		/* vnode freelist */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* vnodes for mount point */
-	struct	buflists v_cleanblkhd;		/* clean blocklist head */
-	struct	buflists v_dirtyblkhd;		/* dirty blocklist head */
+	struct buf_rb_tree v_rbclean_tree;	/* RB tree of clean bufs */
+	struct buf_rb_tree v_rbdirty_tree;	/* RB tree of dirty bufs */
 	LIST_ENTRY(vnode) v_synclist;		/* vnodes with dirty buffers */
 	long	v_numoutput;			/* num of writes in progress */
 	enum	vtype v_type;			/* vnode type */
@@ -179,6 +182,7 @@
 		struct fifoinfo	*vu_fifoinfo;	/* fifo (VFIFO) */
 	} v_un;
 	struct	nqlease *v_lease;		/* Soft reference to lease */
+	daddr_t v_lazyw;			/* lazy write iterator */
 	daddr_t	v_lastw;			/* last write (write cluster) */
 	daddr_t	v_cstart;			/* start block of cluster */
 	daddr_t	v_lasta;			/* last allocation */
@@ -619,6 +623,9 @@
 	    struct thread *td, int slpflag, int slptimeo);
 int	vtruncbuf (struct vnode *vp, struct thread *td,
 		off_t length, int blksize);
+int	vfsync(struct vnode *vp, int waitfor, int passes, daddr_t lbn,
+		int (*checkdef)(struct buf *),
+		int (*waitoutput)(struct vnode *, struct thread *));
 void	vprint (char *label, struct vnode *vp);
 int	vrecycle (struct vnode *vp, struct thread *td);
 int	vn_close (struct vnode *vp, int flags, struct thread *td);
Index: vfs/gnu/ext2fs/ext2_inode.c
===================================================================
RCS file: /cvs/src/sys/vfs/gnu/ext2fs/ext2_inode.c,v
retrieving revision 1.7
diff -u -r1.7 ext2_inode.c
--- vfs/gnu/ext2fs/ext2_inode.c	8 Apr 2004 20:57:52 -0000	1.7
+++ vfs/gnu/ext2fs/ext2_inode.c	9 Apr 2005 02:51:17 -0000
@@ -341,8 +341,8 @@
 	for (i = 0; i < NDADDR; i++)
 		if (newblks[i] != oip->i_db[i])
 			panic("itrunc2");
-	if (length == 0 && (!TAILQ_EMPTY(&ovp->v_dirtyblkhd) ||
-			    !TAILQ_EMPTY(&ovp->v_cleanblkhd)))
+	if (length == 0 && (!RB_EMPTY(&ovp->v_rbdirty_tree) ||
+			    !RB_EMPTY(&ovp->v_rbclean_tree)))
 		panic("itrunc3");
 #endif /* DIAGNOSTIC */
 	/*
Index: vfs/gnu/ext2fs/ext2_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/gnu/ext2fs/ext2_vfsops.c,v
retrieving revision 1.26
diff -u -r1.26 ext2_vfsops.c
--- vfs/gnu/ext2fs/ext2_vfsops.c	2 Feb 2005 21:34:18 -0000	1.26
+++ vfs/gnu/ext2fs/ext2_vfsops.c	9 Apr 2005 02:51:17 -0000
@@ -919,7 +919,7 @@
 	if (vp->v_type == VNON ||
 	    ((ip->i_flag &
 	    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
-	    (TAILQ_EMPTY(&vp->v_dirtyblkhd) || info->waitfor == MNT_LAZY))) {
+	    (RB_EMPTY(&vp->v_rbdirty_tree) || info->waitfor == MNT_LAZY))) {
 		return(0);
 	}
 	if ((error = VOP_FSYNC(vp, info->waitfor, info->td)) != 0)
Index: vfs/gnu/ext2fs/ext2_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/gnu/ext2fs/ext2_vnops.c,v
retrieving revision 1.19
diff -u -r1.19 ext2_vnops.c
--- vfs/gnu/ext2fs/ext2_vnops.c	15 Feb 2005 08:32:18 -0000	1.19
+++ vfs/gnu/ext2fs/ext2_vnops.c	9 Apr 2005 02:51:18 -0000
@@ -176,13 +176,21 @@
  *	      struct proc *a_p)
  */
 /* ARGSUSED */
+
+static int ext2_fsync_bp(struct buf *bp, void *data);
+
+struct ext2_fsync_bp_info {
+	struct vnode *vp;
+	int waitfor;
+	int s;
+};
+
 static int
 ext2_fsync(struct vop_fsync_args *ap)
 {
+	struct ext2_fsync_bp_info info;
 	struct vnode *vp = ap->a_vp;
-	struct buf *bp;
-	struct buf *nbp;
-	int s;
+	int count;
 
 	/* 
 	 * XXX why is all this fs specific?
@@ -193,42 +201,55 @@
 	 */
 	ext2_discard_prealloc(VTOI(vp));
 
+	info.s = splbio();
+	info.vp = vp;
 loop:
-	s = splbio();
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-		nbp = TAILQ_NEXT(bp, b_vnbufs);
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
-			continue;
-		if ((bp->b_flags & B_DELWRI) == 0)
-			panic("ext2_fsync: not dirty");
-		bremfree(bp);
-		splx(s);
-		/*
-		 * Wait for I/O associated with indirect blocks to complete,
-		 * since there is no way to quickly wait for them below.
-		 */
-		if (bp->b_vp == vp || ap->a_waitfor == MNT_NOWAIT)
-			(void) bawrite(bp);
-		else
-			(void) bwrite(bp);
+	info.waitfor = ap->a_waitfor;
+	count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 
+			ext2_fsync_bp, &info);
+	if (count)
 		goto loop;
-	}
+
 	if (ap->a_waitfor == MNT_WAIT) {
 		while (vp->v_numoutput) {
 			vp->v_flag |= VBWAIT;
 			tsleep(&vp->v_numoutput, 0, "e2fsyn", 0);
 		}
 #if DIAGNOSTIC
-		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
 			vprint("ext2_fsync: dirty", vp);
 			goto loop;
 		}
 #endif
 	}
-	splx(s);
+	splx(info.s);
 	return (UFS_UPDATE(ap->a_vp, ap->a_waitfor == MNT_WAIT));
 }
 
+static int
+ext2_fsync_bp(struct buf *bp, void *data)
+{
+	struct ext2_fsync_bp_info *info = data;
+
+	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
+		return(0);
+	if ((bp->b_flags & B_DELWRI) == 0)
+		panic("ext2_fsync: not dirty");
+	bremfree(bp);
+	splx(info->s);
+
+	/*
+	 * Wait for I/O associated with indirect blocks to complete,
+	 * since there is no way to quickly wait for them below.
+	 */
+	if (bp->b_vp == info->vp || info->waitfor == MNT_NOWAIT)
+		(void) bawrite(bp);
+	else
+		(void) bwrite(bp);
+	info->s = splbio();
+	return(1);
+}
+
 /*
  * Mknod vnode call
  *
Index: vfs/hpfs/hpfs_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/hpfs/hpfs_vnops.c,v
retrieving revision 1.22
diff -u -r1.22 hpfs_vnops.c
--- vfs/hpfs/hpfs_vnops.c	15 Feb 2005 08:32:18 -0000	1.22
+++ vfs/hpfs/hpfs_vnops.c	9 Apr 2005 02:51:18 -0000
@@ -117,36 +117,20 @@
 hpfs_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
-	int s;
-	struct buf *bp, *nbp;
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
+#ifdef DIAGNOSTIC
 loop:
-	s = splbio();
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-		nbp = TAILQ_NEXT(bp, b_vnbufs);
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
-			continue;
-		if ((bp->b_flags & B_DELWRI) == 0)
-			panic("hpfs_fsync: not dirty");
-		bremfree(bp);
-		splx(s);
-		(void) bwrite(bp);
-		goto loop;
-	}
-	while (vp->v_numoutput) {
-		vp->v_flag |= VBWAIT;
-		(void) tsleep((caddr_t)&vp->v_numoutput, 0, "hpfsn", 0);
-	}
+#endif
+	vfsync(vp, ap->a_waitfor, 0, (daddr_t)-1, NULL, NULL);
 #ifdef DIAGNOSTIC
-	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+	if (ap->a_waitfor == MNT_WAIT && !RB_EMPTY(&vp->v_rbdirty_tree)) {
 		vprint("hpfs_fsync: dirty", vp);
 		goto loop;
 	}
 #endif
-	splx(s);
 
 	/*
 	 * Write out the on-disc version of the vnode.
Index: vfs/msdosfs/msdosfs_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/msdosfs/msdosfs_vfsops.c,v
retrieving revision 1.24
diff -u -r1.24 msdosfs_vfsops.c
--- vfs/msdosfs/msdosfs_vfsops.c	2 Feb 2005 21:34:18 -0000	1.24
+++ vfs/msdosfs/msdosfs_vfsops.c	9 Apr 2005 02:51:18 -0000
@@ -713,8 +713,8 @@
 		    TAILQ_NEXT(vp, v_freelist), TAILQ_PREV(vp, v_freelist),
 		    vp->v_mount);
 		printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n",
-		    TAILQ_FIRST(&vp->v_cleanblkhd),
-		    TAILQ_FIRST(&vp->v_dirtyblkhd),
+		    RB_EMPTY(&vp->v_rbclean_tree),
+		    RB_EMPTY(&vp->v_rbdirty_tree),
 		    vp->v_numoutput, vp->v_type);
 		printf("union %p, tag %d, data[0] %08x, data[1] %08x\n",
 		    vp->v_socket, vp->v_tag,
@@ -832,7 +832,7 @@
 	if (vp->v_type == VNON ||
 	    ((dep->de_flag &
 	    (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 &&
-	    (TAILQ_EMPTY(&vp->v_dirtyblkhd) || info->waitfor == MNT_LAZY))) {
+	    (RB_EMPTY(&vp->v_rbdirty_tree) || info->waitfor == MNT_LAZY))) {
 		return(0);
 	}
 	if ((error = VOP_FSYNC(vp, info->waitfor, info->td)) != 0)
Index: vfs/msdosfs/msdosfs_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/msdosfs/msdosfs_vnops.c,v
retrieving revision 1.23
diff -u -r1.23 msdosfs_vnops.c
--- vfs/msdosfs/msdosfs_vnops.c	15 Feb 2005 08:32:18 -0000	1.23
+++ vfs/msdosfs/msdosfs_vnops.c	9 Apr 2005 02:51:18 -0000
@@ -830,36 +830,20 @@
 msdosfs_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
-	int s;
-	struct buf *bp, *nbp;
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
+#ifdef DIAGNOSTIC
 loop:
-	s = splbio();
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-		nbp = TAILQ_NEXT(bp, b_vnbufs);
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
-			continue;
-		if ((bp->b_flags & B_DELWRI) == 0)
-			panic("msdosfs_fsync: not dirty");
-		bremfree(bp);
-		splx(s);
-		(void) bwrite(bp);
-		goto loop;
-	}
-	while (vp->v_numoutput) {
-		vp->v_flag |= VBWAIT;
-		(void) tsleep((caddr_t)&vp->v_numoutput, 0, "msdosfsn", 0);
-	}
+#endif
+	vfsync(vp, ap->a_waitfor, 0, (daddr_t)-1, NULL, NULL);
 #ifdef DIAGNOSTIC
-	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+	if (ap->a_waitfor == MNT_WAIT && !RB_EMPTY(&vp->v_rbdirty_tree)) {
 		vprint("msdosfs_fsync: dirty", vp);
 		goto loop;
 	}
 #endif
-	splx(s);
 	return (deupdat(VTODE(vp), ap->a_waitfor == MNT_WAIT));
 }
 
Index: vfs/nfs/nfs_bio.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_bio.c,v
retrieving revision 1.21
diff -u -r1.21 nfs_bio.c
--- vfs/nfs/nfs_bio.c	17 Mar 2005 17:28:46 -0000	1.21
+++ vfs/nfs/nfs_bio.c	9 Apr 2005 02:51:18 -0000
@@ -1290,10 +1290,6 @@
 				goto again;
 			}
 		}
-
-		if ((bp->b_flags & B_READ) == 0)
-			bp->b_flags |= B_WRITEINPROG;
-
 		BUF_KERNPROC(bp);
 		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
 		nmp->nm_bufqlen++;
@@ -1449,10 +1445,8 @@
 		    off_t off;
 
 		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
-		    bp->b_flags |= B_WRITEINPROG;
 		    retv = nfs_commit(bp->b_vp, off, 
 				bp->b_dirtyend - bp->b_dirtyoff, td);
-		    bp->b_flags &= ~B_WRITEINPROG;
 		    if (retv == 0) {
 			    bp->b_dirtyoff = bp->b_dirtyend = 0;
 			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
@@ -1486,7 +1480,6 @@
 		else
 		    iomode = NFSV3WRITE_FILESYNC;
 
-		bp->b_flags |= B_WRITEINPROG;
 		error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
 
 		/*
@@ -1510,7 +1503,6 @@
 		} else {
 		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 		}
-		bp->b_flags &= ~B_WRITEINPROG;
 
 		/*
 		 * For an interrupted write, the buffer is still valid
Index: vfs/nfs/nfs_nqlease.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_nqlease.c,v
retrieving revision 1.23
diff -u -r1.23 nfs_nqlease.c
--- vfs/nfs/nfs_nqlease.c	27 Mar 2005 23:51:42 -0000	1.23
+++ vfs/nfs/nfs_nqlease.c	9 Apr 2005 02:51:18 -0000
@@ -1078,7 +1078,7 @@
 			} else if ((np->n_expiry - NQ_RENEWAL) < time_second) {
 			    if ((np->n_flag & (NQNFSWRITE | NQNFSNONCACHE))
 				 == NQNFSWRITE &&
-				 !TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+				 !RB_EMPTY(&vp->v_rbdirty_tree) &&
 				 vget(vp, LK_EXCLUSIVE, td) == 0) {
 				 nmp->nm_inprog = vp;
 				 if (vpid == vp->v_id &&
Index: vfs/nfs/nfs_subs.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_subs.c,v
retrieving revision 1.27
diff -u -r1.27 nfs_subs.c
--- vfs/nfs/nfs_subs.c	17 Mar 2005 17:28:46 -0000	1.27
+++ vfs/nfs/nfs_subs.c	9 Apr 2005 02:51:18 -0000
@@ -2062,11 +2062,13 @@
  * B_CLUSTEROK must be cleared along with B_NEEDCOMMIT because stage 1 data 
  * writes are not clusterable.
  */
+
+static int nfs_clearcommit_bp(struct buf *bp, void *data __unused);
+
 void
 nfs_clearcommit(struct mount *mp)
 {
 	struct vnode *vp, *nvp;
-	struct buf *bp, *nbp;
 	lwkt_tokref ilock;
 	int s;
 
@@ -2076,19 +2078,24 @@
 		nvp = TAILQ_NEXT(vp, v_nmntvnodes);	/* ZZZ */
 		if (vp->v_flag & VPLACEMARKER)
 			continue;
-		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-			if (BUF_REFCNT(bp) == 0 &&
-			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
-			     == (B_DELWRI | B_NEEDCOMMIT)) {
-				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
-			}
-		}
+		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
+			nfs_clearcommit_bp, NULL);
 	}
 	splx(s);
 	lwkt_reltoken(&ilock);
 }
 
+static int
+nfs_clearcommit_bp(struct buf *bp, void *data __unused)
+{
+	if (BUF_REFCNT(bp) == 0 &&
+	    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
+	     == (B_DELWRI | B_NEEDCOMMIT)) {
+		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
+	}
+	return(0);
+}
+
 #ifndef NFS_NOSERVER
 /*
  * Map errnos to NFS error numbers. For Version 3 also filter out error
Index: vfs/nfs/nfs_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_vfsops.c,v
retrieving revision 1.25
diff -u -r1.25 nfs_vfsops.c
--- vfs/nfs/nfs_vfsops.c	17 Mar 2005 17:28:46 -0000	1.25
+++ vfs/nfs/nfs_vfsops.c	9 Apr 2005 02:51:18 -0000
@@ -1144,7 +1144,7 @@
 {
     struct scaninfo *info = data;
 
-    if (VOP_ISLOCKED(vp, NULL) || TAILQ_EMPTY(&vp->v_dirtyblkhd))
+    if (VOP_ISLOCKED(vp, NULL) || RB_EMPTY(&vp->v_rbdirty_tree))
 	return(-1);
     if (info->waitfor == MNT_LAZY)
 	return(-1);
Index: vfs/nfs/nfs_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_vnops.c,v
retrieving revision 1.39
diff -u -r1.39 nfs_vnops.c
--- vfs/nfs/nfs_vnops.c	6 Apr 2005 18:39:53 -0000	1.39
+++ vfs/nfs/nfs_vnops.c	9 Apr 2005 19:32:53 -0000
@@ -2875,140 +2875,266 @@
 }
 
 /*
- * Flush all the blocks associated with a vnode.
- * 	Walk through the buffer pool and push any dirty pages
- *	associated with the vnode.
+ * Flush all the blocks associated with a vnode.   Dirty NFS buffers may be
+ * in one of two states:  If B_NEEDCOMMIT is clear then the buffer contains
+ * new NFS data which needs to be written to the server.  If B_NEEDCOMMIT is
+ * set the buffer contains data that has already been written to the server
+ * and which now needs a commit RPC.
+ *
+ * If commit is 0 we only take one pass and only flush buffers containing new
+ * dirty data.
+ *
+ * If commit is 1 we take two passes, issuing a commit RPC in the second
+ * pass.
+ *
+ * If waitfor is MNT_WAIT and commit is 1, we loop as many times as required
+ * to completely flush all pending data.
+ *
+ * Note that the RB_SCAN code properly handles the case where the
+ * callback might block and directly or indirectly (another thread) cause
+ * the RB tree to change.
  */
+
+#ifndef NFS_COMMITBVECSIZ
+#define NFS_COMMITBVECSIZ	16
+#endif
+
+struct nfs_flush_info {
+	enum { NFI_FLUSHNEW, NFI_COMMIT } mode;
+	struct thread *td;
+	struct vnode *vp;
+	int waitfor;
+	int slpflag;
+	int slptimeo;
+	int loops;
+	struct buf *bvary[NFS_COMMITBVECSIZ];
+	int bvsize;
+	off_t beg_off;
+	off_t end_off;
+};
+
+static int nfs_flush_bp(struct buf *bp, void *data);
+static int nfs_flush_docommit(struct nfs_flush_info *info, int error);
+
 int
 nfs_flush(struct vnode *vp, int waitfor, struct thread *td, int commit)
 {
 	struct nfsnode *np = VTONFS(vp);
-	struct buf *bp;
-	int i;
-	struct buf *nbp;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
-	int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
-	int passone = 1;
-	u_quad_t off, endoff, toff;
-	struct buf **bvec = NULL;
-#ifndef NFS_COMMITBVECSIZ
-#define NFS_COMMITBVECSIZ	20
-#endif
-	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
-	int bvecsize = 0, bveccount;
+	struct nfs_flush_info info;
+	int error;
 
-	if (nmp->nm_flag & NFSMNT_INT)
-		slpflag = PCATCH;
-	if (!commit)
-		passone = 0;
-	/*
-	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
-	 * server, but nas not been committed to stable storage on the server
-	 * yet. On the first pass, the byte range is worked out and the commit
-	 * rpc is done. On the second pass, nfs_writebp() is called to do the
-	 * job.
-	 */
-again:
-	off = (u_quad_t)-1;
-	endoff = 0;
-	bvecpos = 0;
-	if (NFS_ISV3(vp) && commit) {
-		s = splbio();
+	bzero(&info, sizeof(info));
+	info.td = td;
+	info.vp = vp;
+	info.waitfor = waitfor;
+	info.slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
+	info.loops = 0;
+
+	do {
 		/*
-		 * Count up how many buffers waiting for a commit.
+		 * Flush mode
 		 */
-		bveccount = 0;
-		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-			if (BUF_REFCNT(bp) == 0 &&
-			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
-				== (B_DELWRI | B_NEEDCOMMIT))
-				bveccount++;
+		info.mode = NFI_FLUSHNEW;
+		error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 
+				nfs_flush_bp, &info);
+
+		/*
+		 * Take a second pass if committing and no error occurred.
+		 * Clean up any leftover collection (whether an error
+		 * occurs or not).
+		 */
+		if (commit && error == 0) {
+			info.mode = NFI_COMMIT;
+			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 
+					nfs_flush_bp, &info);
+			if (info.bvsize)
+				error = nfs_flush_docommit(&info, error);
+		}
+
+		/*
+		 * Wait for pending I/O to complete before checking whether
+		 * any further dirty buffers exist.
+		 */
+		while (waitfor == MNT_WAIT && vp->v_numoutput) {
+			vp->v_flag |= VBWAIT;
+			error = tsleep((caddr_t)&vp->v_numoutput,
+				info.slpflag, "nfsfsync", info.slptimeo);
+			if (error) {
+				/*
+				 * We have to be able to break out if this 
+				 * is an 'intr' mount.
+				 */
+				if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
+					error = -EINTR;
+					break;
+				}
+
+				/*
+				 * Since we do not process pending signals,
+				 * once we get a PCATCH our tsleep() will no
+				 * longer sleep, switch to a fixed timeout
+				 * instead.
+				 */
+				if (info.slpflag == PCATCH) {
+					info.slpflag = 0;
+					info.slptimeo = 2 * hz;
+				}
+				error = 0;
+			}
 		}
+		++info.loops;
 		/*
-		 * Allocate space to remember the list of bufs to commit.  It is
-		 * important to use M_NOWAIT here to avoid a race with nfs_write.
-		 * If we can't get memory (for whatever reason), we will end up
-		 * committing the buffers one-by-one in the loop below.
+		 * Loop if we are flushing synchronously as well as
+		 * committing, and dirty buffers are still present.
+		 * Otherwise we might livelock.
 		 */
-		if (bvec != NULL && bvec != bvec_on_stack)
-			free(bvec, M_TEMP);
-		if (bveccount > NFS_COMMITBVECSIZ) {
-			bvec = (struct buf **)
-				malloc(bveccount * sizeof(struct buf *),
-				       M_TEMP, M_NOWAIT);
-			if (bvec == NULL) {
-				bvec = bvec_on_stack;
-				bvecsize = NFS_COMMITBVECSIZ;
-			} else
-				bvecsize = bveccount;
+	} while (waitfor == MNT_WAIT && commit && 
+		 error == 0 && !RB_EMPTY(&vp->v_rbdirty_tree));
+
+	/*
+	 * The callbacks have to return a negative error to terminate the
+	 * RB scan.
+	 */
+	if (error < 0)
+		error = -error;
+
+	/*
+	 * Deal with any error collection
+	 */
+	if (np->n_flag & NWRITEERR) {
+		error = np->n_error;
+		np->n_flag &= ~NWRITEERR;
+	}
+	return (error);
+}
+
+
+static
+int
+nfs_flush_bp(struct buf *bp, void *data)
+{
+	struct nfs_flush_info *info = data;
+	off_t toff;
+	int error;
+	int s;
+
+	error = 0;
+	switch(info->mode) {
+	case NFI_FLUSHNEW:
+		s = splbio();
+		if (info->loops && info->waitfor == MNT_WAIT) {
+			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT);
+			if (error) {
+				error = BUF_TIMELOCK(bp,
+						LK_EXCLUSIVE | LK_SLEEPFAIL,
+						"nfsfsync",
+						info->slpflag, info->slptimeo);
+			}
 		} else {
-			bvec = bvec_on_stack;
-			bvecsize = NFS_COMMITBVECSIZ;
+			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT);
 		}
-		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-			if (bvecpos >= bvecsize)
+		if (error == 0) {
+			if ((bp->b_flags & B_DELWRI) == 0)
+				panic("nfs_fsync: not dirty");
+			if (bp->b_flags & B_NEEDCOMMIT) {
+				BUF_UNLOCK(bp);
+				splx(s);
 				break;
-			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
-			    (B_DELWRI | B_NEEDCOMMIT) ||
-			    BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
-				continue;
+			}
 			bremfree(bp);
-			/*
-			 * NOTE: we are not clearing B_DONE here, so we have
-			 * to do it later on in this routine if we intend to 
-			 * initiate I/O on the bp.
-			 *
-			 * Note: to avoid loopback deadlocks, we do not
-			 * assign b_runningbufspace.
-			 */
-			bp->b_flags |= B_WRITEINPROG;
-			vfs_busy_pages(bp, 1);
-
-			/*
-			 * bp is protected by being locked, but nbp is not
-			 * and vfs_busy_pages() may sleep.  We have to
-			 * recalculate nbp.
-			 */
-			nbp = TAILQ_NEXT(bp, b_vnbufs);
-
-			/*
-			 * A list of these buffers is kept so that the
-			 * second loop knows which buffers have actually
-			 * been committed. This is necessary, since there
-			 * may be a race between the commit rpc and new
-			 * uncommitted writes on the file.
-			 */
-			bvec[bvecpos++] = bp;
-			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
-				bp->b_dirtyoff;
-			if (toff < off)
-				off = toff;
-			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
-			if (toff > endoff)
-				endoff = toff;
+
+			bp->b_flags |= B_ASYNC;
+			splx(s);
+			VOP_BWRITE(bp->b_vp, bp);
+		} else {
+			splx(s);
+			error = 0;
+		}
+		break;
+	case NFI_COMMIT:
+		/*
+		 * Only process buffers in need of a commit which we can
+		 * immediately lock.  This may prevent a buffer from being
+		 * committed, but the normal flush loop will block on the
+		 * same buffer so we shouldn't get into an endless loop.
+		 */
+		s = splbio();
+		if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
+		    (B_DELWRI | B_NEEDCOMMIT) ||
+		    BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+			splx(s);
+			break;
+		}
+
+		bremfree(bp);
+
+		/*
+		 * NOTE: we are not clearing B_DONE here, so we have
+		 * to do it later on in this routine if we intend to 
+		 * initiate I/O on the bp.
+		 *
+		 * Note: to avoid loopback deadlocks, we do not
+		 * assign b_runningbufspace.
+		 */
+		vfs_busy_pages(bp, 1);
+
+		info->bvary[info->bvsize] = bp;
+		toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
+			bp->b_dirtyoff;
+		if (info->bvsize == 0 || toff < info->beg_off)
+			info->beg_off = toff;
+		toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
+		if (info->bvsize == 0 || toff > info->end_off)
+			info->end_off = toff;
+		++info->bvsize;
+		if (info->bvsize == NFS_COMMITBVECSIZ) {
+			error = nfs_flush_docommit(info, 0);
+			KKASSERT(info->bvsize == 0);
 		}
 		splx(s);
 	}
-	if (bvecpos > 0) {
+	return (error);
+}
+
+static
+int
+nfs_flush_docommit(struct nfs_flush_info *info, int error)
+{
+	struct vnode *vp;
+	struct buf *bp;
+	off_t bytes;
+	int retv;
+	int i;
+	int s;
+
+	vp = info->vp;
+
+	if (info->bvsize > 0) {
 		/*
 		 * Commit data on the server, as required.  Note that
 		 * nfs_commit will use the vnode's cred for the commit.
+		 * The NFSv3 commit RPC is limited to a 32 bit byte count.
 		 */
-		retv = nfs_commit(vp, off, (int)(endoff - off), td);
-
-		if (retv == NFSERR_STALEWRITEVERF)
-			nfs_clearcommit(vp->v_mount);
+		bytes = info->end_off - info->beg_off;
+		if (bytes > 0x40000000)
+			bytes = 0x40000000;
+		if (error) {
+			retv = -error;
+		} else {
+			retv = nfs_commit(vp, info->beg_off, 
+					    (int)bytes, info->td);
+			if (retv == NFSERR_STALEWRITEVERF)
+				nfs_clearcommit(vp->v_mount);
+		}
 
 		/*
 		 * Now, either mark the blocks I/O done or mark the
 		 * blocks dirty, depending on whether the commit
 		 * succeeded.
 		 */
-		for (i = 0; i < bvecpos; i++) {
-			bp = bvec[i];
-			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
+		for (i = 0; i < info->bvsize; ++i) {
+			bp = info->bvary[i];
+			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
@@ -3033,82 +3159,8 @@
 				biodone(bp);
 			}
 		}
+		info->bvsize = 0;
 	}
-
-	/*
-	 * Start/do any write(s) that are required.
-	 */
-loop:
-	s = splbio();
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-		nbp = TAILQ_NEXT(bp, b_vnbufs);
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
-			if (waitfor != MNT_WAIT || passone)
-				continue;
-			error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
-			    "nfsfsync", slpflag, slptimeo);
-			splx(s);
-			if (error == 0)
-				panic("nfs_fsync: inconsistent lock");
-			if (error == ENOLCK)
-				goto loop;
-			if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
-				error = EINTR;
-				goto done;
-			}
-			if (slpflag == PCATCH) {
-				slpflag = 0;
-				slptimeo = 2 * hz;
-			}
-			goto loop;
-		}
-		if ((bp->b_flags & B_DELWRI) == 0)
-			panic("nfs_fsync: not dirty");
-		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
-			BUF_UNLOCK(bp);
-			continue;
-		}
-		bremfree(bp);
-		if (passone || !commit)
-		    bp->b_flags |= B_ASYNC;
-		else
-		    bp->b_flags |= B_ASYNC | B_WRITEINPROG;
-		splx(s);
-		VOP_BWRITE(bp->b_vp, bp);
-		goto loop;
-	}
-	splx(s);
-	if (passone) {
-		passone = 0;
-		goto again;
-	}
-	if (waitfor == MNT_WAIT) {
-		while (vp->v_numoutput) {
-			vp->v_flag |= VBWAIT;
-			error = tsleep((caddr_t)&vp->v_numoutput,
-				slpflag, "nfsfsync", slptimeo);
-			if (error) {
-			    if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
-				error = EINTR;
-				goto done;
-			    }
-			    if (slpflag == PCATCH) {
-				slpflag = 0;
-				slptimeo = 2 * hz;
-			    }
-			}
-		}
-		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) {
-			goto loop;
-		}
-	}
-	if (np->n_flag & NWRITEERR) {
-		error = np->n_error;
-		np->n_flag &= ~NWRITEERR;
-	}
-done:
-	if (bvec != NULL && bvec != bvec_on_stack)
-		free(bvec, M_TEMP);
 	return (error);
 }
 
@@ -3165,9 +3217,8 @@
 }
 
 /*
- * This is a clone of vn_bwrite(), except that B_WRITEINPROG isn't set unless
- * the force flag is one and it also handles the B_NEEDCOMMIT flag.  We set
- * B_CACHE if this is a VMIO buffer.
+ * This is a clone of vn_bwrite(), except that it also handles the
+ * B_NEEDCOMMIT flag.  We set B_CACHE if this is a VMIO buffer.
  */
 int
 nfs_writebp(struct buf *bp, int force, struct thread *td)
@@ -3206,8 +3257,6 @@
 	 */
 	vfs_busy_pages(bp, 1);
 
-	if (force)
-		bp->b_flags |= B_WRITEINPROG;
 	BUF_KERNPROC(bp);
 	VOP_STRATEGY(bp->b_vp, bp);
 
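    The nfs_flush() rework above illustrates the callback convention the
    new RB_SCAN() interface relies on: a callback returns 0 or a positive
    value to keep the scan going and a negative value to abort it, and the
    caller then negates the result back into an ordinary errno.  Positive
    returns also appear to feed the count RB_SCAN() hands back (the
    softdep_setup_freeblocks() change later in this patch loops while that
    count stays positive).  Below is a small user-space sketch of that
    contract; demo_scan() and demo_flush_bp() are stand-ins for purposes
    of illustration, not the kernel RB_SCAN() API.

/*
 * Illustrative sketch only.  demo_scan() walks an array of fake buffers
 * the way RB_SCAN() walks v_rbdirty_tree, stopping as soon as a callback
 * returns a negative value.  The caller flips the sign to recover a
 * positive errno, mirroring "if (error < 0) error = -error;" above.
 */
#include <errno.h>
#include <stdio.h>

struct demo_buf {
	int	dirty;
	int	fail;		/* pretend the write fails with this errno */
};

typedef int (*demo_cb_t)(struct demo_buf *bp, void *data);

static int
demo_scan(struct demo_buf *array, int n, demo_cb_t callback, void *data)
{
	int i, r;

	for (i = 0; i < n; ++i) {
		r = callback(&array[i], data);
		if (r < 0)
			return (r);	/* negative return aborts the scan */
	}
	return (0);
}

static int
demo_flush_bp(struct demo_buf *bp, void *data)
{
	int *flushed = data;

	if (bp->dirty == 0)
		return (0);		/* nothing to do, keep scanning */
	if (bp->fail)
		return (-bp->fail);	/* hand the errno back, negated */
	bp->dirty = 0;
	++*flushed;
	return (1);
}

int
main(void)
{
	struct demo_buf bufs[3] = { { 1, 0 }, { 1, EIO }, { 1, 0 } };
	int flushed = 0;
	int error;

	error = demo_scan(bufs, 3, demo_flush_bp, &flushed);
	if (error < 0)
		error = -error;		/* recover the positive errno */
	printf("flushed %d, error %d\n", flushed, error);
	return (0);
}
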
Index: vfs/nwfs/nwfs_io.c
===================================================================
RCS file: /cvs/src/sys/vfs/nwfs/nwfs_io.c,v
retrieving revision 1.14
diff -u -r1.14 nwfs_io.c
--- vfs/nwfs/nwfs_io.c	17 Feb 2005 14:00:10 -0000	1.14
+++ vfs/nwfs/nwfs_io.c	9 Apr 2005 02:51:18 -0000
@@ -315,9 +315,7 @@
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
 		uiop->uio_rw = UIO_WRITE;
-		bp->b_flags |= B_WRITEINPROG;
 		error = ncp_write(NWFSTOCONN(nmp), &np->n_fh, uiop, cr);
-		bp->b_flags &= ~B_WRITEINPROG;
 
 		/*
 		 * For an interrupted write, the buffer is still valid
Index: vfs/nwfs/nwfs_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/nwfs/nwfs_vfsops.c,v
retrieving revision 1.16
diff -u -r1.16 nwfs_vfsops.c
--- vfs/nwfs/nwfs_vfsops.c	12 Feb 2005 01:31:38 -0000	1.16
+++ vfs/nwfs/nwfs_vfsops.c	9 Apr 2005 02:51:18 -0000
@@ -490,7 +490,7 @@
 		 */
 		if (vp->v_mount != mp)
 			goto loop;
-		if (VOP_ISLOCKED(vp, NULL) || TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
+		if (VOP_ISLOCKED(vp, NULL) || RB_EMPTY(&vp->v_rbdirty_tree) ||
 		    waitfor == MNT_LAZY)
 			continue;
 		if (vget(vp, LK_EXCLUSIVE, td))
Index: vfs/smbfs/smbfs_io.c
===================================================================
RCS file: /cvs/src/sys/vfs/smbfs/smbfs_io.c,v
retrieving revision 1.15
diff -u -r1.15 smbfs_io.c
--- vfs/smbfs/smbfs_io.c	8 Jan 2005 18:57:48 -0000	1.15
+++ vfs/smbfs/smbfs_io.c	9 Apr 2005 02:51:18 -0000
@@ -343,9 +343,7 @@
 		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
 		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
 		uiop->uio_rw = UIO_WRITE;
-		bp->b_flags |= B_WRITEINPROG;
 		error = smb_write(smp->sm_share, np->n_fid, uiop, &scred);
-		bp->b_flags &= ~B_WRITEINPROG;
 
 		/*
 		 * For an interrupted write, the buffer is still valid
Index: vfs/smbfs/smbfs_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/smbfs/smbfs_vfsops.c,v
retrieving revision 1.18
diff -u -r1.18 smbfs_vfsops.c
--- vfs/smbfs/smbfs_vfsops.c	12 Feb 2005 01:32:49 -0000	1.18
+++ vfs/smbfs/smbfs_vfsops.c	9 Apr 2005 02:51:18 -0000
@@ -439,7 +439,7 @@
 		 */
 		if (vp->v_mount != mp)
 			goto loop;
-		if (VOP_ISLOCKED(vp, NULL) || TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
+		if (VOP_ISLOCKED(vp, NULL) || RB_EMPTY(&vp->v_rbdirty_tree) ||
 		    waitfor == MNT_LAZY)
 			continue;
 		if (vget(vp, LK_EXCLUSIVE, td))
Index: vfs/specfs/spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/specfs/spec_vnops.c,v
retrieving revision 1.23
diff -u -r1.23 spec_vnops.c
--- vfs/specfs/spec_vnops.c	15 Feb 2005 08:32:18 -0000	1.23
+++ vfs/specfs/spec_vnops.c	9 Apr 2005 02:51:18 -0000
@@ -409,72 +409,16 @@
 spec_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
-	struct buf *bp;
-	struct buf *nbp;
-	int s;
-	int maxretry = 10000;	/* large, arbitrarily chosen */
+	int error;
 
 	if (!vn_isdisk(vp, NULL))
 		return (0);
 
-loop1:
-	/*
-	 * MARK/SCAN initialization to avoid infinite loops
-	 */
-	s = splbio();
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp;
-	     bp = TAILQ_NEXT(bp, b_vnbufs)) {
-		bp->b_flags &= ~B_SCANNED;
-	}
-	splx(s);
-
 	/*
 	 * Flush all dirty buffers associated with a block device.
 	 */
-loop2:
-	s = splbio();
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-		nbp = TAILQ_NEXT(bp, b_vnbufs);
-		if ((bp->b_flags & B_SCANNED) != 0)
-			continue;
-		bp->b_flags |= B_SCANNED;
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
-			continue;
-		if ((bp->b_flags & B_DELWRI) == 0)
-			panic("spec_fsync: not dirty");
-		if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
-			BUF_UNLOCK(bp);
-			vfs_bio_awrite(bp);
-			splx(s);
-		} else {
-			bremfree(bp);
-			splx(s);
-			bawrite(bp);
-		}
-		goto loop2;
-	}
-
-	/*
-	 * If synchronous the caller expects us to completely resolve all
-	 * dirty buffers in the system.  Wait for in-progress I/O to
-	 * complete (which could include background bitmap writes), then
-	 * retry if dirty blocks still exist.
-	 */
-	if (ap->a_waitfor == MNT_WAIT) {
-		while (vp->v_numoutput) {
-			vp->v_flag |= VBWAIT;
-			(void) tsleep((caddr_t)&vp->v_numoutput, 0, "spfsyn", 0);
-		}
-		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
-			if (--maxretry != 0) {
-				splx(s);
-				goto loop1;
-			}
-			vprint("spec_fsync: giving up on dirty", vp);
-		}
-	}
-	splx(s);
-	return (0);
+	error = vfsync(vp, ap->a_waitfor, 10000, (daddr_t)-1, NULL, NULL);
+	return (error);
 }
 
 /*
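
    Both spec_fsync() above and ffs_fsync() further down now delegate
    their hand-rolled flush loops to a new vfsync() helper (added
    elsewhere in this patch set, not shown in this mail).  The prototype
    below is only inferred from the two call sites, so treat the
    parameter names as guesses rather than the committed interface.

/*
 * Inferred shape of the new helper, reconstructed from the spec_fsync()
 * and ffs_fsync() call sites in this patch.  Parameter names are guesses.
 */
int	vfsync(struct vnode *vp, int waitfor, int passes, daddr_t lbn,
	       int (*checkdef)(struct buf *bp),
	       int (*waitoutput)(struct vnode *vp, struct thread *td));

/*
 * spec_fsync(): flush everything on the disk vnode, allow up to 10000
 * retry passes, no truncation point, no per-buffer or wait callbacks.
 *
 *	error = vfsync(vp, ap->a_waitfor, 10000, (daddr_t)-1, NULL, NULL);
 *
 * ffs_fsync(): limit retries to NIADDR + 1 passes, drop buffers at or
 * beyond the truncation point lbn, defer buffers with outstanding
 * dependencies via ffs_checkdeferred(), and run softdep_sync_metadata()
 * while waiting for output to drain.
 *
 *	error = vfsync(vp, ap->a_waitfor, NIADDR + 1, lbn,
 *		       ffs_checkdeferred, softdep_sync_metadata);
 */
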
Index: vfs/ufs/ffs_extern.h
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_extern.h,v
retrieving revision 1.8
diff -u -r1.8 ffs_extern.h
--- vfs/ufs/ffs_extern.h	17 Aug 2004 18:57:36 -0000	1.8
+++ vfs/ufs/ffs_extern.h	9 Apr 2005 02:51:18 -0000
@@ -121,6 +121,6 @@
 void	softdep_setup_allocindir_page(struct inode *, ufs_lbn_t,
 	    struct buf *, int, ufs_daddr_t, ufs_daddr_t, struct buf *);
 void	softdep_fsync_mountdev(struct vnode *);
-int	softdep_sync_metadata(struct vop_fsync_args *);
+int	softdep_sync_metadata(struct vnode *, struct thread *);
 
 #endif /* !_UFS_FFS_EXTERN_H */
Index: vfs/ufs/ffs_inode.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_inode.c,v
retrieving revision 1.13
diff -u -r1.13 ffs_inode.c
--- vfs/ufs/ffs_inode.c	8 Mar 2005 17:47:04 -0000	1.13
+++ vfs/ufs/ffs_inode.c	9 Apr 2005 02:51:18 -0000
@@ -391,8 +391,8 @@
 		if (newblks[i] != oip->i_db[i])
 			panic("ffs_truncate2");
 	if (length == 0 &&
-	    (!TAILQ_EMPTY(&ovp->v_dirtyblkhd) ||
-	     !TAILQ_EMPTY(&ovp->v_cleanblkhd)))
+	    (!RB_EMPTY(&ovp->v_rbdirty_tree) ||
+	     !RB_EMPTY(&ovp->v_rbclean_tree)))
 		panic("ffs_truncate3");
 #endif /* DIAGNOSTIC */
 	/*
Index: vfs/ufs/ffs_rawread.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_rawread.c,v
retrieving revision 1.9
diff -u -r1.9 ffs_rawread.c
--- vfs/ufs/ffs_rawread.c	12 Oct 2004 19:21:12 -0000	1.9
+++ vfs/ufs/ffs_rawread.c	9 Apr 2005 02:51:18 -0000
@@ -98,7 +98,7 @@
 	/* Check for dirty mmap, pending writes and dirty buffers */
 	spl = splbio();
 	if (vp->v_numoutput > 0 ||
-	    !TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
+	    !RB_EMPTY(&vp->v_rbdirty_tree) ||
 	    (vp->v_flag & VOBJDIRTY) != 0) {
 		splx(spl);
 
@@ -130,7 +130,7 @@
 			}
 		}
 		/* Flush dirty buffers */
-		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
+		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
 			splx(spl);
 			if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0) {
 				if (upgraded != 0)
@@ -139,7 +139,7 @@
 			}
 			spl = splbio();
 			if (vp->v_numoutput > 0 ||
-			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
+			    !RB_EMPTY(&vp->v_rbdirty_tree))
 				panic("ffs_rawread_sync: dirty bufs");
 		}
 		splx(spl);
Index: vfs/ufs/ffs_softdep.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_softdep.c,v
retrieving revision 1.21
diff -u -r1.21 ffs_softdep.c
--- vfs/ufs/ffs_softdep.c	2 Feb 2005 21:34:19 -0000	1.21
+++ vfs/ufs/ffs_softdep.c	9 Apr 2005 02:51:18 -0000
@@ -1734,11 +1734,19 @@
  * later release and zero the inode so that the calling routine
  * can release it.
  */
+struct softdep_setup_freeblocks_info {
+	struct fs *fs;
+	struct inode *ip;
+};
+
+static int softdep_setup_freeblocks_bp(struct buf *bp, void *data);
+
 void
 softdep_setup_freeblocks(ip, length)
 	struct inode *ip;	/* The inode whose length is to be reduced */
 	off_t length;		/* The new length for the file */
 {
+	struct softdep_setup_freeblocks_info info;
 	struct freeblks *freeblks;
 	struct inodedep *inodedep;
 	struct allocdirect *adp;
@@ -1746,6 +1754,7 @@
 	struct buf *bp;
 	struct fs *fs;
 	int i, error, delay;
+	int count;
 
 	fs = ip->i_fs;
 	if (length != 0)
@@ -1822,15 +1831,13 @@
 	vp = ITOV(ip);
 	ACQUIRE_LOCK(&lk);
 	drain_output(vp, 1);
-	while (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) {
-		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
-		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
-		deallocate_dependencies(bp, inodedep);
-		bp->b_flags |= B_INVAL | B_NOCACHE;
-		FREE_LOCK(&lk);
-		brelse(bp);
-		ACQUIRE_LOCK(&lk);
-	}
+
+	info.fs = fs;
+	info.ip = ip;
+	do {
+		count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 
+				softdep_setup_freeblocks_bp, &info);
+	} while (count > 0);
 	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
 		(void)free_inodedep(inodedep);
 	FREE_LOCK(&lk);
@@ -1843,6 +1850,23 @@
 		handle_workitem_freeblocks(freeblks);
 }
 
+static int
+softdep_setup_freeblocks_bp(struct buf *bp, void *data)
+{
+	struct softdep_setup_freeblocks_info *info = data;
+	struct inodedep *inodedep;
+
+	if (getdirtybuf(&bp, MNT_WAIT) == 0)
+		return(-1);
+	(void) inodedep_lookup(info->fs, info->ip->i_number, 0, &inodedep);
+	deallocate_dependencies(bp, inodedep);
+	bp->b_flags |= B_INVAL | B_NOCACHE;
+	FREE_LOCK(&lk);
+	brelse(bp);
+	ACQUIRE_LOCK(&lk);
+	return(1);
+}
+
 /*
  * Reclaim any dependency structures from a buffer that is about to
  * be reallocated to a new vnode. The buffer must be locked, thus,
@@ -4016,49 +4040,50 @@
  * before flushing the rest of the dirty blocks so as to reduce
  * the number of dependencies that will have to be rolled back.
  */
+static int softdep_fsync_mountdev_bp(struct buf *bp, void *data);
+
 void
 softdep_fsync_mountdev(vp)
 	struct vnode *vp;
 {
-	struct buf *bp, *nbp;
-	struct worklist *wk;
-
 	if (!vn_isdisk(vp, NULL))
 		panic("softdep_fsync_mountdev: vnode not a disk");
 	ACQUIRE_LOCK(&lk);
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-		nbp = TAILQ_NEXT(bp, b_vnbufs);
-		/* 
-		 * If it is already scheduled, skip to the next buffer.
-		 */
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
-			continue;
-		if ((bp->b_flags & B_DELWRI) == 0) {
-			FREE_LOCK(&lk);
-			panic("softdep_fsync_mountdev: not dirty");
-		}
-		/*
-		 * We are only interested in bitmaps with outstanding
-		 * dependencies.
-		 */
-		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
-		    wk->wk_type != D_BMSAFEMAP ||
-		    (bp->b_xflags & BX_BKGRDINPROG)) {
-			BUF_UNLOCK(bp);
-			continue;
-		}
-		bremfree(bp);
+	RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 
+		softdep_fsync_mountdev_bp, NULL);
+	drain_output(vp, 1);
+	FREE_LOCK(&lk);
+}
+
+static int
+softdep_fsync_mountdev_bp(struct buf *bp, void *data)
+{
+	struct worklist *wk;
+
+	/* 
+	 * If it is already scheduled, skip to the next buffer.
+	 */
+	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
+		return(0);
+	if ((bp->b_flags & B_DELWRI) == 0) {
 		FREE_LOCK(&lk);
-		(void) bawrite(bp);
-		ACQUIRE_LOCK(&lk);
-		/*
-		 * Since we may have slept during the I/O, we need 
-		 * to start from a known point.
-		 */
-		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
+		panic("softdep_fsync_mountdev: not dirty");
 	}
-	drain_output(vp, 1);
+	/*
+	 * We are only interested in bitmaps with outstanding
+	 * dependencies.
+	 */
+	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
+	    wk->wk_type != D_BMSAFEMAP ||
+	    (bp->b_xflags & BX_BKGRDINPROG)) {
+		BUF_UNLOCK(bp);
+		return(0);
+	}
+	bremfree(bp);
 	FREE_LOCK(&lk);
+	(void) bawrite(bp);
+	ACQUIRE_LOCK(&lk);
+	return(0);
 }
 
 /*
@@ -4067,22 +4092,18 @@
  * so that the syncing routine can succeed by pushing the dirty blocks
  * associated with the file. If any I/O errors occur, they are returned.
  */
+struct softdep_sync_metadata_info {
+	struct vnode *vp;
+	int waitfor;
+};
+
+static int softdep_sync_metadata_bp(struct buf *bp, void *data);
+
 int
-softdep_sync_metadata(ap)
-	struct vop_fsync_args /* {
-		struct vnode *a_vp;
-		struct ucred *a_cred;
-		int a_waitfor;
-		struct proc *a_p;
-	} */ *ap;
+softdep_sync_metadata(struct vnode *vp, struct thread *td)
 {
-	struct vnode *vp = ap->a_vp;
-	struct pagedep *pagedep;
-	struct allocdirect *adp;
-	struct allocindir *aip;
-	struct buf *bp, *nbp;
-	struct worklist *wk;
-	int i, error, waitfor;
+	struct softdep_sync_metadata_info info;
+	int error, waitfor;
 
 	/*
 	 * Check whether this vnode is involved in a filesystem
@@ -4127,12 +4148,71 @@
 	 * all potential buffers on the dirty list will be visible.
 	 */
 	drain_output(vp, 1);
-	if (getdirtybuf(&TAILQ_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) {
+	info.vp = vp;
+	info.waitfor = waitfor;
+	error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL, 
+			softdep_sync_metadata_bp, &info);
+	if (error < 0) {
+		FREE_LOCK(&lk);
+		return(-error);	/* error code */
+	}
+
+	/*
+	 * The brief unlock is to allow any pent up dependency
+	 * processing to be done.  Then proceed with the second pass.
+	 */
+	if (waitfor == MNT_NOWAIT) {
+		waitfor = MNT_WAIT;
+		FREE_LOCK(&lk);
+		ACQUIRE_LOCK(&lk);
+		goto top;
+	}
+
+	/*
+	 * If we have managed to get rid of all the dirty buffers,
+	 * then we are done. For certain directories and block
+	 * devices, we may need to do further work.
+	 *
+	 * We must wait for any I/O in progress to finish so that
+	 * all potential buffers on the dirty list will be visible.
+	 */
+	drain_output(vp, 1);
+	if (RB_EMPTY(&vp->v_rbdirty_tree)) {
 		FREE_LOCK(&lk);
 		return (0);
 	}
-	bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
-loop:
+
+	FREE_LOCK(&lk);
+	/*
+	 * If we are trying to sync a block device, some of its buffers may
+	 * contain metadata that cannot be written until the contents of some
+	 * partially written files have been written to disk. The only easy
+	 * way to accomplish this is to sync the entire filesystem (luckily
+	 * this happens rarely).
+	 */
+	if (vn_isdisk(vp, NULL) && 
+	    vp->v_rdev &&
+	    vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
+	    (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, td)) != 0)
+		return (error);
+	return (0);
+}
+
+static int
+softdep_sync_metadata_bp(struct buf *bp, void *data)
+{
+	struct softdep_sync_metadata_info *info = data;
+	struct pagedep *pagedep;
+	struct allocdirect *adp;
+	struct allocindir *aip;
+	struct worklist *wk;
+	struct buf *nbp;
+	int error;
+	int i;
+
+	if (getdirtybuf(&bp, MNT_WAIT) == 0)
+		return (0);
+
 	/*
 	 * As we hold the buffer locked, none of its dependencies
 	 * will disappear.
@@ -4145,14 +4225,15 @@
 			if (adp->ad_state & DEPCOMPLETE)
 				break;
 			nbp = adp->ad_buf;
-			if (getdirtybuf(&nbp, waitfor) == 0)
+			if (getdirtybuf(&nbp, info->waitfor) == 0)
 				break;
 			FREE_LOCK(&lk);
-			if (waitfor == MNT_NOWAIT) {
+			if (info->waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
 				bawrite(bp);
-				return (error);
+				ACQUIRE_LOCK(&lk);
+				return (-error);
 			}
 			ACQUIRE_LOCK(&lk);
 			break;
@@ -4162,14 +4243,15 @@
 			if (aip->ai_state & DEPCOMPLETE)
 				break;
 			nbp = aip->ai_buf;
-			if (getdirtybuf(&nbp, waitfor) == 0)
+			if (getdirtybuf(&nbp, info->waitfor) == 0)
 				break;
 			FREE_LOCK(&lk);
-			if (waitfor == MNT_NOWAIT) {
+			if (info->waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
 				bawrite(bp);
-				return (error);
+				ACQUIRE_LOCK(&lk);
+				return (-error);
 			}
 			ACQUIRE_LOCK(&lk);
 			break;
@@ -4186,7 +4268,8 @@
 				FREE_LOCK(&lk);
 				if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
 					bawrite(bp);
-					return (error);
+					ACQUIRE_LOCK(&lk);
+					return (-error);
 				}
 				ACQUIRE_LOCK(&lk);
 				goto restart;
@@ -4198,7 +4281,8 @@
 			    WK_INODEDEP(wk)->id_ino)) != 0) {
 				FREE_LOCK(&lk);
 				bawrite(bp);
-				return (error);
+				ACQUIRE_LOCK(&lk);
+				return (-error);
 			}
 			break;
 
@@ -4215,11 +4299,13 @@
 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
 					continue;
 				if ((error =
-				    flush_pagedep_deps(vp, pagedep->pd_mnt,
+				    flush_pagedep_deps(info->vp,
+						pagedep->pd_mnt,
 						&pagedep->pd_diraddhd[i]))) {
 					FREE_LOCK(&lk);
 					bawrite(bp);
-					return (error);
+					ACQUIRE_LOCK(&lk);
+					return (-error);
 				}
 			}
 			break;
@@ -4233,14 +4319,15 @@
 			 * rather than panic, just flush it.
 			 */
 			nbp = WK_MKDIR(wk)->md_buf;
-			if (getdirtybuf(&nbp, waitfor) == 0)
+			if (getdirtybuf(&nbp, info->waitfor) == 0)
 				break;
 			FREE_LOCK(&lk);
-			if (waitfor == MNT_NOWAIT) {
+			if (info->waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
 				bawrite(bp);
-				return (error);
+				ACQUIRE_LOCK(&lk);
+				return (-error);
 			}
 			ACQUIRE_LOCK(&lk);
 			break;
@@ -4254,14 +4341,15 @@
 			 * rather than panic, just flush it.
 			 */
 			nbp = WK_BMSAFEMAP(wk)->sm_buf;
-			if (getdirtybuf(&nbp, waitfor) == 0)
+			if (getdirtybuf(&nbp, info->waitfor) == 0)
 				break;
 			FREE_LOCK(&lk);
-			if (waitfor == MNT_NOWAIT) {
+			if (info->waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = VOP_BWRITE(nbp->b_vp, nbp)) != 0) {
 				bawrite(bp);
-				return (error);
+				ACQUIRE_LOCK(&lk);
+				return (-error);
 			}
 			ACQUIRE_LOCK(&lk);
 			break;
@@ -4273,54 +4361,10 @@
 			/* NOTREACHED */
 		}
 	}
-	(void) getdirtybuf(&TAILQ_NEXT(bp, b_vnbufs), MNT_WAIT);
-	nbp = TAILQ_NEXT(bp, b_vnbufs);
 	FREE_LOCK(&lk);
 	bawrite(bp);
 	ACQUIRE_LOCK(&lk);
-	if (nbp != NULL) {
-		bp = nbp;
-		goto loop;
-	}
-	/*
-	 * The brief unlock is to allow any pent up dependency
-	 * processing to be done.  Then proceed with the second pass.
-	 */
-	if (waitfor == MNT_NOWAIT) {
-		waitfor = MNT_WAIT;
-		FREE_LOCK(&lk);
-		ACQUIRE_LOCK(&lk);
-		goto top;
-	}
-
-	/*
-	 * If we have managed to get rid of all the dirty buffers,
-	 * then we are done. For certain directories and block
-	 * devices, we may need to do further work.
-	 *
-	 * We must wait for any I/O in progress to finish so that
-	 * all potential buffers on the dirty list will be visible.
-	 */
-	drain_output(vp, 1);
-	if (TAILQ_FIRST(&vp->v_dirtyblkhd) == NULL) {
-		FREE_LOCK(&lk);
-		return (0);
-	}
-
-	FREE_LOCK(&lk);
-	/*
-	 * If we are trying to sync a block device, some of its buffers may
-	 * contain metadata that cannot be written until the contents of some
-	 * partially written files have been written to disk. The only easy
-	 * way to accomplish this is to sync the entire filesystem (luckily
-	 * this happens rarely).
-	 */
-	if (vn_isdisk(vp, NULL) && 
-	    vp->v_rdev &&
-	    vp->v_rdev->si_mountpoint && !VOP_ISLOCKED(vp, NULL) &&
-	    (error = VFS_SYNC(vp->v_rdev->si_mountpoint, MNT_WAIT, ap->a_td)) != 0)
-		return (error);
-	return (0);
+	return(0);
 }
 
 /*
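
    One subtlety in the soft-updates conversion above: the callbacks run
    with the global interlock lk held by the scan, and every place they
    drop it with FREE_LOCK() to issue I/O they must take it back with
    ACQUIRE_LOCK() before returning, including the error paths that now
    return a negated errno (hence the ACQUIRE_LOCK(&lk) calls added in
    front of each "return (-error)").  A minimal user-space analogue of
    that contract, using a pthread mutex as a stand-in for lk, might look
    like this:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/* Stand-in for the soft-updates interlock "lk". */
static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

/*
 * The scan invokes each callback with lk held and expects it held again
 * on return; a negative return aborts the scan and is negated by the
 * caller to recover the errno.
 */
static int
demo_callback(int simulated_write_error)
{
	int error;

	pthread_mutex_unlock(&lk);	/* FREE_LOCK(&lk) around the "I/O" */
	error = simulated_write_error;
	pthread_mutex_lock(&lk);	/* ACQUIRE_LOCK(&lk) before returning */
	if (error)
		return (-error);
	return (0);
}

int
main(void)
{
	int error;

	pthread_mutex_lock(&lk);	/* the scan runs with lk held */
	error = demo_callback(EIO);
	pthread_mutex_unlock(&lk);
	if (error < 0)
		error = -error;
	printf("error %d\n", error);
	return (0);
}
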
Index: vfs/ufs/ffs_softdep_stub.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_softdep_stub.c,v
retrieving revision 1.6
diff -u -r1.6 ffs_softdep_stub.c
--- vfs/ufs/ffs_softdep_stub.c	18 Jul 2004 19:43:48 -0000	1.6
+++ vfs/ufs/ffs_softdep_stub.c	9 Apr 2005 02:51:18 -0000
@@ -175,11 +175,10 @@
 }
 
 /*
- * softdep_sync_metadata(struct vnode *a_vp, struct ucred *a_cred,
- *			 int a_waitfor, struct proc *a_p)
+ * softdep_sync_metadata(struct vnode *vp, struct thread *td)
  */
 int
-softdep_sync_metadata(struct vop_fsync_args *ap)
+softdep_sync_metadata(struct vnode *vp, struct thread *td)
 {
 	return (0);
 }
Index: vfs/ufs/ffs_vfsops.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_vfsops.c,v
retrieving revision 1.31
diff -u -r1.31 ffs_vfsops.c
--- vfs/ufs/ffs_vfsops.c	12 Feb 2005 01:30:57 -0000	1.31
+++ vfs/ufs/ffs_vfsops.c	9 Apr 2005 02:51:19 -0000
@@ -1008,7 +1008,7 @@
 	 */
 	if (vp->v_type == VNON || ((ip->i_flag &
 	     (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
-	     TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
+	     RB_EMPTY(&vp->v_rbdirty_tree))) {
 		return(-1);
 	}
 	return(0);
@@ -1028,7 +1028,7 @@
 	ip = VTOI(vp);
 	if (vp->v_type == VNON || ((ip->i_flag &
 	     (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
-	     TAILQ_EMPTY(&vp->v_dirtyblkhd))) {
+	     RB_EMPTY(&vp->v_rbdirty_tree))) {
 		return(0);
 	}
 	if (vp->v_type != VCHR) {
Index: vfs/ufs/ffs_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_vnops.c,v
retrieving revision 1.12
diff -u -r1.12 ffs_vnops.c
--- vfs/ufs/ffs_vnops.c	15 Feb 2005 08:32:18 -0000	1.12
+++ vfs/ufs/ffs_vnops.c	9 Apr 2005 02:51:19 -0000
@@ -103,17 +103,17 @@
  * ffs_fsync(struct vnode *a_vp, struct ucred *a_cred, int a_waitfor,
  *	     struct proc *a_p)
  */
+
+static int ffs_checkdeferred(struct buf *bp);
+
 /* ARGSUSED */
 static int
 ffs_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
-	struct buf *bp;
-	struct buf *nbp;
-	int s, error, wait, passes, skipmeta;
 	daddr_t lbn;
+	int error;
 
-	wait = (ap->a_waitfor == MNT_WAIT);
 	if (vn_isdisk(vp, NULL)) {
 		lbn = INT_MAX;
 		if (vp->v_rdev && vp->v_rdev->si_mountpoint != NULL &&
@@ -128,133 +128,21 @@
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
-	passes = NIADDR + 1;
-	skipmeta = 0;
-	if (wait)
-		skipmeta = 1;
-	s = splbio();
-loop:
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp;
-	     bp = TAILQ_NEXT(bp, b_vnbufs))
-		bp->b_flags &= ~B_SCANNED;
-	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
-		nbp = TAILQ_NEXT(bp, b_vnbufs);
-		/* 
-		 * Reasons to skip this buffer: it has already been considered
-		 * on this pass, this pass is the first time through on a
-		 * synchronous flush request and the buffer being considered
-		 * is metadata, the buffer has dependencies that will cause
-		 * it to be redirtied and it has not already been deferred,
-		 * or it is already being written.
-		 */
-		if ((bp->b_flags & B_SCANNED) != 0)
-			continue;
-		bp->b_flags |= B_SCANNED;
-		if ((skipmeta == 1 && bp->b_lblkno < 0))
-			continue;
-		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
-		    (bp->b_flags & B_DEFERRED) == 0 &&
-		    bioops.io_countdeps && (*bioops.io_countdeps)(bp, 0)) {
-			bp->b_flags |= B_DEFERRED;
-			continue;
-		}
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
-			continue;
-		if ((bp->b_flags & B_DELWRI) == 0)
-			panic("ffs_fsync: not dirty");
-		if (vp != bp->b_vp)
-			panic("ffs_fsync: vp != vp->b_vp");
-		/*
-		 * If this is a synchronous flush request, or it is not a
-		 * file or device, start the write on this buffer immediatly.
-		 */
-		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
-
-			/*
-			 * On our final pass through, do all I/O synchronously
-			 * so that we can find out if our flush is failing
-			 * because of write errors.
-			 */
-			if (passes > 0 || !wait) {
-				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
-					BUF_UNLOCK(bp);
-					(void) vfs_bio_awrite(bp);
-				} else {
-					bremfree(bp);
-					splx(s);
-					(void) bawrite(bp);
-					s = splbio();
-				}
-			} else {
-				bremfree(bp);
-				splx(s);
-				if ((error = bwrite(bp)) != 0)
-					return (error);
-				s = splbio();
-			}
-		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
-			/* 
-			 * If the buffer is for data that has been truncated
-			 * off the file, then throw it away.
-			 */
-			bremfree(bp);
-			bp->b_flags |= B_INVAL | B_NOCACHE;
-			splx(s);
-			s = splbio();
-		} else {
-			BUF_UNLOCK(bp);
-			vfs_bio_awrite(bp);
-		}
-		/*
-		 * Since we may have slept during the I/O, we need 
-		 * to start from a known point.
-		 */
-		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
-	}
-	/*
-	 * If we were asked to do this synchronously, then go back for
-	 * another pass, this time doing the metadata.
-	 */
-	if (skipmeta) {
-		skipmeta = 0;
-		goto loop;
-	}
+	error = vfsync(vp, ap->a_waitfor, NIADDR + 1, lbn, ffs_checkdeferred, softdep_sync_metadata);
+	if (error == 0)
+		error = UFS_UPDATE(vp, (ap->a_waitfor == MNT_WAIT));
+	return (error);
+}
 
-	if (wait) {
-		while (vp->v_numoutput) {
-			vp->v_flag |= VBWAIT;
-			(void)tsleep((caddr_t)&vp->v_numoutput, 0, "ffsfsn", 0);
-  		}
-
-		/* 
-		 * Ensure that any filesystem metatdata associated
-		 * with the vnode has been written.
-		 */
-		splx(s);
-		if ((error = softdep_sync_metadata(ap)) != 0)
-			return (error);
-		s = splbio();
-
-		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
-			/*
-			 * Block devices associated with filesystems may
-			 * have new I/O requests posted for them even if
-			 * the vnode is locked, so no amount of trying will
-			 * get them clean. Thus we give block devices a
-			 * good effort, then just give up. For all other file
-			 * types, go around and try again until it is clean.
-			 */
-			if (passes > 0) {
-				passes -= 1;
-				goto loop;
-			}
-#ifdef DIAGNOSTIC
-			if (!vn_isdisk(vp, NULL))
-				vprint("ffs_fsync: dirty", vp);
-#endif
-		}
+static int
+ffs_checkdeferred(struct buf *bp)
+{
+	if (LIST_FIRST(&bp->b_dep) != NULL &&
+	    (bp->b_flags & B_DEFERRED) == 0 &&
+	    bioops.io_countdeps && (*bioops.io_countdeps)(bp, 0)) {
+		bp->b_flags |= B_DEFERRED;
+		return(1);
 	}
-	splx(s);
-	return (UFS_UPDATE(vp, wait));
+	return(0);
 }
+




