BUF/BIO stage 3 patch #1
Matthew Dillon
dillon at apollo.backplane.com
Fri Mar 3 12:47:28 PST 2006
Ok, here is the next patch. I am treading carefully, doing as much
precursor work as I can before the massive block number -> 64-bit offset
conversion patch.

This is a smaller patch that cleans up the buffer cache hash table.
The global hash table has been replaced by a per-vnode red-black tree,
and the buffer's vnode assignment, bp->b_vp, is now completely integrated
with the block lookup code (prior patches removed all the bp->b_vp hacks,
so bp->b_vp is now an ultra-stable field). This in turn allowed the
reassignbuf() routine to be greatly simplified.
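
To make the shape of the change easier to see without reading the whole
diff, here is a minimal userland sketch of the idea, assuming a BSD-style
<sys/tree.h> is available. Every *_sk identifier (buf_sk, vnode_sk,
findblk_sk, and so on) is invented for this illustration only and is not
part of the patch itself.

/*
 * Minimal userland sketch (not the kernel code itself) of the structure
 * this patch introduces: each vnode carries its own red-black tree of
 * buffers keyed by logical block number, and findblk() becomes a lookup
 * in that per-vnode tree rather than a probe of a global hash table.
 */
#include <sys/tree.h>
#include <stdio.h>
#include <stdlib.h>

typedef long lblkno_t;                  /* stand-in for the kernel daddr_t */

struct buf_sk {
        RB_ENTRY(buf_sk) b_rbhash;      /* node in the per-vnode lookup tree */
        lblkno_t b_lblkno;              /* logical block number (the key) */
};

RB_HEAD(buf_rb_hash_sk, buf_sk);

static int
rb_buf_compare_sk(struct buf_sk *b1, struct buf_sk *b2)
{
        if (b1->b_lblkno < b2->b_lblkno)
                return (-1);
        if (b1->b_lblkno > b2->b_lblkno)
                return (1);
        return (0);
}

RB_PROTOTYPE(buf_rb_hash_sk, buf_sk, b_rbhash, rb_buf_compare_sk);
RB_GENERATE(buf_rb_hash_sk, buf_sk, b_rbhash, rb_buf_compare_sk);

struct vnode_sk {
        struct buf_rb_hash_sk v_rbhash_tree;    /* per-vnode buffer index */
};

/*
 * findblk()-style lookup: locate a buffer by (vp, blkno) without locking
 * or otherwise manipulating it.  Returns NULL if the block is not resident.
 */
static struct buf_sk *
findblk_sk(struct vnode_sk *vp, lblkno_t blkno)
{
        struct buf_sk key;

        key.b_lblkno = blkno;
        return (RB_FIND(buf_rb_hash_sk, &vp->v_rbhash_tree, &key));
}

int
main(void)
{
        struct vnode_sk vp;
        struct buf_sk *bp;
        lblkno_t blknos[] = { 0, 8, 16, 24 };
        size_t i;

        RB_INIT(&vp.v_rbhash_tree);

        /* bgetvp()-style association: insert each buffer into the tree. */
        for (i = 0; i < sizeof(blknos) / sizeof(blknos[0]); i++) {
                bp = calloc(1, sizeof(*bp));
                bp->b_lblkno = blknos[i];
                if (RB_INSERT(buf_rb_hash_sk, &vp.v_rbhash_tree, bp) != NULL)
                        printf("dup lblk %ld\n", (long)blknos[i]);
        }

        bp = findblk_sk(&vp, 16);
        printf("blkno 16: %s\n", bp != NULL ? "resident" : "not resident");
        bp = findblk_sk(&vp, 12);
        printf("blkno 12: %s\n", bp != NULL ? "resident" : "not resident");
        return (0);
}

The kernel version in the patch below differs mainly in that it uses
DragonFly's RB_GENERATE2/RB_PROTOTYPE2 macros, which also emit a
lookup-by-key helper (buf_rb_hash_RB_LOOKUP) so findblk() can search on
blkno directly instead of filling in a dummy key element, and in that
the insertion happens in bgetvp() under a critical section with the new
B_HASHED flag tracking membership in the tree.
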
-Matt
Matthew Dillon
<dillon at xxxxxxxxxxxxx>
Index: i386/i386/machdep.c
===================================================================
RCS file: /cvs/src/sys/i386/i386/machdep.c,v
retrieving revision 1.86
diff -u -r1.86 machdep.c
--- i386/i386/machdep.c 17 Feb 2006 19:18:06 -0000 1.86
+++ i386/i386/machdep.c 3 Mar 2006 18:28:07 -0000
@@ -353,7 +353,6 @@
valloc(swbuf, struct buf, nswbuf);
valloc(buf, struct buf, nbuf);
- v = bufhashinit(v);
/*
* End of first pass, size has been calculated so allocate memory
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.57
diff -u -r1.57 vfs_bio.c
--- kern/vfs_bio.c 2 Mar 2006 20:28:49 -0000 1.57
+++ kern/vfs_bio.c 3 Mar 2006 19:30:37 -0000
@@ -164,9 +164,6 @@
SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf),
"sizeof(struct buf)");
-static int bufhashmask;
-static int bufhashshift;
-static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
char *buf_wmesg = BUF_WMESG;
extern int vm_swap_size;
@@ -177,51 +174,6 @@
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
/*
- * Buffer hash table code. Note that the logical block scans linearly, which
- * gives us some L1 cache locality.
- */
-
-static __inline
-struct bufhashhdr *
-bufhash(struct vnode *vnp, daddr_t bn)
-{
- u_int64_t hashkey64;
- int hashkey;
-
- /*
- * A variation on the Fibonacci hash that Knuth credits to
- * R. W. Floyd, see Knuth's _Art of Computer Programming,
- * Volume 3 / Sorting and Searching_
- *
- * We reduce the argument to 32 bits before doing the hash to
- * avoid the need for a slow 64x64 multiply on 32 bit platforms.
- *
- * sizeof(struct vnode) is 168 on i386, so toss some of the lower
- * bits of the vnode address to reduce the key range, which
- * improves the distribution of keys across buckets.
- *
- * The file system cylinder group blocks are very heavily
- * used. They are located at invervals of fbg, which is
- * on the order of 89 to 94 * 2^10, depending on other
- * filesystem parameters, for a 16k block size. Smaller block
- * sizes will reduce fpg approximately proportionally. This
- * will cause the cylinder group index to be hashed using the
- * lower bits of the hash multiplier, which will not distribute
- * the keys as uniformly in a classic Fibonacci hash where a
- * relatively small number of the upper bits of the result
- * are used. Using 2^16 as a close-enough approximation to
- * fpg, split the hash multiplier in half, with the upper 16
- * bits being the inverse of the golden ratio, and the lower
- * 16 bits being a fraction between 1/3 and 3/7 (closer to
- * 3/7 in this case), that gives good experimental results.
- */
- hashkey64 = ((u_int64_t)(uintptr_t)vnp >> 3) + (u_int64_t)bn;
- hashkey = (((u_int32_t)(hashkey64 + (hashkey64 >> 32)) * 0x9E376DB1u) >>
- bufhashshift) & bufhashmask;
- return(&bufhashtbl[hashkey]);
-}
-
-/*
* numdirtywakeup:
*
* If someone is blocked due to there being too many dirty buffers,
@@ -384,25 +336,6 @@
}
/*
- * bufhashinit:
- *
- * Initialize buffer headers and related structures.
- */
-
-caddr_t
-bufhashinit(caddr_t vaddr)
-{
- /* first, make a null hash table */
- bufhashshift = 29;
- for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
- bufhashshift--;
- bufhashtbl = (void *)vaddr;
- vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
- --bufhashmask;
- return(vaddr);
-}
-
-/*
* bufinit:
*
* Load time initialisation of the buffer cache, called from machine
@@ -415,11 +348,6 @@
vm_offset_t bogus_offset;
int i;
- LIST_INIT(&invalhash);
-
- for (i = 0; i <= bufhashmask; i++)
- LIST_INIT(&bufhashtbl[i]);
-
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
TAILQ_INIT(&bufqueues[i]);
@@ -435,7 +363,6 @@
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist);
- LIST_INSERT_HEAD(&invalhash, bp, b_hash);
}
/*
@@ -923,7 +850,7 @@
if ((bp->b_flags & B_DELWRI) == 0) {
bp->b_flags |= B_DONE | B_DELWRI;
- reassignbuf(bp, bp->b_vp);
+ reassignbuf(bp);
++numdirtybuffers;
bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
}
@@ -949,7 +876,7 @@
{
if (bp->b_flags & B_DELWRI) {
bp->b_flags &= ~B_DELWRI;
- reassignbuf(bp, bp->b_vp);
+ reassignbuf(bp);
--numdirtybuffers;
numdirtywakeup(lodirtybuffers);
}
@@ -1274,6 +1201,7 @@
bp->b_flags |= B_INVAL;
bp->b_xflags &= ~BX_BKGRDWRITE;
KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08lx vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
+ KKASSERT((bp->b_flags & B_HASHED) == 0);
if (bp->b_xflags & BX_BKGRDINPROG)
panic("losing buffer 1");
if (bp->b_kvasize) {
@@ -1282,22 +1210,19 @@
bp->b_qindex = BQUEUE_EMPTY;
}
TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
- LIST_REMOVE(bp, b_hash);
- LIST_INSERT_HEAD(&invalhash, bp, b_hash);
} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
/*
* Buffers with junk contents. Again these buffers had better
* already be disassociated from their vnode.
*/
KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08lx vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp));
+ KKASSERT((bp->b_flags & B_HASHED) == 0);
bp->b_flags |= B_INVAL;
bp->b_xflags &= ~BX_BKGRDWRITE;
if (bp->b_xflags & BX_BKGRDINPROG)
panic("losing buffer 2");
bp->b_qindex = BQUEUE_CLEAN;
TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
- LIST_REMOVE(bp, b_hash);
- LIST_INSERT_HEAD(&invalhash, bp, b_hash);
} else if (bp->b_flags & B_LOCKED) {
/*
* Buffers that are locked.
@@ -1497,25 +1422,6 @@
}
/*
- * gbincore:
- *
- * Check to see if a block is currently memory resident.
- */
-struct buf *
-gbincore(struct vnode * vp, daddr_t blkno)
-{
- struct buf *bp;
- struct bufhashhdr *bh;
-
- bh = bufhash(vp, blkno);
- LIST_FOREACH(bp, bh, b_hash) {
- if (bp->b_vp == vp && bp->b_lblkno == blkno)
- break;
- }
- return (bp);
-}
-
-/*
* vfs_bio_awrite:
*
* Implement clustered async writes for clearing out B_DELWRI buffers.
@@ -1556,7 +1462,7 @@
maxcl = MAXPHYS / size;
for (i = 1; i < maxcl; i++) {
- if ((bpa = gbincore(vp, lblkno + i)) &&
+ if ((bpa = findblk(vp, lblkno + i)) &&
BUF_REFCNT(bpa) == 0 &&
((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
(B_DELWRI | B_CLUSTEROK)) &&
@@ -1570,7 +1476,7 @@
}
}
for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
- if ((bpa = gbincore(vp, lblkno - j)) &&
+ if ((bpa = findblk(vp, lblkno - j)) &&
BUF_REFCNT(bpa) == 0 &&
((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
(B_DELWRI | B_CLUSTEROK)) &&
@@ -1785,12 +1691,11 @@
*/
KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08lx vnode %p qindex %d unexpectededly still associated!", bp, bp->b_flags, bp->b_vp, qindex));
+ KKASSERT((bp->b_flags & B_HASHED) == 0);
if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
(*bioops.io_deallocate)(bp);
if (bp->b_xflags & BX_BKGRDINPROG)
panic("losing buffer 3");
- LIST_REMOVE(bp, b_hash);
- LIST_INSERT_HEAD(&invalhash, bp, b_hash);
/*
* critical section protection is not required when
@@ -2055,26 +1960,10 @@
}
/*
- * incore:
- *
- * Check to see if a block is currently resident in memory.
- */
-struct buf *
-incore(struct vnode * vp, daddr_t blkno)
-{
- struct buf *bp;
-
- crit_enter();
- bp = gbincore(vp, blkno);
- crit_exit();
- return (bp);
-}
-
-/*
* inmem:
*
* Returns true if no I/O is needed to access the associated VM object.
- * This is like incore except it also hunts around in the VM system for
+ * This is like findblk except it also hunts around in the VM system for
* the data.
*
* Note that we ignore vm_page_free() races from interrupts against our
@@ -2089,7 +1978,7 @@
vm_page_t m;
vm_ooffset_t off;
- if (incore(vp, blkno))
+ if (findblk(vp, blkno))
return 1;
if (vp->v_mount == NULL)
return 0;
@@ -2208,6 +2097,25 @@
}
/*
+ * findblk:
+ *
+ * Locate and return the specified buffer, or NULL if the buffer does
+ * not exist. Do not attempt to lock the buffer or manipulate it in
+ * any way. The caller must validate that the correct buffer has been
+ * obtained after locking it.
+ */
+struct buf *
+findblk(struct vnode *vp, daddr_t blkno)
+{
+ struct buf *bp;
+
+ crit_enter();
+ bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, blkno);
+ crit_exit();
+ return(bp);
+}
+
+/*
* getblk:
*
* Get a block given a specified block and offset into a file/device.
@@ -2251,10 +2159,9 @@
* prior to issuing the READ. biodone() will *not* clear B_INVAL.
*/
struct buf *
-getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
+getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
struct buf *bp;
- struct bufhashhdr *bh;
if (size > MAXBSIZE)
panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE);
@@ -2266,7 +2173,7 @@
* to completely exhaust the buffer cache.
*
* If this check ever becomes a bottleneck it may be better to
- * move it into the else, when gbincore() fails. At the moment
+ * move it into the else, when findblk() fails. At the moment
* it isn't a problem.
*
* XXX remove, we cannot afford to block anywhere if holding a vnode
@@ -2279,7 +2186,7 @@
tsleep(&needsbuffer, slpflag, "newbuf", slptimeo);
}
- if ((bp = gbincore(vp, blkno))) {
+ if ((bp = findblk(vp, blkno))) {
/*
* The buffer was found in the cache, but we need to lock it.
* Even with LK_NOWAIT the lockmgr may break our critical
@@ -2458,7 +2365,7 @@
* from the point of the duplicate buffer creation through
* to here, and we've locked the buffer.
*/
- if (gbincore(vp, blkno)) {
+ if (findblk(vp, blkno)) {
bp->b_flags |= B_INVAL;
brelse(bp);
goto loop;
@@ -2466,18 +2373,16 @@
/*
* Insert the buffer into the hash, so that it can
- * be found by incore. bgetvp() and bufhash()
- * must be synchronized with each other. Make sure the
- * translation layer has been cleared.
+ * be found by findblk().
+ *
+ * Make sure the translation layer has been cleared.
*/
bp->b_lblkno = blkno;
bp->b_loffset = offset;
bp->b_bio2.bio_blkno = (daddr_t)-1;
+ /* bp->b_bio2.bio_next = NULL; */
bgetvp(vp, bp);
- LIST_REMOVE(bp, b_hash);
- bh = bufhash(vp, blkno);
- LIST_INSERT_HEAD(bh, bp, b_hash);
/*
* set B_VMIO bit. allocbuf() the buffer bigger. Since the
Index: kern/vfs_cluster.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_cluster.c,v
retrieving revision 1.16
diff -u -r1.16 vfs_cluster.c
--- kern/vfs_cluster.c 21 Feb 2006 18:46:56 -0000 1.16
+++ kern/vfs_cluster.c 3 Mar 2006 19:31:25 -0000
@@ -133,14 +133,14 @@
bp->b_flags &= ~B_RAM;
/*
* We do the crit here so that there is no window
- * between the incore and the b_usecount increment
+ * between the findblk and the b_usecount increment
* below. We opt to keep the crit out of the loop
* for efficiency.
*/
crit_enter();
for (i = 1; i < maxra; i++) {
- if (!(tbp = incore(vp, lblkno+i))) {
+ if (!(tbp = findblk(vp, lblkno+i))) {
break;
}
@@ -371,7 +371,7 @@
* would block in the lock. The same checks have to
* be made again after we officially get the buffer.
*/
- if ((tbp = incore(vp, lbn + i)) != NULL) {
+ if ((tbp = findblk(vp, lbn + i)) != NULL) {
if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
break;
BUF_UNLOCK(tbp);
@@ -749,7 +749,7 @@
* is delayed-write but either locked or inval, it cannot
* partake in the clustered write.
*/
- if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
+ if (((tbp = findblk(vp, start_lbn)) == NULL) ||
((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) ||
BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
++start_lbn;
@@ -818,7 +818,7 @@
* If the adjacent data is not even in core it
* can't need to be written.
*/
- if ((tbp = gbincore(vp, start_lbn)) == NULL) {
+ if ((tbp = findblk(vp, start_lbn)) == NULL) {
crit_exit();
break;
}
@@ -903,7 +903,6 @@
bundirty(tbp);
tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
tbp->b_flags |= B_ASYNC;
- reassignbuf(tbp, tbp->b_vp); /* put on clean list */
crit_exit();
BUF_KERNPROC(tbp);
cluster_append(&bp->b_bio1, tbp);
Index: kern/vfs_lock.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_lock.c,v
retrieving revision 1.9
diff -u -r1.9 vfs_lock.c
--- kern/vfs_lock.c 2 Mar 2006 19:07:59 -0000 1.9
+++ kern/vfs_lock.c 3 Mar 2006 18:26:04 -0000
@@ -494,6 +494,7 @@
RB_INIT(&vp->v_rbclean_tree);
RB_INIT(&vp->v_rbdirty_tree);
+ RB_INIT(&vp->v_rbhash_tree);
vp->v_type = VNON;
vp->v_tag = 0;
vp->v_ops = NULL;
Index: kern/vfs_subr.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.69
diff -u -r1.69 vfs_subr.c
--- kern/vfs_subr.c 2 Mar 2006 20:28:49 -0000 1.69
+++ kern/vfs_subr.c 3 Mar 2006 19:58:30 -0000
@@ -136,7 +136,8 @@
* Red black tree functions
*/
static int rb_buf_compare(struct buf *b1, struct buf *b2);
-RB_GENERATE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);
+RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, daddr_t, b_lblkno);
+RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, daddr_t, b_lblkno);
static int
rb_buf_compare(struct buf *b1, struct buf *b2)
@@ -356,6 +357,8 @@
if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
panic("vinvalbuf: flush failed");
+ if (!RB_EMPTY(&vp->v_rbhash_tree))
+ panic("vinvalbuf: flush failed, buffers still present");
return (0);
}
@@ -785,17 +788,22 @@
bgetvp(struct vnode *vp, struct buf *bp)
{
KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
+ KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI)) == 0);
+ KKASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0);
vhold(vp);
- bp->b_vp = vp;
/*
* Insert onto list for new vnode.
*/
crit_enter();
+ bp->b_vp = vp;
+ bp->b_flags |= B_HASHED;
+ if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp))
+ panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
+
bp->b_xflags |= BX_VNCLEAN;
- bp->b_xflags &= ~BX_VNDIRTY;
if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
- panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
+ panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp);
crit_exit();
}
@@ -821,6 +829,10 @@
buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}
+ if (bp->b_flags & B_HASHED) {
+ buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp);
+ bp->b_flags &= ~B_HASHED;
+ }
if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
vp->v_flag &= ~VONWORKLST;
LIST_REMOVE(vp, v_synclist);
@@ -841,6 +853,7 @@
pbgetvp(struct vnode *vp, struct buf *bp)
{
KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
+ KKASSERT((bp->b_flags & B_HASHED) == 0);
bp->b_vp = vp;
bp->b_flags |= B_PAGING;
@@ -853,37 +866,23 @@
pbrelvp(struct buf *bp)
{
KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
+ KKASSERT((bp->b_flags & B_HASHED) == 0);
bp->b_vp = NULL;
bp->b_flags &= ~B_PAGING;
}
-void
-pbreassignbuf(struct buf *bp, struct vnode *newvp)
-{
- if ((bp->b_flags & B_PAGING) == 0) {
- panic(
- "pbreassignbuf() on non phys bp %p",
- bp
- );
- }
- bp->b_vp = newvp;
-}
-
/*
- * Reassign a buffer from one vnode to another.
- * Used to assign file specific control information
- * (indirect blocks) to the vnode to which they belong.
+ * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.
+ * This routine is called when the state of the B_DELWRI bit is changed.
*/
void
-reassignbuf(struct buf *bp, struct vnode *newvp)
+reassignbuf(struct buf *bp)
{
+ struct vnode *vp = bp->b_vp;
int delay;
- if (newvp == NULL) {
- printf("reassignbuf: NULL");
- return;
- }
+ KKASSERT(vp != NULL);
++reassignbufcalls;
/*
@@ -894,34 +893,30 @@
panic("cannot reassign paging buffer");
crit_enter();
- /*
- * Delete from old vnode list, if on one.
- */
- if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
- if (bp->b_xflags & BX_VNDIRTY)
- buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbdirty_tree, bp);
- else
- buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbclean_tree, bp);
- bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
- if (bp->b_vp != newvp) {
- vdrop(bp->b_vp);
- bp->b_vp = NULL; /* for clarification */
- }
- }
- /*
- * If dirty, put on list of dirty buffers; otherwise insert onto list
- * of clean buffers.
- */
if (bp->b_flags & B_DELWRI) {
- if ((newvp->v_flag & VONWORKLST) == 0) {
- switch (newvp->v_type) {
+ /*
+ * Move to the dirty list, add the vnode to the worklist
+ */
+ if (bp->b_xflags & BX_VNCLEAN) {
+ buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
+ bp->b_xflags &= ~BX_VNCLEAN;
+ }
+ if ((bp->b_xflags & BX_VNDIRTY) == 0) {
+ if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) {
+ panic("reassignbuf: dup lblk vp %p bp %p",
+ vp, bp);
+ }
+ bp->b_xflags |= BX_VNDIRTY;
+ }
+ if ((vp->v_flag & VONWORKLST) == 0) {
+ switch (vp->v_type) {
case VDIR:
delay = dirdelay;
break;
case VCHR:
case VBLK:
- if (newvp->v_rdev &&
- newvp->v_rdev->si_mountpoint != NULL) {
+ if (vp->v_rdev &&
+ vp->v_rdev->si_mountpoint != NULL) {
delay = metadelay;
break;
}
@@ -929,24 +924,29 @@
default:
delay = filedelay;
}
- vn_syncer_add_to_worklist(newvp, delay);
+ vn_syncer_add_to_worklist(vp, delay);
}
- bp->b_xflags |= BX_VNDIRTY;
- if (buf_rb_tree_RB_INSERT(&newvp->v_rbdirty_tree, bp))
- panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
} else {
- bp->b_xflags |= BX_VNCLEAN;
- if (buf_rb_tree_RB_INSERT(&newvp->v_rbclean_tree, bp))
- panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
- if ((newvp->v_flag & VONWORKLST) &&
- RB_EMPTY(&newvp->v_rbdirty_tree)) {
- newvp->v_flag &= ~VONWORKLST;
- LIST_REMOVE(newvp, v_synclist);
+ /*
+ * Move to the clean list, remove the vnode from the worklist
+ * if no dirty blocks remain.
+ */
+ if (bp->b_xflags & BX_VNDIRTY) {
+ buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
+ bp->b_xflags &= ~BX_VNDIRTY;
+ }
+ if ((bp->b_xflags & BX_VNCLEAN) == 0) {
+ if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) {
+ panic("reassignbuf: dup lblk vp %p bp %p",
+ vp, bp);
+ }
+ bp->b_xflags |= BX_VNCLEAN;
+ }
+ if ((vp->v_flag & VONWORKLST) &&
+ RB_EMPTY(&vp->v_rbdirty_tree)) {
+ vp->v_flag &= ~VONWORKLST;
+ LIST_REMOVE(vp, v_synclist);
}
- }
- if (bp->b_vp != newvp) {
- bp->b_vp = newvp;
- vhold(bp->b_vp);
}
crit_exit();
}
Index: sys/buf.h
===================================================================
RCS file: /cvs/src/sys/sys/buf.h,v
retrieving revision 1.23
diff -u -r1.23 buf.h
--- sys/buf.h 2 Mar 2006 19:26:17 -0000 1.23
+++ sys/buf.h 3 Mar 2006 19:38:29 -0000
@@ -75,7 +75,9 @@
#define NBUF_BIO 4
struct buf_rb_tree;
-RB_PROTOTYPE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);
+struct buf_rb_hash;
+RB_PROTOTYPE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, daddr_t, b_lblkno);
+RB_PROTOTYPE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, daddr_t, b_lblkno);
/*
* To avoid including <ufs/ffs/softdep.h>
@@ -138,8 +140,8 @@
* unrelated to the vnode/device whos strategy routine was called.
*/
struct buf {
- LIST_ENTRY(buf) b_hash; /* Hash chain. */
- RB_ENTRY(buf) b_rbnode; /* Red-Black node in vnode RB tree */
+ RB_ENTRY(buf) b_rbnode; /* RB node in vnode clean/dirty tree */
+ RB_ENTRY(buf) b_rbhash; /* RB node in vnode hash tree */
TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
struct buf *b_cluster_next; /* Next buffer (cluster code) */
struct vnode *b_vp; /* (vp, lblkno) index */
@@ -229,7 +231,7 @@
#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
-#define B_UNUSED40 0x00000040 /* Unused */
+#define B_HASHED 0x00000040 /* Indexed via v_rbhash_tree */
#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */
#define B_FREEBUF 0x00000100 /* Instruct driver: free blocks */
#define B_DONE 0x00000200 /* I/O completed. */
@@ -339,7 +341,6 @@
struct uio;
-caddr_t bufhashinit (caddr_t);
void bufinit (void);
void bwillwrite (void);
int buf_dirty_count_severe (void);
@@ -360,9 +361,8 @@
void bqrelse (struct buf *);
int vfs_bio_awrite (struct buf *);
struct buf *getpbuf (int *);
-struct buf *incore (struct vnode *, daddr_t);
-struct buf *gbincore (struct vnode *, daddr_t);
int inmem (struct vnode *, daddr_t);
+struct buf *findblk (struct vnode *, daddr_t);
struct buf *getblk (struct vnode *, daddr_t, int, int, int);
struct buf *geteblk (int);
struct bio *push_bio(struct bio *);
@@ -390,8 +390,7 @@
void pbgetvp (struct vnode *, struct buf *);
void pbrelvp (struct buf *);
int allocbuf (struct buf *bp, int size);
-void reassignbuf (struct buf *, struct vnode *);
-void pbreassignbuf (struct buf *, struct vnode *);
+void reassignbuf (struct buf *);
struct buf *trypbuf (int *);
#endif /* _KERNEL */
Index: sys/vnode.h
===================================================================
RCS file: /cvs/src/sys/sys/vnode.h,v
retrieving revision 1.40
diff -u -r1.40 vnode.h
--- sys/vnode.h 17 Feb 2006 19:18:07 -0000 1.40
+++ sys/vnode.h 3 Mar 2006 18:25:23 -0000
@@ -155,6 +155,7 @@
* journaling is turned on or off.
*/
RB_HEAD(buf_rb_tree, buf);
+RB_HEAD(buf_rb_hash, buf);
struct vnode {
u_long v_flag; /* vnode flags (see below) */
@@ -171,6 +172,7 @@
TAILQ_ENTRY(vnode) v_nmntvnodes; /* vnodes for mount point */
struct buf_rb_tree v_rbclean_tree; /* RB tree of clean bufs */
struct buf_rb_tree v_rbdirty_tree; /* RB tree of dirty bufs */
+ struct buf_rb_hash v_rbhash_tree; /* RB tree general lookup */
LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */
enum vtype v_type; /* vnode type */
union {
Index: vfs/nfs/nfs_bio.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_bio.c,v
retrieving revision 1.26
diff -u -r1.26 nfs_bio.c
--- vfs/nfs/nfs_bio.c 17 Feb 2006 19:18:07 -0000 1.26
+++ vfs/nfs/nfs_bio.c 3 Mar 2006 19:12:19 -0000
@@ -458,7 +458,7 @@
for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
rabn = lbn + 1 + nra;
- if (!incore(vp, rabn)) {
+ if (!findblk(vp, rabn)) {
rabp = nfs_getcacheblk(vp, rabn, biosize, td);
if (!rabp)
return (EINTR);
@@ -642,7 +642,7 @@
(np->n_direofoffset == 0 ||
(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
!(np->n_flag & NQNFSNONCACHE) &&
- !incore(vp, lbn + 1)) {
+ !findblk(vp, lbn + 1)) {
rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
Index: vfs/nfs/nfs_serv.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_serv.c,v
retrieving revision 1.26
diff -u -r1.26 nfs_serv.c
--- vfs/nfs/nfs_serv.c 1 Mar 2006 00:21:58 -0000 1.26
+++ vfs/nfs/nfs_serv.c 3 Mar 2006 19:05:43 -0000
@@ -2057,6 +2057,17 @@
if (!error) {
nqsrv_getl(dvp, ND_WRITE);
nqsrv_getl(vp, ND_WRITE);
+ if (dvp) {
+ if (dvp == vp)
+ vrele(dvp);
+ else
+ vput(dvp);
+ dvp = NULL;
+ }
+ if (vp) {
+ vput(vp);
+ vp = NULL;
+ }
error = VOP_NREMOVE(nd.nl_ncp, nd.nl_cred);
}
}
@@ -3531,9 +3542,10 @@
* have to lock and write it. Otherwise the prior
* write is assumed to have already been committed.
*/
- if ((bp = gbincore(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) {
+ if ((bp = findblk(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
- BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL);
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
+ BUF_UNLOCK(bp);
continue; /* retry */
}
bremfree(bp);
Index: vfs/nfs/nfs_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/nfs/nfs_vnops.c,v
retrieving revision 1.48
diff -u -r1.48 nfs_vnops.c
--- vfs/nfs/nfs_vnops.c 2 Mar 2006 19:26:19 -0000 1.48
+++ vfs/nfs/nfs_vnops.c 3 Mar 2006 19:43:55 -0000
@@ -3316,7 +3316,6 @@
/*
* Undirty the bp. We will redirty it later if the I/O fails.
*/
-
crit_enter();
bundirty(bp);
bp->b_flags &= ~(B_READ|B_DONE|B_ERROR);
@@ -3331,15 +3330,13 @@
BUF_KERNPROC(bp);
vn_strategy(bp->b_vp, &bp->b_bio1);
- if((oldflags & B_ASYNC) == 0) {
+ if ((oldflags & B_ASYNC) == 0) {
int rtval = biowait(bp);
if (oldflags & B_DELWRI) {
- crit_enter();
- reassignbuf(bp, bp->b_vp);
- crit_exit();
+ KKASSERT(bp->b_xflags & BX_VNDIRTY);
+ /*reassignbuf(bp);*/
}
-
brelse(bp);
return (rtval);
}
Index: vfs/ufs/ffs_softdep.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_softdep.c,v
retrieving revision 1.34
diff -u -r1.34 ffs_softdep.c
--- vfs/ufs/ffs_softdep.c 17 Feb 2006 19:18:08 -0000 1.34
+++ vfs/ufs/ffs_softdep.c 3 Mar 2006 19:27:38 -0000
@@ -2266,8 +2266,11 @@
* Otherwise we have to read the blocks in from the disk.
*/
ACQUIRE_LOCK(&lk);
- if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
+ if ((bp = findblk(ip->i_devvp, dbn)) != NULL &&
(wk = LIST_FIRST(&bp->b_dep)) != NULL) {
+ /*
+ * bp must be ir_savebp, which is held locked for our use.
+ */
if (wk->wk_type != D_INDIRDEP ||
(indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
(indirdep->ir_state & GOINGAWAY) == 0) {
Index: vfs/ufs/ufs_bmap.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ufs_bmap.c,v
retrieving revision 1.7
diff -u -r1.7 ufs_bmap.c
--- vfs/ufs/ufs_bmap.c 17 Feb 2006 19:18:08 -0000 1.7
+++ vfs/ufs/ufs_bmap.c 3 Mar 2006 19:14:52 -0000
@@ -165,7 +165,7 @@
*/
metalbn = xap->in_lbn;
- if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn)
+ if ((daddr == 0 && !findblk(vp, metalbn)) || metalbn == bn)
break;
/*
* If we get here, we've either got the block in the cache