vnlur_proc lockups under high disk I/O
Matthew Dillon
dillon at apollo.backplane.com
Mon Feb 7 17:31:41 PST 2005
Gary, thanks for the cores! They helped me track the problem down.
It turns out to be a fairly simple bug in the kernel. Basically what
is going on here is that the vnode recycler is not allowed to recycle
'internal' directory nodes in the namecache topology. i.e. if the
path A/B/C/D is cached, then the recycler is not allowed to remove A, B,
or C. The recycler checks this condition by checking the vnode's
v_holdcnt.
Unfortunately, it turns out that buffer cache buffers also bump
v_holdcnt, not just the namecache. This was preventing the recycler
from recycling vnodes related to small files (e.g. the 'Root'
and 'Entries' found all over a cvs checkout of src) and causing your
lockup.
I believe this patch will solve the problem. Please try it out
and report back to the list. Note that I have added some debugging
printfs (which will not be in the final commit).
-Matt
Index: kern/vfs_mount.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_mount.c,v
retrieving revision 1.6
diff -u -r1.6 vfs_mount.c
--- kern/vfs_mount.c 4 Feb 2005 23:16:28 -0000 1.6
+++ kern/vfs_mount.c 8 Feb 2005 01:30:06 -0000
@@ -352,19 +352,70 @@
/*
* Return 0 if the vnode is not already on the free list, return 1 if the
* vnode, with some additional work could possibly be placed on the free list.
+ * We try to avoid recycling vnodes with lots of cached pages. The cache
+ * trigger level is calculated dynamically.
*/
static __inline int
-vmightfree(struct vnode *vp, int use_count, int page_count)
+vmightfree(struct vnode *vp, int page_count)
{
if (vp->v_flag & VFREE)
return (0);
- if (vp->v_usecount != use_count || vp->v_holdcnt)
+ if (vp->v_usecount != 0)
return (0);
if (vp->v_object && vp->v_object->resident_page_count >= page_count)
return (0);
return (1);
}
+/*
+ * The vnode was found to be possibly freeable and the caller has locked it
+ * (thus the usecount should be 1 now). Determine if the vnode is actually
+ * freeable, doing some cleanups in the process. Returns 1 if the vnode
+ * can be freed, 0 otherwise.
+ *
+ * Note that v_holdcnt may be non-zero because (A) this vnode is not a leaf
+ * in the namecache topology and (B) this vnode has buffer cache bufs.
+ * We cannot remove vnodes with non-leaf namecache associations. We do a
+ * tentitive leaf check prior to attempting to flush out any buffers but the
+ * 'real' test when all is said in done is that v_holdcnt must become 0 for
+ * the vnode to be freeable.
+ *
+ * We could theoretically just unconditionally flush when v_holdcnt != 0,
+ * but flushing data associated with non-leaf nodes (which are always
+ * directories), just throws it away for no benefit. It is the buffer
+ * cache's responsibility to choose buffers to recycle from the cached
+ * data point of view.
+ */
+static int
+visleaf(struct vnode *vp)
+{
+ struct namecache *ncp;
+
+ TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
+ if (!TAILQ_EMPTY(&ncp->nc_list))
+ return(0);
+ }
+ return(1);
+}
+
+static int
+vtrytomakefreeable(struct vnode *vp, int page_count)
+{
+ if (vp->v_flag & VFREE)
+ return (0);
+ if (vp->v_usecount != 1)
+ return (0);
+ if (vp->v_object && vp->v_object->resident_page_count >= page_count)
+ return (0);
+ if (vp->v_holdcnt && visleaf(vp)) {
+ vinvalbuf(vp, V_SAVE, NULL, 0, 0);
+ printf((vp->v_holdcnt ? "vrecycle: vp %p failed: %s\n" :
+ "vrecycle: vp %p succeeded: %s\n"), vp,
+ (TAILQ_FIRST(&vp->v_namecache) ?
+ TAILQ_FIRST(&vp->v_namecache)->nc_name : "?"));
+ }
+ return(vp->v_usecount == 1 && vp->v_holdcnt == 0);
+}
static int
vlrureclaim(struct mount *mp)
@@ -401,7 +452,7 @@
*/
if (vp->v_type == VNON || /* XXX */
vp->v_type == VBAD || /* XXX */
- !vmightfree(vp, 0, trigger) /* critical path opt */
+ !vmightfree(vp, trigger) /* critical path opt */
) {
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist,vp, v_nmntvnodes);
@@ -436,7 +487,7 @@
vp->v_type == VBAD || /* XXX */
(vp->v_flag & VRECLAIMED) ||
vp->v_mount != mp ||
- !vmightfree(vp, 1, trigger) /* critical path opt */
+ !vtrytomakefreeable(vp, trigger) /* critical path opt */
) {
if (vp->v_mount == mp) {
TAILQ_REMOVE(&mp->mnt_nvnodelist,
More information about the Bugs
mailing list