file weirdness / vnlru process issues

Matthew Dillon dillon at apollo.backplane.com
Mon Oct 18 13:55:17 PDT 2004


    I figured out what is going on from Pawel's kernel dump.  Thanks Pawel!

    Basically the issue is that the namecache topology holds the vnode for
    any directory containing 'leaf' records in the namecache.  Since there
    are always leaf nodes there are always freeable vnodes.

    However, negative cache hits create namecache records, which causes
    the governing directory vnode to be held.

    There is a bug in the flushing of negative cache entries.  The
    cache_enter() code checks to see if there are too many negative 
    cache entries, but it only tries to remove one, and this attempt
    does not always succeed.  The result is that the number of negative
    entries builds up continuously when doing things that cause a lot of
    negative cache hits (like buildworlds and such).  Additionally, the
    vnlru code does not try to clean out negative cache hits, so it can 
    get stuck in an endless loop.

    I have verified that the number of negative cache entries can grow
    way beyond the normally allowed limits.

    The solution is to have cache_enter() try a little harder to clean
    out negative cache entries.  I added some hysteresis to avoid
    constantly running in a 'following' cleaning mode.  The vnlru code now attempts
    to clean 10% of the negative entries on each loop.

    I am hoping that this patch will solve the problem.  Note that I have
    added some kernel printf's in this patch to print the hysteresis mode
    out.  It outputs 'H' when it hits the high limit and starts draining,
    and it prints 'L' when it hits the low limit and stops draining, and it
    outputs 'X' if the vnlru process has to run.  You can observe the
    current number of negative entries with 'sysctl vfs.cache.numneg'.

    Please try the patch.  I want to know if it fixes Pawel's lockups and
    if it helps or fixes the performance issues that Chris noticed after 
    the system has been running a while and especially the 'vlruwk' states.
    
					-Matt

Index: sys/namecache.h
===================================================================
RCS file: /cvs/src/sys/sys/namecache.h,v
retrieving revision 1.13
diff -u -r1.13 namecache.h
--- sys/namecache.h	12 Oct 2004 19:20:48 -0000	1.13
+++ sys/namecache.h	18 Oct 2004 20:35:08 -0000
@@ -164,6 +164,7 @@
 void	cache_purge(struct vnode *vp);
 void	cache_purgevfs (struct mount *mp);
 int	cache_get_nonblock(struct namecache *ncp);
+void	cache_cleanneg(int count);
 struct namecache *cache_get(struct namecache *ncp);
 struct namecache *cache_hold(struct namecache *ncp);
 void	cache_put(struct namecache *ncp);
Index: kern/vfs_cache.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_cache.c,v
retrieving revision 1.38
diff -u -r1.38 vfs_cache.c
--- kern/vfs_cache.c	12 Oct 2004 19:20:46 -0000	1.38
+++ kern/vfs_cache.c	18 Oct 2004 20:51:49 -0000
@@ -1275,6 +1275,7 @@
 	struct namecache *bpar;
 	struct nchashhead *nchpp;
 	u_int32_t hash;
+	static enum { CHI_LOW, CHI_HIGH } cache_hysteresis_state = CHI_LOW;
 
 	/*
 	 * If the directory has no namecache entry we bail.  This will result
@@ -1390,13 +1391,57 @@
 	cache_put(ncp);
 
 	/*
-	 * Don't cache too many negative hits
+	 * Don't cache too many negative hits.  We use hysteresis to reduce
+	 * the impact on the critical path.
 	 */
-	if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
+	switch(cache_hysteresis_state) {
+	case CHI_LOW:
+		if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
+			cache_cleanneg(10);
+			if (cache_hysteresis_state == CHI_LOW)
+				printf("H");
+			cache_hysteresis_state = CHI_HIGH;
+		}
+		break;
+	case CHI_HIGH:
+		if (numneg > MINNEG * 9 / 10 && 
+		    numneg * ncnegfactor * 9 / 10 > numcache
+		) {
+			cache_cleanneg(10);
+		} else {
+			cache_hysteresis_state = CHI_LOW;
+			printf("L");
+		}
+		break;
+	}
+}
+
+void
+cache_cleanneg(int count)
+{
+	struct namecache *ncp;
+
+	/*
+	 * Automode from the vnlru proc - clean out 10% of the negative cache
+	 * entries.
+	 */
+	if (count == 0) {
+		count = numneg / 10 + 1;
+		printf("X");
+	}
+
+	/*
+	 * Attempt to clean out the specified number of negative cache
+	 * entries.
+	 */
+	while (count) {
 		ncp = TAILQ_FIRST(&ncneglist);
 		KKASSERT(ncp != NULL);
+		TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
+		TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
 		if (cache_get_nonblock(ncp) == 0)
 			cache_zap(ncp);
+		--count;
 	}
 }
 
Index: kern/vfs_mount.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_mount.c,v
retrieving revision 1.1
diff -u -r1.1 vfs_mount.c
--- kern/vfs_mount.c	12 Oct 2004 19:20:46 -0000	1.1
+++ kern/vfs_mount.c	18 Oct 2004 20:34:52 -0000
@@ -470,6 +470,7 @@
 			continue;
 		}
 		done = 0;
+		cache_cleanneg(0);
 		lwkt_gettoken(&ilock, &mountlist_token);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {





More information about the Bugs mailing list