namecache coherency 3rd turn

Csaba Henk csaba.henk at creo.hu
Wed Mar 29 01:58:00 PST 2006


Hi!

This time the following changes were made:

 - Namecache locks are now held in shadowinfo structures, which are
   either embedded into the namecache structure itself (singleton
   groups) or fetched from a pool (non-singleton groups).

 - Shadow group data structure: I ended up with what Matt suggested
   earlier: shadow group entries form a circular list. The list is
   doubly linked to support O(1) node deletion, and augmented with a
   height counter to retain tree semantics (see the first sketch below).

 - cache_setunresolved(ncp) blows up the shadow subtree over ncp,
   tearing apart topologies that would otherwise be left broken.

 - The deadlock-avoidance techniques of the previous patch have been
   kept.

 - The "struct namecache *nc_shadowed" field of namecache structures
   still exists, but is nowhere referred by cache code. Now it's sort
   of a private field, almost like the "void *" fields in vnodes,
   specinfo structures, etc. It could be easily ditched and replaced by
   per-mount hash for those fs-es who would use it. I just kept it as
   is -- I didn't want to do anything about it without having a
   consensus.

 - Nullfs has been adjusted to this API.
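
To illustrate the group representation, here is a minimal userland
sketch (made-up names, nothing of it is in the patch): members sit on
the circular list so that a subtree forms a contiguous arc, and each
member carries its depth within the group. Walking a subtree thus means
following the next pointers until we either wrap around or the height
drops back to the root's level -- which is exactly what the SG_SUBTREE
mode of cache_group_walk() does in the patch:

    /*
     * Minimal model of a shadow group: a circular, doubly linked list
     * whose members know their depth ("height") within the group.
     * Singleton groups are represented by NULL link pointers.
     */
    struct member {
            struct member *prev, *next;
            int height;
    };

    /* Visit every member of the subtree rooted at 'root'. */
    static void
    subtree_walk(struct member *root, void (*visit)(struct member *))
    {
            struct member *m = root;

            for (;;) {
                    struct member *next = m->next;

                    visit(m);
                    if (next == NULL || next == root ||
                        next->height <= root->height)
                            break;
                    m = next;
            }
    }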
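
And for the record, here is a rough sketch of the per-mount hash that
could replace nc_shadowed (hypothetical names and fields, nothing of it
is in the patch): the layered filesystem would keep its own
upper-to-lower mapping keyed by the upper entry's address, instead of
burdening struct namecache:

    struct shadow_map_entry {
            LIST_ENTRY(shadow_map_entry) sme_entry;
            struct namecache *sme_upper;    /* key: upper layer entry */
            struct namecache *sme_lower;    /* value: shadowed entry */
    };

    LIST_HEAD(shadow_map_head, shadow_map_entry);

    struct shadow_map {
            struct shadow_map_head *sm_tbl; /* e.g. from hashinit() */
            u_long sm_mask;
    };

    #define SMAP_HASH(smp, ncp) \
            (&(smp)->sm_tbl[((uintptr_t)(ncp) >> 8) & (smp)->sm_mask])

    static struct namecache *
    shadow_map_lookup(struct shadow_map *smp, struct namecache *uncp)
    {
            struct shadow_map_entry *sme;

            LIST_FOREACH(sme, SMAP_HASH(smp, uncp), sme_entry) {
                    if (sme->sme_upper == uncp)
                            return (sme->sme_lower);
            }
            return (NULL);
    }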

Regards,
Csaba
diff -r e1d135a8f666 sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c	Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/kern/vfs_cache.c	Wed Mar 29 10:23:26 2006 +0200
@@ -108,6 +108,10 @@
 #define NCHHASH(hash)	(&nchashtbl[(hash) & nchash])
 #define MINNEG		1024
 
+/* Modes for shadow group traversal */
+#define SG_ALL     0 /* traverse whole group */
+#define SG_SUBTREE 1 /* traverse only subtree */
+
 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
@@ -170,6 +174,15 @@ static u_long numneghits; STATNODE(CTLFL
 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
 
 struct nchstats nchstats[SMP_MAXCPU];
+
+static STAILQ_HEAD(, shadowinfo) shadowinfo_freeq;
+static u_long numshadowinfo = 0;
+STATNODE(CTLFLAG_RD, numshadowinfo, &numshadowinfo);
+static long maxnumshadowinfo = -1;
+SYSCTL_LONG(_vfs_cache, OID_AUTO, maxnumshadowinfo, CTLFLAG_RW,
+            &maxnumshadowinfo, 0, "");
+MALLOC_DEFINE(M_SHADOWINFO, "shadowinfo", "VFS name cache shadowinfo");
+
 /*
  * Export VFS cache effectiveness statistics to user-land.
  *
@@ -196,6 +209,62 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchsta
 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 
+/* XXX stubs for later MPSAFE work */
+#define shadowinfo_freeq_lock()
+#define shadowinfo_freeq_unlock()
+
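+/*
+ * Obtain a shadowinfo: reuse one from the freelist if possible,
+ * otherwise allocate a fresh zeroed one.
+ */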
+static struct shadowinfo *
+shadowinfo_fetch(void)
+{
+	struct shadowinfo *shinf = STAILQ_FIRST(&shadowinfo_freeq);
+
+	if (! shinf)
+		goto alloc;
+
+	shadowinfo_freeq_lock();
+	if ((shinf = STAILQ_FIRST(&shadowinfo_freeq)))
+		STAILQ_REMOVE_HEAD(&shadowinfo_freeq, sh_entry);
+	shadowinfo_freeq_unlock();
+
+	if (shinf)
+		return (shinf);
+
+alloc:
+	shinf = malloc(sizeof(*shinf), M_SHADOWINFO, M_WAITOK|M_ZERO);
+	numshadowinfo++;
+
+	return (shinf);
+}
+
+static __inline
+struct shadowinfo*
+shadowinfo_ref(struct shadowinfo *shinf)
+{
+	shinf->sh_refs++;
+
+	return (shinf);
+}
+
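+/*
+ * Drop a reference on a shadowinfo.  The last reference either frees it
+ * (if we are above the administrative limit) or resets the lock state
+ * and returns it to the freelist.
+ */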
+static void 
+shadowinfo_put(struct shadowinfo *shinf)
+{
+	if (--shinf->sh_refs > 0)
+		return;
+
+	if (maxnumshadowinfo >= 0 && numshadowinfo > maxnumshadowinfo) {
+		free(shinf, M_SHADOWINFO);
+		numshadowinfo--;
+		return;
+	}
+
+	shinf->sh_exlocks = 0;
+	shinf->sh_locktd = NULL;
+
+	shadowinfo_freeq_lock();
+	STAILQ_INSERT_TAIL(&shadowinfo_freeq, shinf, sh_entry);
+	shadowinfo_freeq_unlock();
+}
+
 static void cache_zap(struct namecache *ncp);
 
 /*
@@ -225,7 +294,7 @@ _cache_drop(struct namecache *ncp)
 	    (ncp->nc_flag & NCF_UNRESOLVED) && 
 	    TAILQ_EMPTY(&ncp->nc_list)
 	) {
-		KKASSERT(ncp->nc_exlocks == 0);
+		KKASSERT(ncp->nc_shadowinfo->sh_exlocks == 0);
 		cache_lock(ncp);
 		cache_zap(ncp);
 	} else {
@@ -295,6 +364,10 @@ cache_alloc(int nlen)
 	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
 	ncp->nc_refs = 1;
 	ncp->nc_fsmid = 1;
+	ncp->nc_shadowinfo = &ncp->nc_shadowinfo_internal;
+	ncp->nc_shadowinfo_internal.sh_refs = 2;
+	ncp->nc_shadow_prev = NULL;
+	ncp->nc_shadow_next = NULL;
 	TAILQ_INIT(&ncp->nc_list);
 	cache_lock(ncp);
 	return(ncp);
@@ -303,7 +376,7 @@ static void
 static void
 cache_free(struct namecache *ncp)
 {
-	KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1);
+	KKASSERT(ncp->nc_refs == 1 && ncp->nc_shadowinfo->sh_exlocks == 1);
 	if (ncp->nc_name)
 		free(ncp->nc_name, M_VFSCACHE);
 	free(ncp, M_VFSCACHE);
@@ -322,6 +395,188 @@ cache_drop(struct namecache *ncp)
 cache_drop(struct namecache *ncp)
 {
 	_cache_drop(ncp);
+}
+
+/*
+ * Iterate an "updater" function over a shadow group.
+ * All-group and subtree-only traversals are supported.
+ */
+static struct namecache *
+cache_group_walk(struct namecache *ncp,
+                 int (*updater)(struct namecache *xncp, void *param),
+                 int flags, void *param)
+{
+	struct namecache *xncp = ncp, *yncp;
+
+	for (;;) {
+		yncp = xncp->nc_shadow_next;
+		if (updater(xncp, param))
+			break;
+		if (! yncp || yncp == ncp ||
+		    (flags & SG_SUBTREE &&
+		     yncp->nc_shadowheight <= ncp->nc_shadowheight))
+			break;
+		xncp = yncp;
+	}
+
+	return(xncp);
+}
+
+struct migrate_param {
+	int heightdelta;
+	int exlocks;
+	struct shadowinfo *shadowinfo;
+};
+
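+/*
+ * Updater for cache_group_walk(): migrate an entry to the shadowinfo
+ * passed in the migrate_param (or back to the entry's embedded one if
+ * that is NULL), carrying over the lock owner and adjusting the entry's
+ * shadow height and, if requested, the lock count.
+ */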
+static int 
+migrate_updater(struct namecache *ncp, void *param)
+{
+	struct migrate_param *mpm = param;
+	struct shadowinfo *shinf = mpm->shadowinfo;
+	struct shadowinfo *oldshinf = ncp->nc_shadowinfo;
+
+	if (! shinf)
+		shinf = &ncp->nc_shadowinfo_internal;
+
+	if (shinf == oldshinf)
+		goto out;
+
+	shinf->sh_locktd = oldshinf->sh_locktd;
+
+	ncp->nc_shadowinfo = shadowinfo_ref(shinf);
+	shadowinfo_put(oldshinf);
+
+out:
+	ncp->nc_shadowheight += mpm->heightdelta;
+	if (mpm->exlocks >= 0)
+		shinf->sh_exlocks = mpm->exlocks;
+
+	return (0);
+}
+
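+/*
+ * Splice the (possibly singleton) shadow list of ncp into that of sncp,
+ * right after sncp.
+ */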
+static __inline
+void
+cache_shadow_link(struct namecache *sncp, struct namecache *ncp)
+{
+	struct namecache *pncp;
+	struct namecache *nsncp;
+
+	pncp = ncp->nc_shadow_prev ?: ncp;
+	nsncp = sncp->nc_shadow_next ?: sncp;
+
+	pncp->nc_shadow_next = nsncp;
+	nsncp->nc_shadow_prev = pncp;
+
+	sncp->nc_shadow_next = ncp;
+	ncp->nc_shadow_prev = sncp;
+}
+
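+/*
+ * Remove ncp from its shadow list, collapsing a two-element group back
+ * into singletons (NULL link pointers).
+ */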
+static __inline
+void
+cache_shadow_unlink(struct namecache *ncp)
+{
+	if (! ncp->nc_shadow_next)
+		return;
+
+	KKASSERT(ncp->nc_shadow_prev);
+
+	if (ncp->nc_shadow_prev == ncp->nc_shadow_next) {
+		ncp->nc_shadow_prev->nc_shadow_next = NULL;
+		ncp->nc_shadow_next->nc_shadow_prev = NULL;
+	} else {
+		ncp->nc_shadow_prev->nc_shadow_next = ncp->nc_shadow_next;
+		ncp->nc_shadow_next->nc_shadow_prev = ncp->nc_shadow_prev;
+	}
+
+	ncp->nc_shadow_prev = ncp->nc_shadow_next = NULL;
+}
+
+/*
+ * Join ncp into the shadow group of sncp.
+ * 
+ * ncp must be unlocked on entry, while sncp must be locked on entry.
+ *
+ * The routine will fail and return ELOOP if the intended shadowing association
+ * doesn't make sense (currently this boils down to ncp being the same as
+ * sncp).
+ * It will fail with EEXIST if ncp gets resolved or acquires a shadow
+ * association from elsewhere during the attach attempt (which is possible
+ * because ncp is unlocked).
+ *
+ * - On success ncp will be a representative of the joint shadow group, which
+ *   then will be locked.
+ * - On failure the namecache entries will exist separately just as they did
+ *   before; both entries will be locked.
+ */
+int
+cache_shadow_attach(struct namecache *ncp, struct namecache *sncp)
+{
+	struct migrate_param mpm;
+
+	if (ncp == sncp)
+		return(ELOOP);
+
+	KKASSERT(ncp->nc_shadowinfo->sh_locktd != curthread);
+	KKASSERT(sncp->nc_shadowinfo->sh_locktd == curthread);
+
+	cache_lock_two(ncp, sncp);
+
+	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 || ncp->nc_shadowheight != 0)
+		return(EEXIST);
+
+	if (sncp->nc_shadowinfo == &sncp->nc_shadowinfo_internal) {
+		mpm.heightdelta = 0;
+		mpm.shadowinfo = shadowinfo_fetch();
+		mpm.exlocks = sncp->nc_shadowinfo->sh_exlocks;
+		migrate_updater(sncp, &mpm);
+	}
+
+	mpm.heightdelta = sncp->nc_shadowheight + 1;
+	mpm.shadowinfo = sncp->nc_shadowinfo;
+	mpm.exlocks = -1;
+
+	cache_group_walk(ncp, &migrate_updater, SG_ALL, &mpm);
+	cache_shadow_link(sncp, ncp);
+	
+	return(0);
+}
+
+/*
+ * Take a namecache entry out of its shadow group.
+ *
+ * The shadow group must be locked upon entry.
+ *
+ * On return both the entry and its former group remain locked.
+ */
+void
+cache_shadow_detach(struct namecache *ncp)
+{
+	struct namecache *pncp, *nncp;
+	struct migrate_param mpm;
+
+	mpm.shadowinfo = NULL;
+again:
+	mpm.heightdelta = -ncp->nc_shadowheight;
+	mpm.exlocks = ncp->nc_shadowinfo->sh_exlocks;
+	pncp = ncp->nc_shadow_prev;
+	nncp = ncp->nc_shadow_next;
+
+	migrate_updater(ncp, &mpm);
+	cache_shadow_unlink(ncp);
+
+	if (nncp && nncp == pncp) {
+		ncp = nncp;
+		goto again;
+	}
+}
+
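+/*
+ * Updater for cache_group_walk(): vhold()/vdrop() the vnode of a group
+ * member according to the sign of the int passed as param.  Used to keep
+ * the vnodes of all group members held while the group is locked.
+ */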
+static int
+vref_updater(struct namecache *ncp, void *param)
+{
+	if (ncp->nc_vp) {
+		if (*(int *)param > 0)
+			vhold(ncp->nc_vp);
+		else
+			vdrop(ncp->nc_vp);
+	}
+
+	return(0);
 }
 
 /*
@@ -349,15 +604,21 @@ cache_lock(struct namecache *ncp)
 {
 	thread_t td;
 	int didwarn;
+	struct shadowinfo *shinf;
 
 	KKASSERT(ncp->nc_refs != 0);
 	didwarn = 0;
 	td = curthread;
 
 	for (;;) {
-		if (ncp->nc_exlocks == 0) {
-			ncp->nc_exlocks = 1;
-			ncp->nc_locktd = td;
+		shinf = ncp->nc_shadowinfo;
+		KKASSERT(shinf);
+		KKASSERT(shinf->sh_refs != 0);
+		if (shinf->sh_exlocks == 0) {
+			int ref = 1;
+
+			shinf->sh_exlocks = 1;
+			shinf->sh_locktd = td;
 			/* 
 			 * The vp associated with a locked ncp must be held
 			 * to prevent it from being recycled (which would
@@ -365,16 +626,15 @@ cache_lock(struct namecache *ncp)
 			 *
 			 * XXX loop on race for later MPSAFE work.
 			 */
-			if (ncp->nc_vp)
-				vhold(ncp->nc_vp);
+			cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
 			break;
 		}
-		if (ncp->nc_locktd == td) {
-			++ncp->nc_exlocks;
+		if (shinf->sh_locktd == td) {
+			++shinf->sh_exlocks;
 			break;
 		}
-		ncp->nc_flag |= NCF_LOCKREQ;
-		if (tsleep(ncp, 0, "clock", nclockwarn) == EWOULDBLOCK) {
+		shinf->sh_lockreq = 1;
+		if (tsleep(shinf, 0, "clock", nclockwarn) == EWOULDBLOCK) {
 			if (didwarn)
 				continue;
 			didwarn = 1;
@@ -398,12 +658,17 @@ cache_lock_nonblock(struct namecache *nc
 cache_lock_nonblock(struct namecache *ncp)
 {
 	thread_t td;
+	struct shadowinfo *shinf = ncp->nc_shadowinfo;
 
 	KKASSERT(ncp->nc_refs != 0);
+	KKASSERT(shinf);
+	KKASSERT(shinf->sh_refs != 0);
 	td = curthread;
-	if (ncp->nc_exlocks == 0) {
-		ncp->nc_exlocks = 1;
-		ncp->nc_locktd = td;
+	if (shinf->sh_exlocks == 0) {
+		int ref = 1;
+
+		shinf->sh_exlocks = 1;
+		shinf->sh_locktd = td;
 		/* 
 		 * The vp associated with a locked ncp must be held
 		 * to prevent it from being recycled (which would
@@ -411,8 +676,7 @@ cache_lock_nonblock(struct namecache *nc
 		 *
 		 * XXX loop on race for later MPSAFE work.
 		 */
-		if (ncp->nc_vp)
-			vhold(ncp->nc_vp);
+		cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
 		return(0);
 	} else {
 		return(EWOULDBLOCK);
@@ -423,17 +687,45 @@ cache_unlock(struct namecache *ncp)
 cache_unlock(struct namecache *ncp)
 {
 	thread_t td = curthread;
+	struct shadowinfo *shinf = ncp->nc_shadowinfo;
 
 	KKASSERT(ncp->nc_refs > 0);
-	KKASSERT(ncp->nc_exlocks > 0);
-	KKASSERT(ncp->nc_locktd == td);
-	if (--ncp->nc_exlocks == 0) {
-		if (ncp->nc_vp)
-			vdrop(ncp->nc_vp);
-		ncp->nc_locktd = NULL;
-		if (ncp->nc_flag & NCF_LOCKREQ) {
-			ncp->nc_flag &= ~NCF_LOCKREQ;
-			wakeup(ncp);
+	KKASSERT(shinf);
+	KKASSERT(shinf->sh_refs > 0);
+	KKASSERT(shinf->sh_exlocks > 0);
+	KKASSERT(shinf->sh_locktd == td);
+	if (shinf->sh_exlocks == 1) {
+		int ref = -1;
+		cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
+	}
+	if (--shinf->sh_exlocks == 0) {
+		shinf->sh_locktd = NULL;
+		if (shinf->sh_lockreq) {
+			shinf->sh_lockreq = 0;
+			wakeup(shinf);
+		}
+	}
+}
+
+/*
+ * Obtain locks on both uncp and lncp.
+ *
+ * On entry, uncp is assumed to be unlocked, and lncp is assumed to be
+ * locked.
+ *
+ * After this function returns, the caller is responsible for checking
+ * the state of lncp, which might have been unlocked temporarily.
+ */
+void
+cache_lock_two(struct namecache *uncp, struct namecache *lncp)
+{
+	if (cache_lock_nonblock(uncp) != 0) {
+		if (uncp > lncp)
+			cache_lock(uncp);
+		else {
+			cache_unlock(lncp);
+			cache_lock(uncp);
+			cache_lock(lncp);
 		}
 	}
 }
@@ -453,7 +745,8 @@ cache_get_nonblock(struct namecache *ncp
 cache_get_nonblock(struct namecache *ncp)
 {
 	/* XXX MP */
-	if (ncp->nc_exlocks == 0 || ncp->nc_locktd == curthread) {
+	if (ncp->nc_shadowinfo->sh_exlocks == 0 ||
+	    ncp->nc_shadowinfo->sh_locktd == curthread) {
 		_cache_hold(ncp);
 		cache_lock(ncp);
 		return(0);
@@ -487,7 +780,7 @@ cache_setvp(struct namecache *ncp, struc
 		if (!TAILQ_EMPTY(&ncp->nc_list))
 			vhold(vp);
 		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
-		if (ncp->nc_exlocks)
+		if (ncp->nc_shadowinfo->sh_exlocks)
 			vhold(vp);
 
 		/*
@@ -521,6 +814,8 @@ cache_settimeout(struct namecache *ncp, 
 		ncp->nc_timeout = 1;
 }
 
+static int unresolver_updater(struct namecache *ncp, void *param); 
+
 /*
  * Disassociate the vnode or negative-cache association and mark a
  * namecache entry as unresolved again.  Note that the ncp is still
@@ -541,7 +836,25 @@ void
 void
 cache_setunresolved(struct namecache *ncp)
 {
+	struct namecache *nncp;
+
+	cache_group_walk(ncp, &unresolver_updater, SG_SUBTREE, ncp);
+
+	nncp = ncp->nc_shadow_next;
+	if (nncp)
+		cache_hold(nncp);
+	unresolver_updater(ncp, NULL);
+	if (nncp)
+		cache_put(nncp);
+}
+
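+/*
+ * Updater for cache_group_walk(): make a group member unresolved and
+ * detach it from its shadow group (the entry passed as param is
+ * skipped).  Members left with zero references get a hold/put cycle so
+ * that they can be reclaimed.
+ */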
+static int
+unresolver_updater(struct namecache *ncp, void *param) 
+{
 	struct vnode *vp;
+
+	if (ncp == param)
+		return(0);
 
 	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 		ncp->nc_flag |= NCF_UNRESOLVED;
@@ -563,13 +876,23 @@ cache_setunresolved(struct namecache *nc
 			 */
 			if (!TAILQ_EMPTY(&ncp->nc_list))
 				vdrop(vp);
-			if (ncp->nc_exlocks)
+			if (ncp->nc_shadowinfo->sh_exlocks)
 				vdrop(vp);
 		} else {
 			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
 			--numneg;
 		}
-	}
+
+		cache_shadow_detach(ncp);
+	}
+
+	if (ncp->nc_refs == 0) {
+		cache_hold(ncp);
+		cache_put(ncp);
+	}
+
+	return(0);
 }
 
 /*
@@ -619,7 +942,7 @@ cache_inval(struct namecache *ncp, int f
 	struct namecache *nextkid;
 	int rcnt = 0;
 
-	KKASSERT(ncp->nc_exlocks);
+	KKASSERT(ncp->nc_shadowinfo->sh_exlocks);
 
 	cache_setunresolved(ncp);
 	if (flags & CINV_DESTROY)
@@ -715,6 +1038,7 @@ restart:
  * XXX the disconnection could pose a problem, check code paths to make
  * sure any code that blocks can handle the parent being changed out from
  * under it.  Maybe we should lock the children (watch out for deadlocks) ?
+ * [UPDATE: attempt made to lock children, see in situ explanation]
  *
  * After we return the caller has the option of calling cache_setvp() if
  * the vnode of the new target ncp is known.
@@ -726,26 +1050,62 @@ cache_rename(struct namecache *fncp, str
 cache_rename(struct namecache *fncp, struct namecache *tncp)
 {
 	struct namecache *scan;
-	int didwarn = 0;
-
+	int didwarn[] = { 0, 0 };
+
+	/* XXX should we rather assert non-equality here? */
+	if (fncp == tncp)
+		return;
+
+again:
 	cache_setunresolved(fncp);
 	cache_setunresolved(tncp);
+
+	/*
+	 * It seems we need to unlock fncp before calling cache_inval():
+	 * cache_inval() does a lot of locking, unlocking and relocking
+	 * (of tncp and its children), therefore keeping fncp locked
+	 * might lead to deadlock...
+	 */
+	cache_unlock(fncp);
+	
 	while (cache_inval(tncp, CINV_CHILDREN) != 0) {
-		if (didwarn++ % 10 == 0) {
-			printf("Warning: cache_rename: race during "
+		if (didwarn[0]++ % 10 == 0) {
+			printf("Warning: cache_rename: race #1 during "
 				"rename %s->%s\n",
 				fncp->nc_name, tncp->nc_name);
 		}
 		tsleep(tncp, 0, "mvrace", hz / 10);
 		cache_setunresolved(tncp);
 	}
+
+	cache_unlock(tncp);
+	cache_lock(fncp);
+
 	while ((scan = TAILQ_FIRST(&fncp->nc_list)) != NULL) {
-		cache_hold(scan);
+		cache_unlock(fncp);
+		/*
+		 * We have to lock fncp's kids in order to unresolve
+		 * their shadow kids...
+		 */
+		cache_get(scan);
 		cache_unlink_parent(scan);
+		cache_group_walk(scan, &unresolver_updater, SG_SUBTREE, scan);
 		cache_link_parent(scan, tncp);
 		if (scan->nc_flag & NCF_HASHED)
 			cache_rehash(scan);
-		cache_drop(scan);
+		cache_put(scan);
+		cache_lock(fncp);
+	}
+
+	cache_lock_two(tncp, fncp);
+
+	if ((fncp->nc_flag & tncp->nc_flag & NCF_UNRESOLVED) == 0) {
+		if (didwarn[1]++ % 10 == 0) {
+			printf("Warning: cache_rename: race #2 during "
+				"rename %s->%s\n",
+				fncp->nc_name, tncp->nc_name);
+		}
+		goto again;
 	}
 }
 
@@ -1321,7 +1681,7 @@ cache_zap(struct namecache *ncp)
 			cache_drop(ncp);
 			return;
 		}
-		KKASSERT(par->nc_exlocks == 0);
+		KKASSERT(par->nc_shadowinfo->sh_exlocks == 0);
 		cache_lock(ncp);
 	}
 done:
@@ -1417,7 +1777,7 @@ restart:
 		if (ncp->nc_timeout && 
 		    (int)(ncp->nc_timeout - ticks) < 0 &&
 		    (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
-		    ncp->nc_exlocks == 0
+		    ncp->nc_shadowinfo->sh_exlocks == 0
 		) {
 			cache_zap(cache_get(ncp));
 			goto restart;
@@ -1738,6 +2098,7 @@ nchinit(void)
 		gd->gd_nchstats = &nchstats[i];
 	}
 	TAILQ_INIT(&ncneglist);
+	STAILQ_INIT(&shadowinfo_freeq);
 	nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
 	nclockwarn = 1 * hz;
 }
diff -r e1d135a8f666 sys/sys/namecache.h
--- a/sys/sys/namecache.h	Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/sys/namecache.h	Wed Mar 29 10:23:26 2006 +0200
@@ -70,7 +70,20 @@
 
 struct vnode;
 
+/*
+ * Auxiliary structure for locking namecache entries,
+ * either on their own or grouped into "shadow groups".
+ */
+struct shadowinfo {
+    STAILQ_ENTRY(shadowinfo) sh_entry;  /* entry for free list */
+    int		   sh_exlocks;		/* namespace locking */
+    struct thread *sh_locktd;		/* namespace locking */
+    int            sh_refs;		/* reference count */
+    uint8_t        sh_lockreq :1;	/* lock request pending */
+};
+
 TAILQ_HEAD(namecache_list, namecache);
+LIST_HEAD(namecache_shadow_list, namecache);
 
 /*
  * The namecache structure is used to manage the filesystem namespace.  Most
@@ -110,8 +123,12 @@ struct namecache {
     char	*nc_name;		/* Separately allocated seg name */
     int		nc_error;
     int		nc_timeout;		/* compared against ticks, or 0 */
-    int		nc_exlocks;		/* namespace locking */
-    struct thread *nc_locktd;		/* namespace locking */
+    struct shadowinfo *nc_shadowinfo;         /* namespace locking */
+    struct shadowinfo nc_shadowinfo_internal; /* private locking information */
+    struct namecache *nc_shadow_prev;   /* previous entry in shadow group */
+    struct namecache *nc_shadow_next;   /* next entry in shadow group */
+    int         nc_shadowheight;        /* measure within shadow group */
+    struct namecache *nc_shadowed;	/* lower layer entry in layered fs */
     struct mount *nc_mount;		/* associated mount for vopops */
     int64_t	nc_fsmid;		/* filesystem modified id */
 };
@@ -127,7 +144,7 @@ typedef struct namecache *namecache_t;
 #define NCF_MOUNTPT	0x0008	/* mount point */
 #define NCF_ROOT	0x0010	/* namecache root (static) */
 #define NCF_HASHED	0x0020	/* namecache entry in hash table */
-#define NCF_LOCKREQ	0x0040
+#define NCF_UNUSED040	0x0040
 #define NCF_UNUSED080	0x0080
 #define NCF_ISSYMLINK	0x0100	/* represents a symlink */
 #define NCF_ISDIR	0x0200	/* represents a directory */
@@ -150,6 +167,9 @@ void	cache_lock(struct namecache *ncp);
 void	cache_lock(struct namecache *ncp);
 int	cache_lock_nonblock(struct namecache *ncp);
 void	cache_unlock(struct namecache *ncp);
+void	cache_lock_two(struct namecache *uncp, struct namecache *lncp);
+int	cache_shadow_attach(struct namecache *ncp, struct namecache *sncp);
+void	cache_shadow_detach(struct namecache *ncp);
 void	cache_setvp(struct namecache *ncp, struct vnode *vp);
 void	cache_settimeout(struct namecache *ncp, int nticks);
 void	cache_setunresolved(struct namecache *ncp);
diff -r e1d135a8f666 sys/vfs/nullfs/null.h
--- a/sys/vfs/nullfs/null.h	Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null.h	Wed Mar 29 10:29:08 2006 +0200
@@ -44,17 +44,26 @@ struct null_args {
 };
 
 struct null_mount {
-	struct mount	*nullm_vfs;
-	struct vnode	*nullm_rootvp;	/* Reference to root null_node */
+	struct namecache *nullm_ncp;
 };
 
 #ifdef _KERNEL
+
 #define	MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
-
 #ifdef NULLFS_DEBUG
-#define NULLFSDEBUG(format, args...) printf(format ,## args)
+#define NULLFSDEBUG(format, args...) \
+	printf(" [nullfs] %s:%d: " format, __func__, __LINE__, ## args)
+#define	NULLNCDEBUG(ncp)							\
+        NULLFSDEBUG(#ncp " %p: name %s, refs %d, exlocks %d, nc_flag 0x%x, "	\
+	            "nc_mount %p, nc_shadowed %p, nc_shadowinfo %p, "		\
+	            "nc_shadowheight %d, nc_vp %p\n",				\
+	            (ncp), (ncp)->nc_name, (ncp)->nc_refs,			\
+	            (ncp)->nc_shadowinfo->sh_exlocks, (ncp)->nc_flag,		\
+	            (ncp)->nc_mount, (ncp)->nc_shadowed,			\
+	            (ncp)->nc_shadowinfo, (ncp)->nc_shadowheight, (ncp)->nc_vp)
 #else
 #define NULLFSDEBUG(format, args...)
+#define NULLNCDEBUG(ncp)
 #endif /* NULLFS_DEBUG */
 
 #endif /* _KERNEL */
diff -r e1d135a8f666 sys/vfs/nullfs/null_vfsops.c
--- a/sys/vfs/nullfs/null_vfsops.c	Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null_vfsops.c	Wed Mar 29 10:29:08 2006 +0200
@@ -53,6 +53,7 @@
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/nlookup.h>
+#include <sys/namecache.h>
 #include "null.h"
 
 extern struct vnodeopv_entry_desc null_vnodeop_entries[];
@@ -80,12 +81,10 @@ nullfs_mount(struct mount *mp, char *pat
 {
 	int error = 0;
 	struct null_args args;
-	struct vnode *rootvp;
 	struct null_mount *xmp;
 	u_int size;
-	struct nlookupdata nd;
-
-	NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
+
+	NULLFSDEBUG("mp %p\n", (void *)mp);
 
 	/*
 	 * Update is a no-op
@@ -98,118 +97,79 @@ nullfs_mount(struct mount *mp, char *pat
 	 * Get argument
 	 */
 	error = copyin(data, (caddr_t)&args, sizeof(struct null_args));
-	if (error)
+	if (error)
+		return (error);
+
+	xmp = malloc(sizeof(*xmp), M_NULLFSMNT, M_WAITOK | M_ZERO);
+
+	NULLFSDEBUG("nlookup %s\n", args.target);
+
+	xmp->nullm_ncp = nlookup_simple(args.target,
+		              UIO_USERSPACE, NLC_FOLLOW, &error);
+
+	if (! xmp->nullm_ncp) {
+		free(xmp, M_NULLFSMNT);
 		return (error);
-
-	/*
-	 * Find lower node
-	 */
-	rootvp = NULL;
-	error = nlookup_init(&nd, args.target, UIO_USERSPACE, NLC_FOLLOW);
-	if (error == 0)
-		error = nlookup(&nd);
-	if (error == 0) {
-		error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, 
-					&rootvp);
-	}
-
-	xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
-				M_NULLFSMNT, M_WAITOK);	/* XXX */
-
-	/*
-	 * Save reference to underlying FS
-	 */
-        /*
-         * As lite stacking enters the scene, the old way of doing this
-	 * -- via the vnode -- is not good enough anymore...
-	 */
-	xmp->nullm_vfs = nd.nl_ncp->nc_mount;
-	nlookup_done(&nd);
-
-	vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, 
-			 null_vnodeop_entries, 0);
-
-	VOP_UNLOCK(rootvp, 0, td);
-
-	/*
-	 * Keep a held reference to the root vnode.
-	 * It is vrele'd in nullfs_unmount.
-	 */
-	xmp->nullm_rootvp = rootvp;
-	/*
-	 * XXX What's the proper safety condition for querying
-	 * the underlying mount? Is this flag tuning necessary
-	 * at all?
-	 */
-	if (xmp->nullm_vfs->mnt_flag & MNT_LOCAL)
+	}
+
+	cache_unlock(xmp->nullm_ncp);
+
+	vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, null_vnodeop_entries, 0);
+
+	if (xmp->nullm_ncp->nc_mount->mnt_flag & MNT_LOCAL)
 		mp->mnt_flag |= MNT_LOCAL;
-	mp->mnt_data = (qaddr_t) xmp;
+	mp->mnt_data = (void *)xmp;
 	vfs_getnewfsid(mp);
 
 	(void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
 	    &size);
 	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
-	(void)nullfs_statfs(mp, &mp->mnt_stat, td);
-	NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
-		mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromname);
-	return (0);
-}
-
-/*
- * Free reference to null layer
- */
+	NULLFSDEBUG("lower %s, alias at %s\n",
+	            mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
+	return (0);
+}
+
 static int
 nullfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
-	void *mntdata;
-	int flags = 0;
-
-	NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
-
-	if (mntflags & MNT_FORCE)
-		flags |= FORCECLOSE;
-
-	/*
-	 * Finally, throw away the null_mount structure
-	 */
-	mntdata = mp->mnt_data;
-	mp->mnt_data = 0;
-	free(mntdata, M_NULLFSMNT);
-	return 0;
+	NULLNCDEBUG(mp->mnt_ncp);
+
+	cache_drop(MOUNTTONULLMOUNT(mp)->nullm_ncp);
+	free(mp->mnt_data, M_NULLFSMNT);
+
+	return (0);
+}
+
+static int
+nullfs_start(struct mount *mp, int flags, struct thread *td)
+{
+	mp->mnt_ncp->nc_shadowed = MOUNTTONULLMOUNT(mp)->nullm_ncp;
+
+	return (0);
 }
 
 static int
 nullfs_root(struct mount *mp, struct vnode **vpp)
 {
-	struct thread *td = curthread;	/* XXX */
-	struct vnode *vp;
-
-	NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", (void *)mp,
-	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
-
-	/*
-	 * Return locked reference to root.
-	 */
-	vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
-	vref(vp);
-
-#ifdef NULLFS_DEBUG
-	if (VOP_ISLOCKED(vp, NULL)) {
-		Debugger("root vnode is locked.\n");
-		vrele(vp);
-		return (EDEADLK);
-	}
-#endif
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-	*vpp = vp;
-	return 0;
+	int error;
+
+	error = cache_vget(MOUNTTONULLMOUNT(mp)->nullm_ncp,
+	                   crhold(proc0.p_ucred), LK_EXCLUSIVE | LK_RETRY, vpp);
+	crfree(proc0.p_ucred);
+
+	return (error);
+}
+
+static __inline
+struct mount *
+nullfs_lowermount_0(struct mount *mp)
+{
+	return (MOUNTTONULLMOUNT(mp)->nullm_ncp->nc_mount);
+}
 
 static int
 nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, caddr_t arg,
 		struct thread *td)
 {
-	return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td);
+	return VFS_QUOTACTL(nullfs_lowermount_0(mp), cmd, uid, arg, td);
 }
 
 static int
@@ -218,12 +178,12 @@ nullfs_statfs(struct mount *mp, struct s
 	int error;
 	struct statfs mstat;
 
-	NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p)\n", (void *)mp,
-	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
+	NULLFSDEBUG("mp %p, ncp %p, lower mp %p\n",
+	            mp, mp->mnt_ncp, nullfs_lowermount_0(mp));
 
 	bzero(&mstat, sizeof(mstat));
 
-	error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td);
+	error = VFS_STATFS(nullfs_lowermount_0(mp), &mstat, td);
 	if (error)
 		return (error);
 
@@ -248,23 +208,21 @@ nullfs_checkexp(struct mount *mp, struct
 nullfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
 		struct ucred **credanonp)
 {
-
-	return VFS_CHECKEXP(MOUNTTONULLMOUNT(mp)->nullm_vfs, nam, 
-		extflagsp, credanonp);
+	return VFS_CHECKEXP(nullfs_lowermount_0(mp), nam, extflagsp, credanonp);
 }
 
 static int                        
 nullfs_extattrctl(struct mount *mp, int cmd, const char *attrname, caddr_t arg,
 		  struct thread *td)
 {
-	return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, attrname,
-	    arg, td);
+	return VFS_EXTATTRCTL(nullfs_lowermount_0(mp), cmd, attrname, arg, td);
 }
 
 
 static struct vfsops null_vfsops = {
 	.vfs_mount =   	 	nullfs_mount,
 	.vfs_unmount =   	nullfs_unmount,
+	.vfs_start =            nullfs_start,
 	.vfs_root =     	nullfs_root,
 	.vfs_quotactl =   	nullfs_quotactl,
 	.vfs_statfs =    	nullfs_statfs,
diff -r e1d135a8f666 sys/vfs/nullfs/null_vnops.c
--- a/sys/vfs/nullfs/null_vnops.c	Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null_vnops.c	Wed Mar 29 10:29:08 2006 +0200
@@ -98,7 +98,7 @@
  * might be able to get on with a hybrid solution: overlay some vnodes, and rely
  * on namecache API for the rest.
  */
- 
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -109,6 +109,8 @@
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
+#include <sys/namecache.h>
+#include <sys/nlookup.h>
 #include "null.h"
 
 static int	null_nresolve(struct vop_nresolve_args *ap);
@@ -122,90 +124,190 @@ static int	null_nrmdir(struct vop_nrmdir
 static int	null_nrmdir(struct vop_nrmdir_args *ap);
 static int	null_nrename(struct vop_nrename_args *ap);
 
+static __inline
+struct mount *
+nullfs_lowermount_l(struct namecache *ncp)
+{
+	/*
+	 * The code used below allows passing through lower mounts.
+	 * If we didn't want to do that, we could use
+	 *
+	 *   MOUNTTONULLMOUNT(ncp->nc_mount)->nullm_ncp->nc_mount
+	 *
+	 * Eventually, the choice might be configurable.
+	 */
+	return (ncp->nc_shadowed->nc_mount);
+}
+
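+/*
+ * Sanity checks before bypassing a namespace VOP: fail for the mount
+ * point itself (EPERM), for entries with no lower layer association
+ * (ENOENT), and for entries whose shadow association has gone stale
+ * (EINVAL).
+ */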
+static __inline
+int
+nullfs_check(struct namecache *ncp)
+{
+	if (ncp->nc_mount->mnt_ncp == ncp)
+		return (EPERM);
+
+	if (!ncp->nc_shadowed)
+		return (ENOENT);
+
+	if (ncp->nc_shadowheight == 0)
+		return (EINVAL);
+
+	return (0);
+}
+
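+/*
+ * Resolve an upper layer entry: look up its name under the parent's
+ * lower layer entry (resolving that one if necessary), then attach the
+ * upper entry to the lower one's shadow group and mirror the lower
+ * entry's vnode and error state.
+ */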
 static int
 null_nresolve(struct vop_nresolve_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nresolve_ap(ap);
-}
-
-static int
-null_ncreate(struct vop_ncreate_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_ncreate_ap(ap);
-}
-
-static int
-null_nmkdir(struct vop_nmkdir_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nmkdir_ap(ap);
-}
-
-static int
-null_nmknod(struct vop_nmknod_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nmknod_ap(ap);
-}
-
-static int
-null_nlink(struct vop_nlink_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nlink_ap(ap);
-}
-
-static int
-null_nsymlink(struct vop_nsymlink_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nsymlink_ap(ap);
-}
-
-static int
-null_nwhiteout(struct vop_nwhiteout_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nwhiteout_ap(ap);
-}
-
-static int
-null_nremove(struct vop_nremove_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nremove_ap(ap);
-}
-
-static int
-null_nrmdir(struct vop_nrmdir_args *ap)
-{
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nrmdir_ap(ap);
-}
+	struct namecache *ncp = ap->a_ncp;
+	struct nlcomponent nlc;
+	struct namecache *sncp, *psncp;
+	int error = 0;
+
+	sncp = ncp->nc_shadow_next;
+	if (sncp) {
+		cache_hold(sncp);
+		cache_setunresolved(sncp);
+		cache_put(sncp);
+	}
+
+	cache_unlock(ncp);
+	cache_lock(ncp->nc_parent);
+	psncp = ncp->nc_parent->nc_shadowed;
+	if (psncp)
+		cache_hold(psncp);
+	cache_unlock(ncp->nc_parent);
+
+	if (! psncp) {
+		cache_lock(ncp);
+		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
+			cache_setvp(ncp, NULL);
+		return (ncp->nc_error);
+	}
+
+	nlc.nlc_nameptr = ncp->nc_name;
+	nlc.nlc_namelen = ncp->nc_nlen;
+	sncp = cache_nlookup(psncp, &nlc);
+	cache_drop(psncp);
+
+	if ((sncp->nc_flag & NCF_UNRESOLVED) == 0)
+		goto postdowncall;
+
+	ap->a_head.a_ops = sncp->nc_mount->mnt_vn_use_ops;
+	ap->a_ncp = sncp;
+	/*
+	 * According to cache_resolve(), the primary place for
+	 * VOP_NRESOLVE calls, the caller of the nresolve method
+	 * is the one who should take care of ncp->nc_error.
+	 */
+	ap->a_ncp->nc_error = vop_nresolve_ap(ap);
+
+postdowncall:
+
+	error = cache_shadow_attach(ncp, sncp);
+
+	NULLNCDEBUG(ncp);
+	NULLNCDEBUG(sncp);
+	NULLFSDEBUG("attach error %d\n", error);
+
+	if (error) {
+		cache_put(sncp);
+		if (ncp->nc_flag & NCF_UNRESOLVED) {
+			cache_setvp(ncp, NULL);
+			error = ENOENT;
+		} else if (error == EEXIST)
+			error = ncp->nc_error;
+	} else {
+		error = sncp->nc_error;
+		cache_setvp(ncp, sncp->nc_vp);
+		ncp->nc_shadowed = sncp;
+		cache_drop(sncp);
+	}
+
+	NULLFSDEBUG("error %d\n", error);
+	return (error);
+}
+
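+/*
+ * Generic bypass for namespace VOPs: substitute the shadowed (lower
+ * layer) entry and the lower mount's vop vector into the argument
+ * structure, then forward the call.
+ */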
+#define NULL_NVOP_TEMPLATE(OP)							\
+static int									\
+null_ ## OP(struct vop_ ## OP ## _args *ap)					\
+{										\
+	struct namecache *ncp = ap->a_ncp;					\
+	struct namecache *sncp = ncp->nc_shadowed;				\
+	int error;								\
+										\
+	NULLNCDEBUG(ap->a_ncp);							\
+										\
+	if ((error = nullfs_check(ncp)))					\
+		return (error);							\
+	cache_hold(sncp);							\
+										\
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);					\
+										\
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;	\
+	ap->a_ncp = ncp->nc_shadowed;						\
+										\
+	error = vop_ ## OP ## _ap(ap);						\
+	NULLNCDEBUG(ncp);							\
+	NULLNCDEBUG(sncp);							\
+	sncp->nc_shadowinfo == ncp->nc_shadowinfo ?				\
+	    cache_drop(sncp) :							\
+	    cache_put(sncp);							\
+										\
+	return (error);								\
+}
+
+NULL_NVOP_TEMPLATE(ncreate)
+NULL_NVOP_TEMPLATE(nmkdir)
+NULL_NVOP_TEMPLATE(nmknod)
+NULL_NVOP_TEMPLATE(nlink)
+NULL_NVOP_TEMPLATE(nsymlink)
+NULL_NVOP_TEMPLATE(nwhiteout)
+NULL_NVOP_TEMPLATE(nremove)
+NULL_NVOP_TEMPLATE(nrmdir)
 
 static int
 null_nrename(struct vop_nrename_args *ap)
 {
+	struct namecache *fncp = ap->a_fncp;
+	struct namecache *tncp = ap->a_tncp;
+	struct namecache *sfncp = fncp->nc_shadowed;
+	struct namecache *stncp = tncp->nc_shadowed;
 	struct mount *lmp;
-
-	lmp = MOUNTTONULLMOUNT(ap->a_fncp->nc_mount)->nullm_vfs;
-	if (lmp != MOUNTTONULLMOUNT(ap->a_tncp->nc_mount)->nullm_vfs)
-		return (EINVAL);
-
-	ap->a_head.a_ops = lmp->mnt_vn_norm_ops;
-
-	return vop_nrename_ap(ap);
+	int error;
+
+	NULLNCDEBUG(ap->a_fncp);
+	NULLNCDEBUG(ap->a_tncp);
+
+	if ((error = nullfs_check(fncp)))
+		return (error);
+	if ((error = nullfs_check(tncp)))
+		return (error);
+
+	lmp = nullfs_lowermount_l(fncp);
+	if (lmp != nullfs_lowermount_l(tncp))
+		return (EXDEV);
+
+	cache_hold(sfncp);
+	cache_hold(stncp);
+
+	NULLNCDEBUG(ap->a_fncp->nc_shadowed);
+	NULLNCDEBUG(ap->a_tncp->nc_shadowed);
+
+	ap->a_head.a_ops = lmp->mnt_vn_use_ops;
+	ap->a_fncp = fncp->nc_shadowed;
+	ap->a_tncp = tncp->nc_shadowed;
+
+	error = vop_nrename_ap(ap);
+
+	sfncp->nc_shadowinfo == fncp->nc_shadowinfo ?
+	    cache_drop(sfncp) :
+	    cache_put(sfncp);
+	stncp->nc_shadowinfo == tncp->nc_shadowinfo ?
+	    cache_drop(stncp) :
+	    cache_put(stncp);
+
+	return (error);
 }
 
 /*
@@ -224,4 +326,3 @@ struct vnodeopv_entry_desc null_vnodeop_
 	{ &vop_nrename_desc,		(vnodeopv_entry_t) null_nrename },
 	{ NULL, NULL }
 };
-