namecache coherency, 2nd turn

Csaba Henk csaba.henk at creo.hu
Tue Feb 7 12:44:03 PST 2006


Sorry for the arbitrary indexing; there would be valid reasons for
calling it the 3rd or 4th (or something else) as well. It is the second
such changeset from me, at least.

See the explanation on the kernel list.

Csaba
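
As a quick orientation before the patch itself, here is a minimal
standalone sketch (not part of the patch, names simplified) of the
locking indirection it introduces: every namecache entry delegates
locking to the head of its shadow group via a back pointer, so locking
any member effectively locks the whole group.

    struct ncent {
            struct ncent *lockby;   /* group head; points to self if head */
            int           exlocks;  /* recursive exclusive lock count */
    };

    static void
    group_lock(struct ncent *n)
    {
            /* forward to the group head; cf. nc_lockby in the patch */
            struct ncent *head = n->lockby;

            head->exlocks++;
    }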

diff -r b30705ca4860 sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c	Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/kern/vfs_cache.c	Tue Feb  7 19:55:06 2006 +0100
@@ -108,6 +108,16 @@
 #define NCHHASH(hash)	(&nchashtbl[(hash) & nchash])
 #define MINNEG		1024
 
+/* Direction markers for shadow tree traversal. */
+#define STREE_AWAY	0x1
+#define STREE_BACK	0x2
+
+/* Shadow tree updater function with metadata (direction). */
+struct stree_updater {
+	int (*updater)(struct namecache *ncp, void *param);
+	int flags;
+};
+
 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
@@ -225,7 +235,7 @@ _cache_drop(struct namecache *ncp)
 	    (ncp->nc_flag & NCF_UNRESOLVED) && 
 	    TAILQ_EMPTY(&ncp->nc_list)
 	) {
-		KKASSERT(ncp->nc_exlocks == 0);
+		KKASSERT(ncp->nc_lockby->nc_exlocks == 0);
 		cache_lock(ncp);
 		cache_zap(ncp);
 	} else {
@@ -295,7 +305,9 @@ cache_alloc(int nlen)
 	ncp->nc_error = ENOTCONN;	/* needs to be resolved */
 	ncp->nc_refs = 1;
 	ncp->nc_fsmid = 1;
+	ncp->nc_lockby = ncp;
 	TAILQ_INIT(&ncp->nc_list);
+	LIST_INIT(&ncp->nc_shadow_list);
 	cache_lock(ncp);
 	return(ncp);
 }
@@ -323,6 +335,277 @@ cache_drop(struct namecache *ncp)
 {
 	_cache_drop(ncp);
 }
+
+/*
+ * Iterative tree traversal routine with more features than we actually need:
+ * supports making callbacks either in "away" or "back" direction, pruning
+ * the traversal at certain nodes (i.e., skipping the respective subtree), and
+ * safe leaf cutting.
+ *
+ * For now it makes no real difference whether callbacks are made in the away
+ * or the back direction, and we never prune or cut off leaves...
+ *
+ * In fact, all we need is to be able to traverse a subtree identified by its
+ * root.
+ */
+static void
+cache_group_walk(struct namecache *ncp, struct stree_updater *sup, 
+                 void *param)
+{
+	struct namecache *pncp = ncp; /* previous ncp during traversal */
+	struct namecache *qncp;       /* current ncp during traversal */
+	struct namecache *xncp;       /* auxiliary ncp */
+	int prune;
+#ifdef INVARIANTS
+	int height = 0, turns = 0, pruned = 0;
+
+#define STAWAY_ASSERT(ncp) do {				\
+	KKASSERT(((ncp)->nc_flag & NCF_TREEPASS) == 0);	\
+	(ncp)->nc_flag |= NCF_TREEPASS;			\
+} while (0)
+#define STBACK_ASSERT(ncp) do {				\
+	KKASSERT((ncp)->nc_flag & NCF_TREEPASS);	\
+	(ncp)->nc_flag &= ~NCF_TREEPASS;		\
+} while (0)
+#else
+#define STAWAY_ASSERT(ncp)
+#define STBACK_ASSERT(ncp)
+#endif
+
+	STAWAY_ASSERT(ncp);
+	prune = sup->flags & STREE_AWAY ? sup->updater(ncp, param) : 0;
+	if (prune || ! (qncp = LIST_FIRST(&ncp->nc_shadow_list)))
+		goto out;
+
+	for (;;) {
+#ifdef INVARIANTS
+		turns++;
+#endif
+		if (qncp->nc_shadowed == pncp) {
+			/* heading toward leaves */
+			KKASSERT(height++ >= 0);
+			STAWAY_ASSERT(qncp);
+			prune = sup->flags & STREE_AWAY ?
+			        sup->updater(qncp, param) :
+			        0;
+#ifdef INVARIANTS
+			if (prune)
+				pruned = 1;
+#endif
+			if (prune || LIST_EMPTY(&qncp->nc_shadow_list)) {
+				/*
+				 * we hit a leaf or were asked to prune the
+				 * branch, turn back
+				 */
+				xncp = pncp;
+				pncp = qncp;
+				qncp = xncp;
+			} else {
+				/* keep going on */
+				pncp = qncp;
+				qncp = LIST_FIRST(&qncp->nc_shadow_list);
+			}
+		} else {
+			/* heading back toward root */
+			KKASSERT(--height >= 0);
+			STBACK_ASSERT(pncp);
+			if (LIST_NEXT(pncp, nc_shadow_entry))
+				/* turn to leaf direction using neighbour */
+				qncp = LIST_NEXT(pncp, nc_shadow_entry);
+			else
+				/*
+				 * No neighbour,
+				 * try to keep going in the root direction.
+				 */
+				qncp = qncp == ncp ? NULL : qncp->nc_shadowed;
+			xncp = pncp->nc_shadowed;
+			/*
+			 * We have to take care to find the next pncp
+			 * candidate before calling the updater, because the
+			 * updater might remove it from the tree, rendering
+			 * pncp unusable wrt. traversal.
+			 */
+			if (sup->flags & STREE_BACK)
+				sup->updater(pncp, param);
+#ifdef INVARIANTS
+			if (! pncp->nc_shadowed)
+				turns -= 2;
+#endif
+			if (! qncp)
+				break;
+			pncp = xncp;
+		}
+	}
+
+out:
+	STBACK_ASSERT(ncp);
+	KASSERT(height == 0, ("shadow tree left with height %d\n", height));
+	KASSERT(turns == 2 * ncp->nc_treesize || (pruned && turns < 2 * ncp->nc_treesize),
+	        ("inconsistent traversal turn count: turns %d, ncp->nc_treesize %d, "
+		 "pruned %d", turns, ncp->nc_treesize, pruned));
+
+	if (sup->flags & STREE_BACK)
+		sup->updater(ncp, param);
+}
+
+static int
+migrate_updater(struct namecache *ncp, void *param)
+{
+	struct namecache *lncp = ncp->nc_lockby;
+
+	ncp->nc_lockby = param;
+
+	if (lncp->nc_flag & NCF_LOCKREQ) {
+		lncp->nc_flag &= ~NCF_LOCKREQ;
+		wakeup(lncp);
+	}
+
+	return(0);
+}
+
+static struct stree_updater stree_migrate_updater = {
+	.updater = migrate_updater,
+	.flags = STREE_BACK
+};
+
+/*
+ * Join ncp into the shadow group of sncp.
+ * 
+ * ncp must be unlocked on entry, while sncp must be locked on entry. Caller
+ * also has to hold a dedicated reference of sncp.
+ *
+ * The routine will fail and return ELOOP if the intended shadowing association
+ * would yield a loop in the shadow chain. It will fail with EEXIST if ncp gets
+ * resolved or acquires a shadow association from elsewhere during the attach
+ * attempt (which is [at least theoretically] possible due to the fact that ncp
+ * is unlocked).
+ *
+ * - On success ncp will be a representative of the joint shadow group, which
+ *   then will be locked (both via ncp and sncp).
+ * - On failure the namecache entries will exist separately just as they did
+ *   before; both entries will be locked.
+ */
+int
+cache_shadow_attach(struct namecache *ncp, struct namecache *sncp)
+{
+#ifdef INVARIANTS
+	struct namecache *xncp;
+#endif
+
+	if (ncp == sncp)
+		return(ELOOP);
+
+	KKASSERT(ncp->nc_lockby->nc_locktd != curthread);
+	KKASSERT(sncp->nc_lockby->nc_locktd == curthread);
+
+	cache_lock_two(ncp, sncp);
+
+	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 || ncp->nc_shadowed)
+		return(EEXIST);
+
+	KKASSERT(ncp->nc_lockby == ncp);
+	KKASSERT(sncp->nc_lockby == sncp->nc_lockby->nc_lockby);
+
+	if (sncp->nc_lockby == ncp)
+		return(ELOOP);
+
+	ncp->nc_shadowed = sncp;
+	LIST_INSERT_HEAD(&sncp->nc_shadow_list, ncp, nc_shadow_entry);
+	cache_group_walk(ncp, &stree_migrate_updater, sncp->nc_lockby);
+
+#ifdef INVARIANTS
+	xncp = sncp;
+	for (;;) {
+		KKASSERT(xncp != ncp);
+		KKASSERT(xncp->nc_treesize++ >= 0);
+		xncp->nc_treesize += ncp->nc_treesize;
+		if (! xncp->nc_shadowed)
+			break;
+		xncp = xncp->nc_shadowed;
+	}
+	KKASSERT(xncp == ncp->nc_lockby);
+#endif
+
+	return(0);
+}
+
+static __inline
+void
+delete_shadow(struct namecache *ncp)
+{
+#ifdef INVARIANTS
+	struct namecache *xncp = ncp->nc_shadowed;
+
+	KKASSERT(ncp->nc_shadowed);
+	for (;;) {
+		KKASSERT(xncp != ncp);
+		xncp->nc_treesize -= ncp->nc_treesize;
+		KKASSERT(--xncp->nc_treesize >= 0);
+		if (! xncp->nc_shadowed)
+			break;
+		xncp = xncp->nc_shadowed;
+	}
+#endif
+
+	LIST_REMOVE(ncp, nc_shadow_entry);
+	ncp->nc_shadowed = NULL;
+}
+
+/*
+ * Take out namecache entry from its shadow group.
+ *
+ * ncp must really shadow someone, and the shadow group must be locked
+ * upon entry.
+ *
+ * After the routine returns, ncp will be the head of a new (possibly singleton)
+ * shadow group. The routine returns the former successor of ncp in the original
+ * shadow group in a locked+ref'd state.
+ */
+struct namecache *
+cache_shadow_detach(struct namecache *ncp)
+{
+	struct namecache *sncp = ncp->nc_shadowed;
+
+	KKASSERT(ncp->nc_shadowed);
+	KKASSERT(sncp->nc_lockby == ncp->nc_lockby);
+	KKASSERT(ncp->nc_lockby == ncp->nc_lockby->nc_lockby);
+
+	delete_shadow(ncp);
+	ncp->nc_locktd = curthread;
+	cache_group_walk(ncp, &stree_migrate_updater, ncp);
+
+	return(sncp);
+}
+
+static int
+vhold_updater(struct namecache *ncp, void *param)
+{
+	
+	if (ncp->nc_vp)
+		vhold(ncp->nc_vp);
+
+	return(0);
+}
+
+static struct stree_updater stree_vhold_updater = {
+	.updater = vhold_updater,
+	.flags = STREE_AWAY
+};
+
+static int
+vdrop_updater(struct namecache *ncp, void *param)
+{
+	
+	if (ncp->nc_vp)
+		vdrop(ncp->nc_vp);
+
+	return(0);
+}
+
+static struct stree_updater stree_vdrop_updater = {
+	.updater = vdrop_updater,
+	.flags = STREE_AWAY
+};
 
 /*
  * Namespace locking.  The caller must already hold a reference to the
@@ -349,15 +632,19 @@ cache_lock(struct namecache *ncp)
 {
 	thread_t td;
 	int didwarn;
+	struct namecache *lncp;
 
 	KKASSERT(ncp->nc_refs != 0);
 	didwarn = 0;
 	td = curthread;
 
 	for (;;) {
-		if (ncp->nc_exlocks == 0) {
-			ncp->nc_exlocks = 1;
-			ncp->nc_locktd = td;
+		lncp = ncp->nc_lockby;
+		KKASSERT(lncp);
+		KKASSERT(lncp->nc_refs != 0);
+		if (lncp->nc_exlocks == 0) {
+			lncp->nc_exlocks = 1;
+			lncp->nc_locktd = td;
 			/* 
 			 * The vp associated with a locked ncp must be held
 			 * to prevent it from being recycled (which would
@@ -365,16 +652,15 @@ cache_lock(struct namecache *ncp)
 			 *
 			 * XXX loop on race for later MPSAFE work.
 			 */
-			if (ncp->nc_vp)
-				vhold(ncp->nc_vp);
+			cache_group_walk(lncp, &stree_vhold_updater, NULL);
 			break;
 		}
-		if (ncp->nc_locktd == td) {
-			++ncp->nc_exlocks;
+		if (lncp->nc_locktd == td) {
+			++lncp->nc_exlocks;
 			break;
 		}
-		ncp->nc_flag |= NCF_LOCKREQ;
-		if (tsleep(ncp, 0, "clock", nclockwarn) == EWOULDBLOCK) {
+		lncp->nc_flag |= NCF_LOCKREQ;
+		if (tsleep(lncp, 0, "clock", nclockwarn) == EWOULDBLOCK) {
 			if (didwarn)
 				continue;
 			didwarn = 1;
@@ -398,12 +684,15 @@ cache_lock_nonblock(struct namecache *nc
 cache_lock_nonblock(struct namecache *ncp)
 {
 	thread_t td;
+	struct namecache *lncp = ncp->nc_lockby;
 
 	KKASSERT(ncp->nc_refs != 0);
+	KKASSERT(lncp);
+	KKASSERT(lncp->nc_refs != 0);
 	td = curthread;
-	if (ncp->nc_exlocks == 0) {
-		ncp->nc_exlocks = 1;
-		ncp->nc_locktd = td;
+	if (lncp->nc_exlocks == 0) {
+		lncp->nc_exlocks = 1;
+		lncp->nc_locktd = td;
 		/* 
 		 * The vp associated with a locked ncp must be held
 		 * to prevent it from being recycled (which would
@@ -411,8 +700,7 @@ cache_lock_nonblock(struct namecache *nc
 		 *
 		 * XXX loop on race for later MPSAFE work.
 		 */
-		if (ncp->nc_vp)
-			vhold(ncp->nc_vp);
+		cache_group_walk(lncp, &stree_vhold_updater, NULL);
 		return(0);
 	} else {
 		return(EWOULDBLOCK);
@@ -423,17 +711,43 @@ cache_unlock(struct namecache *ncp)
 cache_unlock(struct namecache *ncp)
 {
 	thread_t td = curthread;
+	struct namecache *lncp = ncp->nc_lockby;
 
 	KKASSERT(ncp->nc_refs > 0);
-	KKASSERT(ncp->nc_exlocks > 0);
-	KKASSERT(ncp->nc_locktd == td);
-	if (--ncp->nc_exlocks == 0) {
-		if (ncp->nc_vp)
-			vdrop(ncp->nc_vp);
-		ncp->nc_locktd = NULL;
-		if (ncp->nc_flag & NCF_LOCKREQ) {
-			ncp->nc_flag &= ~NCF_LOCKREQ;
-			wakeup(ncp);
+	KKASSERT(lncp);
+	KKASSERT(lncp->nc_refs > 0);
+	KKASSERT(lncp->nc_exlocks > 0);
+	KKASSERT(lncp->nc_locktd == td);
+	if (lncp->nc_exlocks == 1)
+		cache_group_walk(lncp, &stree_vdrop_updater, NULL);
+	if (--lncp->nc_exlocks == 0) {
+		lncp->nc_locktd = NULL;
+		if (lncp->nc_flag & NCF_LOCKREQ) {
+			lncp->nc_flag &= ~NCF_LOCKREQ;
+			wakeup(lncp);
+		}
+	}
+}
+
+/*
+ * Obtain lock on both of uncp and lncp.
+ *
+ * On entry, uncp is assumed to be unlocked, and lncp is assumed to be
+ * locked.
+ *
+ * After this function returns, caller is responsible for checking
+ * the state of lncp which might have got unlocked temporarily.
+ */
+void
+cache_lock_two(struct namecache *uncp, struct namecache *lncp)
+{
+	if (cache_lock_nonblock(uncp) != 0) {
+		if (uncp > lncp)
+			cache_lock(uncp);
+		else {
+			cache_unlock(lncp);
+			cache_lock(uncp);
+			cache_lock(lncp);
 		}
 	}
 }
@@ -453,7 +767,8 @@ cache_get_nonblock(struct namecache *ncp
 cache_get_nonblock(struct namecache *ncp)
 {
 	/* XXX MP */
-	if (ncp->nc_exlocks == 0 || ncp->nc_locktd == curthread) {
+	if (ncp->nc_lockby->nc_exlocks == 0 ||
+	    ncp->nc_lockby->nc_locktd == curthread) {
 		_cache_hold(ncp);
 		cache_lock(ncp);
 		return(0);
@@ -521,6 +836,13 @@ cache_settimeout(struct namecache *ncp, 
 		ncp->nc_timeout = 1;
 }
 
+static int unresolver_updater(struct namecache *ncp, void *param); 
+
+static struct stree_updater stree_unresolver_updater = {
+	.updater = unresolver_updater,
+	.flags = STREE_BACK
+};
+
 /*
  * Disassociate the vnode or negative-cache association and mark a
  * namecache entry as unresolved again.  Note that the ncp is still
@@ -541,10 +863,17 @@ void
 void
 cache_setunresolved(struct namecache *ncp)
 {
+	cache_group_walk(ncp, &stree_unresolver_updater, NULL);
+}
+
+static int
+unresolver_updater(struct namecache *ncp, void *param) 
+{
 	struct vnode *vp;
 
-	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
+	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && ncp != param) {
 		ncp->nc_flag |= NCF_UNRESOLVED;
+		/* XXX Why don't we reset NCF_DESTROYED? */
 		ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK|
 				  NCF_FSMID);
 		ncp->nc_timeout = 0;
@@ -563,13 +892,15 @@ cache_setunresolved(struct namecache *nc
 			 */
 			if (!TAILQ_EMPTY(&ncp->nc_list))
 				vdrop(vp);
-			if (ncp->nc_exlocks)
+			if (ncp->nc_lockby->nc_exlocks)
 				vdrop(vp);
 		} else {
 			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
 			--numneg;
 		}
 	}
+
+	return(0);
 }
 
 /*
@@ -619,7 +950,7 @@ cache_inval(struct namecache *ncp, int f
 	struct namecache *nextkid;
 	int rcnt = 0;
 
-	KKASSERT(ncp->nc_exlocks);
+	KKASSERT(ncp->nc_lockby->nc_exlocks);
 
 	cache_setunresolved(ncp);
 	if (flags & CINV_DESTROY)
@@ -715,6 +1046,7 @@ restart:
  * XXX the disconnection could pose a problem, check code paths to make
  * sure any code that blocks can handle the parent being changed out from
  * under it.  Maybe we should lock the children (watch out for deadlocks) ?
+ * [UPDATE: attempt made to lock children, see in situ explanation]
  *
  * After we return the caller has the option of calling cache_setvp() if
  * the vnode of the new target ncp is known.
@@ -726,26 +1058,62 @@ cache_rename(struct namecache *fncp, str
 cache_rename(struct namecache *fncp, struct namecache *tncp)
 {
 	struct namecache *scan;
-	int didwarn = 0;
-
+	int didwarn[] = { 0, 0 };
+
+	/* XXX should we rather assert non-equality here? */
+	if (fncp == tncp)
+		return;
+
+again:
 	cache_setunresolved(fncp);
 	cache_setunresolved(tncp);
+
+	/*
+	 * It seems we need to unlock fncp before calling cache_inval():
+	 * cache_inval() does a lot of lock/unlock/relock-ing (with tncp
+	 * and its children), therefore keeping fncp locked might be
+	 * deadlocky...
+	 */
+	cache_unlock(fncp);
+	
 	while (cache_inval(tncp, CINV_CHILDREN) != 0) {
-		if (didwarn++ % 10 == 0) {
-			printf("Warning: cache_rename: race during "
+		if (didwarn[0]++ % 10 == 0) {
+			printf("Warning: cache_rename: race #1 during "
 				"rename %s->%s\n",
 				fncp->nc_name, tncp->nc_name);
 		}
 		tsleep(tncp, 0, "mvrace", hz / 10);
 		cache_setunresolved(tncp);
 	}
+
+	cache_unlock(tncp);
+	cache_lock(fncp);
+
 	while ((scan = TAILQ_FIRST(&fncp->nc_list)) != NULL) {
-		cache_hold(scan);
+		cache_unlock(fncp);
+		/*
+		 * We have to lock fncp's kids in order to unresolve
+		 * their shadow kids...
+		 */
+		cache_get(scan);
 		cache_unlink_parent(scan);
+		cache_group_walk(scan, &stree_unresolver_updater, scan);
 		cache_link_parent(scan, tncp);
 		if (scan->nc_flag & NCF_HASHED)
 			cache_rehash(scan);
-		cache_drop(scan);
+		cache_put(scan);
+		cache_lock(fncp);
+	}
+
+	cache_lock_two(tncp, fncp);
+
+	if ((fncp->nc_flag & tncp->nc_flag & NCF_UNRESOLVED) == 0) {
+		if (didwarn[1]++ % 10 == 0) {
+			printf("Warning: cache_rename: race #2 during "
+				"rename %s->%s\n",
+				fncp->nc_name, tncp->nc_name);
+		}
+		goto again;
 	}
 }
 
@@ -1207,6 +1575,9 @@ cache_zap(struct namecache *ncp)
 				vdrop(par->nc_vp);
 		}
 
+		if (ncp->nc_shadowed)
+			cache_put(cache_shadow_detach(ncp));
+
 		/*
 		 * ncp should not have picked up any refs.  Physically
 		 * destroy the ncp.
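
The traversal in cache_group_walk() above may be easier to digest in
isolation. Here is a compact standalone sketch of the same iterative
idea -- preorder traversal over parent/first-child/sibling pointers,
without recursion -- with simplified names (not code from the patch);
the kernel version additionally supports back-direction callbacks,
pruning, and safe leaf cutting.

    struct node {
            struct node *parent;        /* cf. nc_shadowed */
            struct node *first_child;   /* cf. nc_shadow_list */
            struct node *next_sibling;  /* cf. nc_shadow_entry */
    };

    static void
    walk(struct node *root, void (*visit)(struct node *))
    {
            struct node *n = root;

            visit(n);
            if (n->first_child == NULL)
                    return;
            n = n->first_child;
            for (;;) {
                    visit(n);
                    if (n->first_child) {
                            /* head away from the root */
                            n = n->first_child;
                    } else {
                            /* head back until a sibling offers a new branch */
                            while (n != root && n->next_sibling == NULL)
                                    n = n->parent;
                            if (n == root)
                                    break;
                            n = n->next_sibling;
                    }
            }
    }
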
diff -r b30705ca4860 sys/sys/namecache.h
--- a/sys/sys/namecache.h	Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/sys/namecache.h	Tue Feb  7 19:55:06 2006 +0100
@@ -71,6 +71,7 @@ struct vnode;
 struct vnode;
 
 TAILQ_HEAD(namecache_list, namecache);
+LIST_HEAD(namecache_shadow_list, namecache);
 
 /*
  * The namecache structure is used to manage the filesystem namespace.  Most
@@ -100,8 +101,12 @@ struct namecache {
     LIST_ENTRY(namecache) nc_hash;	/* hash chain (nc_parent,name) */
     TAILQ_ENTRY(namecache) nc_entry;	/* scan via nc_parent->nc_list */
     TAILQ_ENTRY(namecache) nc_vnode;	/* scan via vnode->v_namecache */
+    LIST_ENTRY(namecache) nc_shadow_entry; /* scan via nc_shadowed->nc_shadow_list */
     struct namecache_list  nc_list;	/* list of children */
+    struct namecache_shadow_list nc_shadow_list; /* list of shadow overlays */
     struct namecache *nc_parent;	/* namecache entry for parent */
+    struct namecache *nc_shadowed;	/* lower layer entry in layered fs */
+    struct namecache *nc_lockby;	/* entry via which this one can be locked */
     struct	vnode *nc_vp;		/* vnode representing name or NULL */
     int		nc_refs;		/* ref count prevents deletion */
     u_short	nc_flag;
@@ -114,6 +119,9 @@ struct namecache {
     struct thread *nc_locktd;		/* namespace locking */
     struct mount *nc_mount;		/* associated mount for vopops */
     int64_t	nc_fsmid;		/* filesystem modified id */
+#ifdef INVARIANTS
+    int nc_treesize;			/* shadow tree size under ncp */
+#endif
 };
 
 typedef struct namecache *namecache_t;
@@ -133,6 +141,9 @@ typedef struct namecache *namecache_t;
 #define NCF_ISDIR	0x0200	/* represents a directory */
 #define NCF_DESTROYED	0x0400	/* name association is considered destroyed */
 #define NCF_FSMID	0x0800	/* FSMID updated */
+#ifdef INVARIANTS
+#define NCF_TREEPASS	0x2000
+#endif
 
 /*
  * cache_inval[_vp]() flags
@@ -150,6 +161,9 @@ void	cache_lock(struct namecache *ncp);
 void	cache_lock(struct namecache *ncp);
 int	cache_lock_nonblock(struct namecache *ncp);
 void	cache_unlock(struct namecache *ncp);
+void	cache_lock_two(struct namecache *uncp, struct namecache *lncp);
+int	cache_shadow_attach(struct namecache *ncp, struct namecache *sncp);
+struct namecache *cache_shadow_detach(struct namecache *ncp);
 void	cache_setvp(struct namecache *ncp, struct vnode *vp);
 void	cache_settimeout(struct namecache *ncp, int nticks);
 void	cache_setunresolved(struct namecache *ncp);
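
To show how the two new entry points are meant to be driven, here is a
usage sketch under the locking contract documented in vfs_cache.c above
(hypothetical caller, not code from the patch; error handling trimmed):

    /*
     * ncp: upper layer entry, unlocked on entry.
     * sncp: lower layer entry, locked + ref'd on entry.
     */
    static int
    overlay_connect(struct namecache *ncp, struct namecache *sncp)
    {
            int error;

            error = cache_shadow_attach(ncp, sncp);
            if (error) {
                    /* per the contract, both entries come back locked */
                    cache_put(sncp);
                    return (error);
            }
            /* ncp now locks, unlocks and unresolves with its group */
            return (0);
    }
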
diff -r b30705ca4860 sys/vfs/nullfs/null.h
--- a/sys/vfs/nullfs/null.h	Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/vfs/nullfs/null.h	Tue Feb  7 19:55:06 2006 +0100
@@ -43,18 +43,21 @@ struct null_args {
 	char		*target;	/* Target of loopback  */
 };
 
-struct null_mount {
-	struct mount	*nullm_vfs;
-	struct vnode	*nullm_rootvp;	/* Reference to root null_node */
-};
-
 #ifdef _KERNEL
-#define	MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
 
 #ifdef NULLFS_DEBUG
-#define NULLFSDEBUG(format, args...) printf(format ,## args)
+#define NULLFSDEBUG(format, args...) \
+	printf(" [nullfs] %s:%d: " format, __func__, __LINE__, ## args)
+#define	NULLNCDEBUG(ncp)							\
+        NULLFSDEBUG(#ncp " %p: name %s, refs %d, exlocks %d, nc_flag 0x%x, "	\
+                    "nc_mount %p, nc_shadowed %p, nc_lockby %p, nc_vp %p\n",	\
+                    (ncp), (ncp)->nc_name, (ncp)->nc_refs,			\
+	            (ncp)->nc_lockby->nc_exlocks, (ncp)->nc_flag,		\
+                    (ncp)->nc_mount, (ncp)->nc_shadowed,			\
+                    (ncp)->nc_lockby, (ncp)->nc_vp)
 #else
 #define NULLFSDEBUG(format, args...)
+#define NULLNCDEBUG(ncp)
 #endif /* NULLFS_DEBUG */
 
 #endif /* _KERNEL */
diff -r b30705ca4860 sys/vfs/nullfs/null_vfsops.c
--- a/sys/vfs/nullfs/null_vfsops.c	Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/vfs/nullfs/null_vfsops.c	Tue Feb  7 19:55:06 2006 +0100
@@ -53,6 +53,7 @@
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/nlookup.h>
+#include <sys/namecache.h>
 #include "null.h"
 
 extern struct vnodeopv_entry_desc null_vnodeop_entries[];
@@ -80,12 +81,10 @@ nullfs_mount(struct mount *mp, char *pat
 {
 	int error = 0;
 	struct null_args args;
-	struct vnode *rootvp;
-	struct null_mount *xmp;
 	u_int size;
 	struct nlookupdata nd;
 
-	NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
+	NULLFSDEBUG("mp %p\n", (void *)mp);
 
 	/*
 	 * Update is a no-op
@@ -98,118 +97,113 @@ nullfs_mount(struct mount *mp, char *pat
 	 * Get argument
 	 */
 	error = copyin(data, (caddr_t)&args, sizeof(struct null_args));
-	if (error)
-		return (error);
-
-	/*
-	 * Find lower node
-	 */
-	rootvp = NULL;
-	error = nlookup_init(&nd, args.target, UIO_USERSPACE, NLC_FOLLOW);
+
+	/*
+	 * Do a lookup just to see if things are not fundamentally broken...
+	 * but it's too early to make proper use of the result.
+	 */
+	if (error == 0)
+		error = nlookup_init(&nd, args.target, UIO_USERSPACE,
+				     NLC_FOLLOW);
 	if (error == 0)
 		error = nlookup(&nd);
-	if (error == 0) {
-		error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, 
-					&rootvp);
-	}
-
-	xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
-				M_NULLFSMNT, M_WAITOK);	/* XXX */
-
-	/*
-	 * Save reference to underlying FS
-	 */
-        /*
-         * As lite stacking enters the scene, the old way of doing this
-	 * -- via the vnode -- is not good enough anymore...
-	 */
-	xmp->nullm_vfs = nd.nl_ncp->nc_mount;
+	if (error)
+		return(error);
+
 	nlookup_done(&nd);
 
-	vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, 
-			 null_vnodeop_entries, 0);
-
-	VOP_UNLOCK(rootvp, 0, td);
-
-	/*
-	 * Keep a held reference to the root vnode.
-	 * It is vrele'd in nullfs_unmount.
-	 */
-	xmp->nullm_rootvp = rootvp;
-	/*
-	 * XXX What's the proper safety condition for querying
-	 * the underlying mount? Is this flag tuning necessary
-	 * at all?
-	 */
-	if (xmp->nullm_vfs->mnt_flag & MNT_LOCAL)
-		mp->mnt_flag |= MNT_LOCAL;
-	mp->mnt_data = (qaddr_t) xmp;
+	vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, null_vnodeop_entries, 0);
+
+	/*
+	 * Heck it, let it just be local. I bet I'd need only five minutes to
+	 * come up with a sound-sounding meaning of "local" by which null
+	 * mounts are always local.
+	 */
+	mp->mnt_flag |= MNT_LOCAL;
 	vfs_getnewfsid(mp);
 
 	(void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
 	    &size);
 	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
-	(void)nullfs_statfs(mp, &mp->mnt_stat, td);
-	NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
-		mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromname);
+	NULLFSDEBUG("lower %s, alias at %s\n",
+	            mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
 	return (0);
 }
 
-/*
- * Free reference to null layer
- */
 static int
 nullfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
-	void *mntdata;
-	int flags = 0;
-
-	NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
-
-	if (mntflags & MNT_FORCE)
-		flags |= FORCECLOSE;
-
-	/*
-	 * Finally, throw away the null_mount structure
-	 */
-	mntdata = mp->mnt_data;
-	mp->mnt_data = 0;
-	free(mntdata, M_NULLFSMNT);
+	NULLNCDEBUG(mp->mnt_ncp);
+
+	cache_lock(mp->mnt_ncp);
+	cache_put(cache_shadow_detach(mp->mnt_ncp));
+	cache_unlock(mp->mnt_ncp);
+
 	return 0;
 }
 
 static int
+nullfs_start(struct mount *mp, int flags, struct thread *td)
+{
+	int error;
+	struct namecache *sncp;
+
+	NULLFSDEBUG("nlookup %s\n", mp->mnt_stat.f_mntfromname);
+
+	sncp = nlookup_simple(mp->mnt_stat.f_mntfromname,
+		              UIO_SYSSPACE, NLC_FOLLOW, &error);
+
+	if (! sncp)
+		return (error);
+
+	if ((error = cache_shadow_attach(mp->mnt_ncp, sncp)))
+		cache_put(sncp);
+
+	NULLNCDEBUG(mp->mnt_ncp);
+#ifdef NULLFS_DEBUG
+	if (mp->mnt_ncp->nc_shadowed)
+		NULLNCDEBUG(mp->mnt_ncp->nc_shadowed);
+#endif
+
+	cache_unlock(mp->mnt_ncp);
+	return (error);
+}	
+
+/*
+ * As the mount won't get aborted if VFS_START fails, we have to check in each 
+ * VFS call whether it has succeeded...
+ */ 
+
+static int
 nullfs_root(struct mount *mp, struct vnode **vpp)
 {
-	struct thread *td = curthread;	/* XXX */
-	struct vnode *vp;
-
-	NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", (void *)mp,
-	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
-
-	/*
-	 * Return locked reference to root.
-	 */
-	vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
-	vref(vp);
-
-#ifdef NULLFS_DEBUG
-	if (VOP_ISLOCKED(vp, NULL)) {
-		Debugger("root vnode is locked.\n");
-		vrele(vp);
-		return (EDEADLK);
-	}
-#endif
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-	*vpp = vp;
-	return 0;
+	int error;
+
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	error = cache_vget(mp->mnt_ncp->nc_shadowed, crhold(proc0.p_ucred),
+	                   LK_EXCLUSIVE | LK_RETRY, vpp);
+	crfree(proc0.p_ucred);
+
+	return (error);
+}
+
+static __inline
+struct mount *
+nullfs_lowermount_0(struct mount *mp)
+{
+	return (mp->mnt_ncp->nc_shadowed->nc_mount);
 }
 
 static int
 nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, caddr_t arg,
 		struct thread *td)
 {
-	return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	return VFS_QUOTACTL(nullfs_lowermount_0(mp), cmd, uid, arg, td);
 }
 
 static int
@@ -218,12 +212,15 @@ nullfs_statfs(struct mount *mp, struct s
 	int error;
 	struct statfs mstat;
 
-	NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p)\n", (void *)mp,
-	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	NULLFSDEBUG("mp %p, ncp %p, lower mp %p\n",
+	            mp, mp->mnt_ncp, nullfs_lowermount_0(mp));
 
 	bzero(&mstat, sizeof(mstat));
 
-	error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td);
+	error = VFS_STATFS(nullfs_lowermount_0(mp), &mstat, td);
 	if (error)
 		return (error);
 
@@ -248,23 +245,27 @@ nullfs_checkexp(struct mount *mp, struct
 nullfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
 		struct ucred **credanonp)
 {
-
-	return VFS_CHECKEXP(MOUNTTONULLMOUNT(mp)->nullm_vfs, nam, 
-		extflagsp, credanonp);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	return VFS_CHECKEXP(nullfs_lowermount_0(mp), nam, extflagsp, credanonp);
 }
 
 static int                        
 nullfs_extattrctl(struct mount *mp, int cmd, const char *attrname, caddr_t arg,
 		  struct thread *td)
 {
-	return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, attrname,
-	    arg, td);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	return VFS_EXTATTRCTL(nullfs_lowermount_0(mp), cmd, attrname, arg, td);
 }
 
 
 static struct vfsops null_vfsops = {
 	.vfs_mount =   	 	nullfs_mount,
 	.vfs_unmount =   	nullfs_unmount,
+	.vfs_start =     	nullfs_start,
 	.vfs_root =     	nullfs_root,
 	.vfs_quotactl =   	nullfs_quotactl,
 	.vfs_statfs =    	nullfs_statfs,
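
Since a failed VFS_START does not abort the mount, every op above repeats
the same readiness guard. Distilled into a sketch (the patch spells the
check out inline in each op instead of using such a helper):

    static int
    nullfs_ready(struct mount *mp)
    {
            /* mnt_ncp only gets shadow-attached once nullfs_start() succeeds */
            if (mp->mnt_ncp == NULL || mp->mnt_ncp->nc_shadowed == NULL)
                    return (ENXIO);
            return (0);
    }
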
diff -r b30705ca4860 sys/vfs/nullfs/null_vnops.c
--- a/sys/vfs/nullfs/null_vnops.c	Thu Feb  2 19:50:43 2006 +0100
+++ b/sys/vfs/nullfs/null_vnops.c	Tue Feb  7 19:55:06 2006 +0100
@@ -109,6 +109,8 @@
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
+#include <sys/namecache.h>
+#include <sys/nlookup.h>
 #include "null.h"
 
 static int	null_nresolve(struct vop_nresolve_args *ap);
@@ -122,18 +124,114 @@ static int	null_nrmdir(struct vop_nrmdir
 static int	null_nrmdir(struct vop_nrmdir_args *ap);
 static int	null_nrename(struct vop_nrename_args *ap);
 
+static __inline
+struct mount *
+nullfs_lowermount_l(struct namecache *ncp)
+{
+	/*
+	 * The code in use below allows passing through lower mounts.
+	 * If we didn't want to do that, we could use
+	 *
+	 *   ncp->nc_mount->mnt_ncp->nc_shadowed->nc_mount
+	 *
+	 * Eventually, the choice might be configurable.
+	 *
+	 *                  -  -  -
+	 *
+	 * Matt says in
+	 * http://leaf.dragonflybsd.org/mailarchive/kernel/2006-01/msg00023.html
+	 * :
+	 
+    The ncp->nc_mount field was never meant to be used by the VFS code...
+    only to be used internally by cache_*().  It looks like I broke my own
+    rule... I have two references in NFS, but that's for later.
+
+	 * Note that both approaches still use nc_mount:
+	 *
+	 * - If we wanna pass through lower mounts, we do have to find the
+	 *   lower fs ncp-wise, we simply have no choice.
+	 *
+	 * - If we just work with a fixed lower fs, we can access it if we
+	 *   are willing to use nc_mount. Hence it just seems stupid to keep
+	 *   around a direct reference to the lower fs, but that's of course
+	 *   feasible.
+	 */
+	return (ncp->nc_shadowed->nc_mount);
+}
+
 static int
 null_nresolve(struct vop_nresolve_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nresolve_ap(ap);
+	struct namecache *ncp = ap->a_ncp;
+	struct nlcomponent nlc;
+	struct namecache *sncp;
+	struct mount *lmp;
+	int error = 0;
+
+	if (ncp->nc_shadowed)
+		cache_put(cache_shadow_detach(ncp));
+
+	nlc.nlc_nameptr = ncp->nc_name;
+	nlc.nlc_namelen = ncp->nc_nlen;
+
+	KKASSERT(ncp->nc_parent->nc_shadowed);
+	cache_unlock(ncp);
+	sncp = cache_nlookup(ncp->nc_parent->nc_shadowed, &nlc);
+
+	if ((error = cache_shadow_attach(ncp, sncp))) {
+		cache_put(sncp);
+		if (ncp->nc_flag & NCF_UNRESOLVED)
+			cache_setvp(ncp, NULL);
+		return (error);
+	}
+
+	NULLNCDEBUG(ncp);
+	NULLNCDEBUG(ncp->nc_shadowed);
+
+checkready:
+
+	if ((ncp->nc_shadowed->nc_flag & NCF_UNRESOLVED) == 0) {
+		cache_setvp(ncp, ncp->nc_shadowed->nc_vp);
+		return (ncp->nc_shadowed->nc_error);
+	}
+
+	/*
+	 * XXX Querying/ensuring usability of the lower fs is still not done
+	 * right. As a quick hack, we do a simple test here, which will do for
+	 * avoiding the most obvious fallacies.
+	 */
+	if ((lmp = nullfs_lowermount_l(ncp)) &&
+	    (ap->a_head.a_ops = lmp->mnt_vn_use_ops)) {
+		/*
+		 * Moving down the shadow chain avoids a recursive loop
+		 * (which would end up exhausting the kernel stack).
+		 *
+		 * Otherwise it's the same whether we use ncp or
+		 * ncp->nc_shadowed -- we go for group shared ncp attributes.
+		 */
+		ap->a_ncp = ncp->nc_shadowed;
+		/*
+		 * According to cache_resolve(), the primary place for
+		 * VOP_NRESOLVE calls, the caller of the nresolve method
+		 * is the one who should take care of ncp->nc_error.
+		 */
+		ap->a_ncp->nc_error = vop_nresolve_ap(ap);
+
+		goto checkready;
+	}
+
+	return (ENXIO);
 }
 
 static int
 null_ncreate(struct vop_ncreate_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_ncreate_ap(ap);
 }
@@ -141,7 +239,12 @@ static int
 static int
 null_nmkdir(struct vop_nmkdir_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_nmkdir_ap(ap);
 }
@@ -149,7 +252,12 @@ static int
 static int
 null_nmknod(struct vop_nmknod_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_nmknod_ap(ap);
 }
@@ -157,7 +265,12 @@ static int
 static int
 null_nlink(struct vop_nlink_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_nlink_ap(ap);
 }
@@ -165,7 +278,12 @@ static int
 static int
 null_nsymlink(struct vop_nsymlink_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_nsymlink_ap(ap);
 }
@@ -173,7 +291,12 @@ static int
 static int
 null_nwhiteout(struct vop_nwhiteout_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_nwhiteout_ap(ap);
 }
@@ -181,7 +304,12 @@ static int
 static int
 null_nremove(struct vop_nremove_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_nremove_ap(ap);
 }
@@ -189,7 +317,12 @@ static int
 static int
 null_nrmdir(struct vop_nrmdir_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	struct namecache *ncp = ap->a_ncp;
+
+	NULLNCDEBUG(ap->a_ncp);
+	NULLNCDEBUG(ap->a_ncp->nc_shadowed);
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ncp->nc_shadowed;
 
 	return vop_nrmdir_ap(ap);
 }
@@ -197,13 +330,21 @@ static int
 static int
 null_nrename(struct vop_nrename_args *ap)
 {
+	struct namecache *fncp = ap->a_fncp;
+	struct namecache *tncp = ap->a_tncp;
 	struct mount *lmp;
 
-	lmp = MOUNTTONULLMOUNT(ap->a_fncp->nc_mount)->nullm_vfs;
-	if (lmp != MOUNTTONULLMOUNT(ap->a_tncp->nc_mount)->nullm_vfs)
-		return (EINVAL);
-
-	ap->a_head.a_ops = lmp->mnt_vn_norm_ops;
+	lmp = nullfs_lowermount_l(fncp);
+	if (lmp != nullfs_lowermount_l(tncp))
+		return (EXDEV);
+
+	NULLNCDEBUG(ap->a_fncp);
+	NULLNCDEBUG(ap->a_fncp->nc_shadowed);
+	NULLNCDEBUG(ap->a_tncp);
+	NULLNCDEBUG(ap->a_tncp->nc_shadowed);
+	ap->a_head.a_ops = lmp->mnt_vn_use_ops;
+	ap->a_fncp = fncp->nc_shadowed;
+	ap->a_tncp = tncp->nc_shadowed;
 
 	return vop_nrename_ap(ap);
 }
@@ -224,4 +365,3 @@ struct vnodeopv_entry_desc null_vnodeop_
 	{ &vop_nrename_desc,		(vnodeopv_entry_t) null_nrename },
 	{ NULL, NULL }
 };
-
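
For reference, all the null_n*() ops above share the same pass-through
shape. Distilled into a hypothetical op (vop_nfoo does not exist; it just
stands in for any of the vops handled above):

    static int
    null_nfoo(struct vop_nfoo_args *ap)
    {
            struct namecache *ncp = ap->a_ncp;

            /* redirect the call to the lower layer's vop vector... */
            ap->a_head.a_ops = nullfs_lowermount_l(ncp)->mnt_vn_use_ops;
            /* ...and substitute the shadowed (lower) namecache entry */
            ap->a_ncp = ncp->nc_shadowed;

            return vop_nfoo_ap(ap);
    }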