a take on cache coherency [patch, rfc]

Csaba Henk csaba.henk@creo.hu
Mon Jan 23 02:44:40 PST 2006


Hi,

For explanation and discussion, please see the related post on the kernel@ list.

The patch follows, starting at the changeset header below.

Csaba
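
In a nutshell: the patch links the namecache entries of stacked filesystems
into "shadow groups" via a new nc_shadowed pointer, and routes locking as
well as the shared resolution state (nc_vp, NCF_UNRESOLVED, nc_error) through
the head of the group. As a rough illustration of the core idea -- a sketch
only, not part of the patch; grphead_sketch() is a made-up name, and it
assumes the nc_shadowed field the patch adds to struct namecache -- finding
the group head is just a walk down the chain:

    #include <sys/namecache.h>

    /*
     * Sketch: follow the shadow chain to the group head, where the
     * group-shared state lives. The patch's _cache_grphead_l() does
     * this plus lock and refcount assertions.
     */
    static struct namecache *
    grphead_sketch(struct namecache *ncp)
    {
    	while (ncp->nc_shadowed != NULL)
    		ncp = ncp->nc_shadowed;
    	return (ncp);
    }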

# HG changeset patch
# User csaba@
# Node ID 6db92777ea99902079b1f31fe330b0a5aac96d88
# Parent  820a1f1d791e95af17294ccd87e5f76dbc1be68c
imported patch cachecoh

diff -r 820a1f1d791e -r 6db92777ea99 sys/emulation/linux/linux_stats.c
--- a/sys/emulation/linux/linux_stats.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/emulation/linux/linux_stats.c	Mon Jan 23 05:16:56 2006 +0100
@@ -250,8 +250,8 @@ linux_statfs(struct linux_statfs_args *a
 	if (error == 0)
 		error = kern_statfs(&nd, &statfs);
 	if (error == 0) {
-		if (nd.nl_ncp->nc_vp != NULL)
-			error = vn_get_namelen(nd.nl_ncp->nc_vp, &namelen);
+		if (cache_grphead_l(nd.nl_ncp)->nc_vp != NULL)
+			error = vn_get_namelen(cache_grphead_l(nd.nl_ncp)->nc_vp, &namelen);
 		else
 			error = EINVAL;
 	}
diff -r 820a1f1d791e -r 6db92777ea99 sys/emulation/svr4/svr4_misc.c
--- a/sys/emulation/svr4/svr4_misc.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/emulation/svr4/svr4_misc.c	Mon Jan 23 05:16:56 2006 +0100
@@ -1395,8 +1395,8 @@ svr4_sys_statvfs(struct svr4_sys_statvfs
 	if (error == 0)
 		error = kern_statfs(&nd, &bfs);
 	if (error == 0) {
-		if (nd.nl_ncp->nc_vp != NULL)
-			error = vn_get_namelen(nd.nl_ncp->nc_vp, &namelen);
+		if (cache_grphead_l(nd.nl_ncp)->nc_vp != NULL)
+			error = vn_get_namelen(cache_grphead_l(nd.nl_ncp)->nc_vp, &namelen);
 		else
 			error = EINVAL;
 	}
diff -r 820a1f1d791e -r 6db92777ea99 sys/kern/uipc_usrreq.c
--- a/sys/kern/uipc_usrreq.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/kern/uipc_usrreq.c	Mon Jan 23 05:16:56 2006 +0100
@@ -600,7 +600,7 @@ unp_bind(struct unpcb *unp, struct socka
 	error = nlookup_init(&nd, buf, UIO_SYSSPACE, NLC_LOCKVP|NLC_CREATE);
 	if (error == 0)
 		error = nlookup(&nd);
-	if (error == 0 && nd.nl_ncp->nc_vp != NULL)
+	if (error == 0 && cache_grphead_l(nd.nl_ncp)->nc_vp != NULL)
 		error = EADDRINUSE;
 	if (error)
 		goto done;
diff -r 820a1f1d791e -r 6db92777ea99 sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/kern/vfs_cache.c	Mon Jan 23 05:16:56 2006 +0100
@@ -198,6 +198,25 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchsta
 
 static void cache_zap(struct namecache *ncp);
 
+static __inline
+struct namecache *
+_cache_grphead_l(struct namecache *ncp)
+{
+#ifdef INVARIANTS
+	struct namecache *startncp = ncp;
+#endif
+
+	while (ncp->nc_shadowed) {
+		ncp = ncp->nc_shadowed;
+		KKASSERT(startncp != ncp);
+		KKASSERT(ncp->nc_refs > 0);
+	}
+
+	KKASSERT(ncp->nc_exlocks > 0);
+	KKASSERT(ncp->nc_locktd == curthread);
+	return(ncp);
+}
+
 /*
  * cache_hold() and cache_drop() prevent the premature deletion of a
  * namecache entry but do not prevent operations (such as zapping) on
@@ -222,15 +241,60 @@ _cache_drop(struct namecache *ncp)
 {
 	KKASSERT(ncp->nc_refs > 0);
 	if (ncp->nc_refs == 1 && 
-	    (ncp->nc_flag & NCF_UNRESOLVED) && 
+	    (ncp->nc_flag & NCF_UNRESOLVED || ncp->nc_shadowed) && 
 	    TAILQ_EMPTY(&ncp->nc_list)
 	) {
-		KKASSERT(ncp->nc_exlocks == 0);
 		cache_lock(ncp);
-		cache_zap(ncp);
-	} else {
+		KKASSERT(_cache_grphead_l(ncp)->nc_exlocks == 1);
+		if (_cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) {
+			cache_zap(ncp);
+			return;
+		}
+		cache_unlock(ncp);
+	}
+	--ncp->nc_refs;
+}
+
+static __inline
+struct namecache *
+_cache_grphead(struct namecache *ncp)
+{
+	if (ncp->nc_shadowed) {
+		/*
+		 * We need a ref to please the locking routine, but
+		 * we get rid of that ASAP. Doing that directly saves
+		 * us from a lot of headache (and some CPU cycles).
+		 */
+		cache_get(ncp);
 		--ncp->nc_refs;
-	}
+		ncp = _cache_grphead_l(ncp);
+		cache_unlock(ncp);
+	}
+
+	return(ncp);
+}
+
+/*
+ * Get the head of the shadow group when it's known to be locked.
+ *
+ * (XXX We need this routine in the API only if recursive locking
+ * is considered bad. The additional costs of the general grphead
+ * routine seem to be negligible, especially for "normal"
+ * (nc_shadowless) namecache entries.)
+ */
+struct namecache *
+cache_grphead_l(struct namecache *ncp)
+{
+	return(_cache_grphead_l(ncp));
+}
+
+/*
+ * Get the head of the shadow group.
+ */
+struct namecache *
+cache_grphead(struct namecache *ncp)
+{
+	return(_cache_grphead(ncp));
 }
 
 /*
@@ -324,6 +388,9 @@ cache_drop(struct namecache *ncp)
 	_cache_drop(ncp);
 }
 
+static void cache_lock_one(struct namecache *ncp);
+static void cache_unlock_one(struct namecache *ncp);
+
 /*
  * Namespace locking.  The caller must already hold a reference to the
  * namecache structure in order to lock/unlock it.  This function prevents
@@ -346,6 +413,29 @@ cache_drop(struct namecache *ncp)
  */
 void
 cache_lock(struct namecache *ncp)
+{
+	struct namecache *oncp;
+#ifdef INVARIANTS
+	struct namecache *startncp = ncp;
+#endif
+
+	for (;;) {
+		cache_lock_one(ncp);
+		oncp = ncp;
+		if (! (ncp = ncp->nc_shadowed))
+			break;
+		KKASSERT(ncp != startncp);
+		/*
+		 * The individual lock was used just to protect the transition.
+		 * Now that we safely know who's next, unlock the entry
+		 * and move on.
+		 */
+		cache_unlock_one(oncp);
+	}
+}
+
+static void
+cache_lock_one(struct namecache *ncp)
 {
 	thread_t td;
 	int didwarn;
@@ -398,12 +488,27 @@ cache_lock_nonblock(struct namecache *nc
 cache_lock_nonblock(struct namecache *ncp)
 {
 	thread_t td;
-
+	struct namecache *oncp;
+#ifdef INVARIANTS
+	struct namecache *startncp = ncp;
+#endif
+
+	td = curthread;
+
+step_one:
 	KKASSERT(ncp->nc_refs != 0);
-	td = curthread;
 	if (ncp->nc_exlocks == 0) {
 		ncp->nc_exlocks = 1;
 		ncp->nc_locktd = td;
+
+		if (ncp->nc_shadowed) {
+			oncp = ncp;
+			ncp = ncp->nc_shadowed;
+			KKASSERT(startncp != ncp);
+			cache_unlock_one(oncp);
+			goto step_one;
+		}
+
 		/* 
 		 * The vp associated with a locked ncp must be held
 		 * to prevent it from being recycled (which would
@@ -422,6 +527,12 @@ void
 void
 cache_unlock(struct namecache *ncp)
 {
+	cache_unlock_one(_cache_grphead_l(ncp));
+}
+
+static void
+cache_unlock_one(struct namecache *ncp)
+{
 	thread_t td = curthread;
 
 	KKASSERT(ncp->nc_refs > 0);
@@ -452,13 +563,17 @@ int
 int
 cache_get_nonblock(struct namecache *ncp)
 {
+	int error;
+
 	/* XXX MP */
-	if (ncp->nc_exlocks == 0 || ncp->nc_locktd == curthread) {
-		_cache_hold(ncp);
-		cache_lock(ncp);
-		return(0);
-	}
-	return(EWOULDBLOCK);
+	_cache_hold(ncp);
+	/*
+	 * We can't easily test whether locking would block,
+	 * so we just try to take the lock.
+	 */
+	if ((error = cache_lock_nonblock(ncp)))
+		_cache_drop(ncp);
+	return(error);
 }
 
 void
@@ -469,6 +584,61 @@ cache_put(struct namecache *ncp)
 }
 
 /*
+ * Join ncp into the shadow group of sncp.
+ * 
+ * Both entries must be locked on entry. The caller also has to hold a
+ * dedicated reference on sncp.
+ *
+ * The routine will fail and return ELOOP if the intended shadowing
+ * association would yield a loop in the shadow chain.
+ *
+ * - On success ncp will be a representative of the joint shadow group, which
+ *   then will be locked.
+ * - On failure the namecache entries will exist separately just as they did
+ *   before, in the same state.
+ */
+int
+cache_shadow_attach(struct namecache *ncp, struct namecache *sncp)
+{
+	KKASSERT(! ncp->nc_shadowed);
+	KKASSERT(! ncp->nc_vp);
+	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
+
+	if (_cache_grphead_l(sncp) == ncp) {
+		/* ncp->nc_error = ELOOP; */
+		return(ELOOP);
+	}
+
+	ncp->nc_shadowed = sncp;
+	cache_unlock_one(ncp);
+	return(0);
+}
+
+/*
+ * Take the namecache entry out of its shadow group.
+ *
+ * ncp must really shadow someone, and the shadow group must be locked
+ * upon entry.
+ *
+ * After the routine returns, ncp will be the head of a new (possibly singleton)
+ * shadow group. The routine returns the former successor of ncp in the original
+ * shadow group in a locked+ref'd state.
+ */
+struct namecache *
+cache_shadow_detach(struct namecache *ncp)
+{
+	struct namecache *sncp = ncp->nc_shadowed;
+
+	KKASSERT(sncp);
+
+	cache_lock_one(ncp);
+	ncp->nc_shadowed = NULL;
+	cache_setunresolved(ncp);
+
+	return(sncp);
+}
+
+/*
  * Resolve an unresolved ncp by associating a vnode with it.  If the
  * vnode is NULL, a negative cache entry is created.
  *
@@ -477,6 +647,8 @@ void
 void
 cache_setvp(struct namecache *ncp, struct vnode *vp)
 {
+	ncp = _cache_grphead_l(ncp);
+
 	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
 	ncp->nc_vp = vp;
 	if (vp != NULL) {
@@ -517,6 +689,8 @@ void
 void
 cache_settimeout(struct namecache *ncp, int nticks)
 {
+	ncp = _cache_grphead_l(ncp);
+
 	if ((ncp->nc_timeout = ticks + nticks) == 0)
 		ncp->nc_timeout = 1;
 }
@@ -542,6 +716,8 @@ cache_setunresolved(struct namecache *nc
 cache_setunresolved(struct namecache *ncp)
 {
 	struct vnode *vp;
+
+	ncp = _cache_grphead_l(ncp);
 
 	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 		ncp->nc_flag |= NCF_UNRESOLVED;
@@ -619,11 +795,11 @@ cache_inval(struct namecache *ncp, int f
 	struct namecache *nextkid;
 	int rcnt = 0;
 
-	KKASSERT(ncp->nc_exlocks);
+	KKASSERT(_cache_grphead_l(ncp)->nc_exlocks);
 
 	cache_setunresolved(ncp);
 	if (flags & CINV_DESTROY)
-		ncp->nc_flag |= NCF_DESTROYED;
+		_cache_grphead_l(ncp)->nc_flag |= NCF_DESTROYED;
 
 	if ((flags & CINV_CHILDREN) && 
 	    (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
@@ -634,7 +810,8 @@ cache_inval(struct namecache *ncp, int f
 			if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
 				cache_hold(nextkid);
 			if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
-			    TAILQ_FIRST(&kid->nc_list)
+			    TAILQ_FIRST(&kid->nc_list) ||
+			    kid->nc_shadowed
 			) {
 				cache_lock(kid);
 				rcnt += cache_inval(kid, flags & ~CINV_DESTROY);
@@ -650,7 +827,7 @@ cache_inval(struct namecache *ncp, int f
 	 * Someone could have gotten in there while ncp was unlocked,
 	 * retry if so.
 	 */
-	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
+	if ((_cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) == 0)
 		++rcnt;
 	return (rcnt);
 }
@@ -774,6 +951,8 @@ cache_vget(struct namecache *ncp, struct
 	struct vnode *vp;
 	int error;
 
+	ncp = _cache_grphead(ncp);
+
 again:
 	vp = NULL;
 	if (ncp->nc_flag & NCF_UNRESOLVED) {
@@ -805,6 +984,8 @@ cache_vref(struct namecache *ncp, struct
 {
 	struct vnode *vp;
 	int error;
+
+	ncp = _cache_grphead(ncp);
 
 again:
 	vp = NULL;
@@ -1121,9 +1302,10 @@ again:
 			goto again;
 	}
 	if (rncp) {
+		struct namecache *srncp = _cache_grphead_l(rncp);
 		vrele(pvp);
-		if (rncp->nc_flag & NCF_UNRESOLVED) {
-			cache_setvp(rncp, dvp);
+		if (srncp->nc_flag & NCF_UNRESOLVED) {
+			cache_setvp(srncp, dvp);
 			if (ncvp_debug >= 2) {
 				printf("cache_inefficient_scan: setvp %s/%s = %p\n",
 					ncp->nc_name, rncp->nc_name, dvp);
@@ -1132,11 +1314,11 @@ again:
 			if (ncvp_debug >= 2) {
 				printf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n", 
 					ncp->nc_name, rncp->nc_name, dvp,
-					rncp->nc_vp);
+					srncp->nc_vp);
 			}
 		}
-		if (rncp->nc_vp == NULL)
-			error = rncp->nc_error;
+		if (srncp->nc_vp == NULL)
+			error = srncp->nc_error;
 		cache_put(rncp);
 	} else {
 		printf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
@@ -1179,7 +1361,7 @@ cache_zap(struct namecache *ncp)
 	 * We only scrap unref'd (other then our ref) unresolved entries,
 	 * we do not scrap 'live' entries.
 	 */
-	while (ncp->nc_flag & NCF_UNRESOLVED) {
+	while (_cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) {
 		/*
 		 * Someone other then us has a ref, stop.
 		 */
@@ -1206,6 +1388,9 @@ cache_zap(struct namecache *ncp)
 			if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
 				vdrop(par->nc_vp);
 		}
+
+		if (ncp->nc_shadowed)
+			cache_put(cache_shadow_detach(ncp));
 
 		/*
 		 * ncp should not have picked up any refs.  Physically
@@ -1303,6 +1488,7 @@ cache_nlookup(struct namecache *par, str
 cache_nlookup(struct namecache *par, struct nlcomponent *nlc)
 {
 	struct namecache *ncp;
+	struct namecache *sncp;
 	struct namecache *new_ncp;
 	struct nchashhead *nchpp;
 	u_int32_t hash;
@@ -1319,15 +1505,16 @@ cache_nlookup(struct namecache *par, str
 	new_ncp = NULL;
 restart:
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+		sncp = _cache_grphead(ncp);
 		numchecks++;
 
 		/*
 		 * Zap entries that have timed out.
 		 */
-		if (ncp->nc_timeout && 
-		    (int)(ncp->nc_timeout - ticks) < 0 &&
-		    (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
-		    ncp->nc_exlocks == 0
+		if (sncp->nc_timeout && 
+		    (int)(sncp->nc_timeout - ticks) < 0 &&
+		    (sncp->nc_flag & NCF_UNRESOLVED) == 0 &&
+		    sncp->nc_exlocks == 0
 		) {
 			cache_zap(cache_get(ncp));
 			goto restart;
@@ -1341,7 +1528,7 @@ restart:
 		if (ncp->nc_parent == par &&
 		    ncp->nc_nlen == nlc->nlc_namelen &&
 		    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
-		    (ncp->nc_flag & NCF_DESTROYED) == 0
+		    (sncp->nc_flag & NCF_DESTROYED) == 0
 		) {
 			if (cache_get_nonblock(ncp) == 0) {
 				if (new_ncp)
@@ -1414,15 +1601,15 @@ int
 int
 cache_resolve(struct namecache *ncp, struct ucred *cred)
 {
-	struct namecache *par;
+	struct namecache *par, *sncp;
 	int error;
 
 restart:
 	/*
 	 * If the ncp is already resolved we have nothing to do.
 	 */
-	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
-		return (ncp->nc_error);
+	if ((_cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) == 0)
+		return (_cache_grphead_l(ncp)->nc_error);
 
 	/*
 	 * Mount points need special handling because the parent does not
@@ -1461,17 +1648,20 @@ restart:
 	 * not occur all that often, or if it does not have to go back too
 	 * many nodes to resolve the ncp.
 	 */
-	while (ncp->nc_parent->nc_vp == NULL) {
+	while (_cache_grphead(ncp->nc_parent)->nc_vp == NULL) {
+		struct namecache *spar;
+
 		/*
 		 * This case can occur if a process is CD'd into a
 		 * directory which is then rmdir'd.  If the parent is marked
 		 * destroyed there is no point trying to resolve it.
 		 */
-		if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
+		if (_cache_grphead(ncp->nc_parent)->nc_flag & NCF_DESTROYED)
 			return(ENOENT);
 
 		par = ncp->nc_parent;
-		while (par->nc_parent && par->nc_parent->nc_vp == NULL)
+		while (par->nc_parent &&
+		       _cache_grphead(par->nc_parent)->nc_vp == NULL)
 			par = par->nc_parent;
 		if (par->nc_parent == NULL) {
 			printf("EXDEV case 2 %*.*s\n",
@@ -1488,20 +1678,23 @@ restart:
 		 * will handle any moves.
 		 */
 		cache_get(par);
+		spar = _cache_grphead_l(par);
 		if (par->nc_flag & NCF_MOUNTPT) {
 			cache_resolve_mp(par);
-		} else if (par->nc_parent->nc_vp == NULL) {
+		} else if (_cache_grphead(par->nc_parent)->nc_vp == NULL) {
 			printf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
 			cache_put(par);
 			continue;
-		} else if (par->nc_flag & NCF_UNRESOLVED) {
-			par->nc_error = VOP_NRESOLVE(par, cred);
-		}
-		if ((error = par->nc_error) != 0) {
-			if (par->nc_error != EAGAIN) {
+		} else if (spar->nc_flag & NCF_UNRESOLVED) {
+			error = VOP_NRESOLVE(par, cred);
+			spar = _cache_grphead_l(par);
+			spar->nc_error = error;
+		}
+		if ((error = spar->nc_error) != 0) {
+			if (spar->nc_error != EAGAIN) {
 				printf("EXDEV case 3 %*.*s error %d\n",
 				    par->nc_nlen, par->nc_nlen, par->nc_name,
-				    par->nc_error);
+				    spar->nc_error);
 				cache_put(par);
 				return(error);
 			}
@@ -1521,14 +1714,16 @@ restart:
 	 * ncp must already be resolved.
 	 */
 	KKASSERT((ncp->nc_flag & NCF_MOUNTPT) == 0);
-	ncp->nc_error = VOP_NRESOLVE(ncp, cred);
-	/*vop_nresolve(*ncp->nc_parent->nc_vp->v_ops, ncp, cred);*/
-	if (ncp->nc_error == EAGAIN) {
+	error = VOP_NRESOLVE(ncp, cred);
+	sncp = _cache_grphead_l(ncp);
+	sncp->nc_error = error;
+	/*vop_nresolve(*_cache_grphead_unlocked(ncp->nc_parent)->nc_vp->v_ops, ncp, cred);*/
+	if (error == EAGAIN) {
 		printf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
 			ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
 		goto restart;
 	}
-	return(ncp->nc_error);
+	return(error);
 }
 
 /*
@@ -1549,6 +1744,8 @@ cache_resolve_mp(struct namecache *ncp)
 	struct vnode *vp;
 	struct mount *mp = ncp->nc_mount;
 	int error;
+
+        ncp = _cache_grphead_l(ncp);
 
 	KKASSERT(mp != NULL);
 	if (ncp->nc_flag & NCF_UNRESOLVED) {
diff -r 820a1f1d791e -r 6db92777ea99 sys/kern/vfs_default.c
--- a/sys/kern/vfs_default.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/kern/vfs_default.c	Mon Jan 23 05:16:56 2006 +0100
@@ -203,7 +203,7 @@ vop_compat_nresolve(struct vop_nresolve_
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	/*
@@ -234,7 +234,7 @@ vop_compat_nresolve(struct vop_nresolve_
 		VOP_UNLOCK(vp, 0, curthread);
 	if ((cnp.cn_flags & CNP_PDIRUNLOCK) == 0)
 		VOP_UNLOCK(dvp, 0, curthread);
-	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
+	if ((cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) == 0) {
 		/* was resolved by another process while we were unlocked */
 		if (error == 0)
 			vrele(vp);
@@ -245,7 +245,7 @@ vop_compat_nresolve(struct vop_nresolve_
 	} else if (error == ENOENT) {
 		KKASSERT(vp == NULL);
 		if (cnp.cn_flags & CNP_ISWHITEOUT)
-			ncp->nc_flag |= NCF_WHITEOUT;
+			cache_grphead_l(ncp)->nc_flag |= NCF_WHITEOUT;
 		cache_setvp(ncp, NULL);
 	}
 	vrele(dvp);
@@ -338,7 +338,7 @@ vop_compat_ncreate(struct vop_ncreate_ar
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -421,7 +421,7 @@ vop_compat_nmkdir(struct vop_nmkdir_args
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -504,7 +504,7 @@ vop_compat_nmknod(struct vop_nmknod_args
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -586,7 +586,7 @@ vop_compat_nlink(struct vop_nlink_args *
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -660,7 +660,7 @@ vop_compat_nsymlink(struct vop_nsymlink_
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -746,7 +746,7 @@ vop_compat_nwhiteout(struct vop_nwhiteou
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -836,7 +836,7 @@ vop_compat_nremove(struct vop_nremove_ar
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -912,7 +912,7 @@ vop_compat_nrmdir(struct vop_nrmdir_args
 		return(EPERM);
 	if (ncp->nc_parent == NULL)
 		return(EPERM);
-	if ((dvp = ncp->nc_parent->nc_vp) == NULL)
+	if ((dvp = cache_grphead(ncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
@@ -1005,7 +1005,7 @@ vop_compat_nrename(struct vop_nrename_ar
 		return(EPERM);
 	if (fncp->nc_parent == NULL)
 		return(EPERM);
-	if ((fdvp = fncp->nc_parent->nc_vp) == NULL)
+	if ((fdvp = cache_grphead(fncp->nc_parent)->nc_vp) == NULL)
 		return(EPERM);
 
 	/*
@@ -1064,7 +1064,7 @@ vop_compat_nrename(struct vop_nrename_ar
 		error = EPERM;
 	if (tncp->nc_parent == NULL)
 		error = EPERM;
-	if ((tdvp = tncp->nc_parent->nc_vp) == NULL)
+	if ((tdvp = cache_grphead(tncp->nc_parent)->nc_vp) == NULL)
 		error = EPERM;
 	if (error) {
 		vrele(fdvp);
diff -r 820a1f1d791e -r 6db92777ea99 sys/kern/vfs_journal.c
--- a/sys/kern/vfs_journal.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/kern/vfs_journal.c	Mon Jan 23 05:16:56 2006 +0100
@@ -1824,7 +1824,7 @@ jrecord_write_vnode_ref(struct jrecord *
     struct namecache *ncp;
 
     TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
-	if ((ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
+	if ((cache_grphead(ncp)->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
 	    break;
     }
     if (ncp)
@@ -1840,7 +1840,7 @@ jrecord_write_vnode_link(struct jrecord 
     TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
 	if (ncp == notncp)
 	    continue;
-	if ((ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
+	if ((cache_grphead(ncp)->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
 	    break;
     }
     if (ncp)
@@ -2533,7 +2533,7 @@ journal_nremove(struct vop_nremove_args 
 
     mp = ap->a_head.a_ops->vv_mount;
     if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_REMOVE) &&
-	ap->a_ncp->nc_vp
+	cache_grphead(ap->a_ncp)->nc_vp
     ) {
 	jreclist_undo_file(&jreclist, ap->a_ncp->nc_vp, 
 			   JRUNDO_ALL|JRUNDO_GETVP|JRUNDO_CONDLINK, 0, -1);
@@ -2599,7 +2599,7 @@ journal_nrmdir(struct vop_nrmdir_args *a
 
     mp = ap->a_head.a_ops->vv_mount;
     if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_RMDIR)) {
-	jreclist_undo_file(&jreclist, ap->a_ncp->nc_vp,
+	jreclist_undo_file(&jreclist, cache_grphead(ap->a_ncp)->nc_vp,
 			   JRUNDO_VATTR|JRUNDO_GETVP, 0, 0);
     }
     error = vop_journal_operate_ap(&ap->a_head);
@@ -2628,9 +2628,9 @@ journal_nrename(struct vop_nrename_args 
 
     mp = ap->a_head.a_ops->vv_mount;
     if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_RENAME) &&
-	ap->a_tncp->nc_vp
+	cache_grphead(ap->a_tncp)->nc_vp
     ) {
-	jreclist_undo_file(&jreclist, ap->a_tncp->nc_vp, 
+	jreclist_undo_file(&jreclist, cache_grphead(ap->a_tncp)->nc_vp, 
 			   JRUNDO_ALL|JRUNDO_GETVP|JRUNDO_CONDLINK, 0, -1);
     }
     error = vop_journal_operate_ap(&ap->a_head);
diff -r 820a1f1d791e -r 6db92777ea99 sys/kern/vfs_nlookup.c
--- a/sys/kern/vfs_nlookup.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/kern/vfs_nlookup.c	Mon Jan 23 05:16:56 2006 +0100
@@ -381,13 +381,15 @@ nlookup(struct nlookupdata *nd)
 		ncp = cache_get(ncp);
 	    } else {
 		while ((ncp->nc_flag & NCF_MOUNTPT) && ncp != nd->nl_rootncp) {
-		    if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
+		    if (cache_grphead(ncp->nc_parent)->nc_flag &
+		        NCF_DESTROYED)
 			break;
 		    ncp = ncp->nc_parent;	/* get to underlying node */
 		    KKASSERT(ncp != NULL && 1);
 		}
 		if (ncp != nd->nl_rootncp) {
-			if (ncp->nc_parent->nc_flag & NCF_DESTROYED) {
+			if (cache_grphead(ncp->nc_parent)->nc_flag &
+			    NCF_DESTROYED) {
 				error = EINVAL;
 				break;
 			}
@@ -421,11 +423,11 @@ nlookup(struct nlookupdata *nd)
 	 * XXX neither '.' nor '..' should return EAGAIN since they were
 	 * previously resolved and thus cannot be newly created ncp's.
 	 */
-	if (ncp->nc_flag & NCF_UNRESOLVED) {
+	if (cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) {
 	    error = cache_resolve(ncp, nd->nl_cred);
 	    KKASSERT(error != EAGAIN);
 	} else {
-	    error = ncp->nc_error;
+	    error = cache_grphead_l(ncp)->nc_error;
 	}
 
 	/*
@@ -459,7 +461,7 @@ nlookup(struct nlookupdata *nd)
 	 * element or it is the last element and we are allowed to
 	 * follow symlinks, resolve the symlink.
 	 */
-	if ((ncp->nc_flag & NCF_ISSYMLINK) &&
+	if ((cache_grphead_l(ncp)->nc_flag & NCF_ISSYMLINK) &&
 	    (*ptr || (nd->nl_flags & NLC_FOLLOW))
 	) {
 	    if (nd->nl_loopcnt++ >= MAXSYMLINKS) {
@@ -509,24 +511,31 @@ nlookup(struct nlookupdata *nd)
 	 *
 	 * XXX NOCROSSMOUNT
 	 */
-	while ((ncp->nc_flag & NCF_ISDIR) && ncp->nc_vp->v_mountedhere &&
+	while ((cache_grphead_l(ncp)->nc_flag & NCF_ISDIR) &&
+	        cache_grphead_l(ncp)->nc_vp->v_mountedhere &&
 		(nd->nl_flags & NLC_NOCROSSMOUNT) == 0
 	) {
 	    struct mount *mp;
 	    struct vnode *tdp;
 
-	    mp = ncp->nc_vp->v_mountedhere;
+	    mp = cache_grphead_l(ncp)->nc_vp->v_mountedhere;
 	    cache_put(ncp);
 	    ncp = cache_get(mp->mnt_ncp);
 
-	    if (ncp->nc_flag & NCF_UNRESOLVED) {
+	    if (cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) {
 		while (vfs_busy(mp, 0, nd->nl_td))
 		    ;
 		error = VFS_ROOT(mp, &tdp);
 		vfs_unbusy(mp, nd->nl_td);
 		if (error)
 		    break;
-		cache_setvp(ncp, tdp);
+		/*
+		 * The VFS_ROOT call might have the side effect of
+		 * resolving the ncp. Or is that declared a sin
+		 * anywhere?
+		 */
+	    	if (cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED)
+			cache_setvp(ncp, tdp);
 		vput(tdp);
 	    }
 	}
@@ -543,7 +552,7 @@ nlookup(struct nlookupdata *nd)
 	 * to the failure case below.
 	 */
 	while (*ptr == '/') {
-	    if ((ncp->nc_flag & NCF_ISDIR) == 0 && 
+	    if ((cache_grphead_l(ncp)->nc_flag & NCF_ISDIR) == 0 && 
 		!(nd->nl_flags & NLC_WILLBEDIR)
 	    ) {
 		break;
@@ -555,7 +564,7 @@ nlookup(struct nlookupdata *nd)
 	 * Continuation case: additional elements and the current
 	 * element is a directory.
 	 */
-	if (*ptr && (ncp->nc_flag & NCF_ISDIR)) {
+	if (*ptr && (cache_grphead_l(ncp)->nc_flag & NCF_ISDIR)) {
 	    cache_drop(nd->nl_ncp);
 	    cache_unlock(ncp);
 	    nd->nl_ncp = ncp;
@@ -619,7 +628,7 @@ nlookup_mp(struct mount *mp, struct name
     error = 0;
     ncp = mp->mnt_ncp;
     cache_get(ncp);
-    if (ncp->nc_flag & NCF_UNRESOLVED) {
+    if (cache_grphead_l(ncp)->nc_flag & NCF_UNRESOLVED) {
 	while (vfs_busy(mp, 0, curthread))
 	    ;
 	error = VFS_ROOT(mp, &vp);
@@ -655,7 +664,7 @@ nreadsymlink(struct nlookupdata *nd, str
 
     nlc->nlc_nameptr = NULL;
     nlc->nlc_namelen = 0;
-    if (ncp->nc_vp == NULL)
+    if (cache_grphead_l(ncp)->nc_vp == NULL)
 	return(ENOENT);
     if ((error = cache_vget(ncp, nd->nl_cred, LK_SHARED, &vp)) != 0)
 	return(error);
@@ -713,13 +722,14 @@ int
 int
 naccess(struct namecache *ncp, int vmode, struct ucred *cred)
 {
-    struct namecache *par;
+    struct namecache *par, *oncp = ncp;
     struct vnode *vp;
     struct vattr va;
     int error;
 
-    if (ncp->nc_flag & NCF_UNRESOLVED) {
+    if (ncp->nc_flag & NCF_UNRESOLVED || ncp->nc_shadowed) {
 	cache_lock(ncp);
+	ncp = cache_grphead_l(ncp);
 	cache_resolve(ncp, cred);
 	cache_unlock(ncp);
     }
@@ -728,7 +738,7 @@ naccess(struct namecache *ncp, int vmode
 	if (((vmode & VCREATE) && ncp->nc_vp == NULL) ||
 	    ((vmode & VDELETE) && ncp->nc_vp != NULL)
 	) {
-	    if ((par = ncp->nc_parent) == NULL) {
+	    if ((par = oncp->nc_parent) == NULL) {
 		if (error != EAGAIN)
 			error = EINVAL;
 	    } else {
diff -r 820a1f1d791e -r 6db92777ea99 sys/kern/vfs_syscalls.c
--- a/sys/kern/vfs_syscalls.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/kern/vfs_syscalls.c	Mon Jan 23 05:16:56 2006 +0100
@@ -140,7 +140,7 @@ mount(struct mount_args *uap)
 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
 	if (error == 0) {
 		if ((error = nlookup(&nd)) == 0) {
-			if (nd.nl_ncp->nc_vp == NULL)
+			if (cache_grphead_l(nd.nl_ncp)->nc_vp == NULL)
 				error = ENOENT;
 		}
 	}
@@ -159,7 +159,7 @@ mount(struct mount_args *uap)
 	/*
 	 * now we have the locked ref'd ncp and unreferenced vnode.
 	 */
-	vp = ncp->nc_vp;
+	vp = cache_grphead_l(ncp)->nc_vp;
 	if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
 		cache_put(ncp);
 		return (error);
@@ -1050,9 +1050,9 @@ fchdir(struct fchdir_args *uap)
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		error = nlookup_mp(mp, &nct);
 		if (error == 0) {
+			vput(vp);
+			vp = cache_grphead_l(nct)->nc_vp;
 			cache_unlock(nct);	/* leave ref intact */
-			vput(vp);
-			vp = nct->nc_vp;
 			error = vget(vp, LK_SHARED, td);
 			KKASSERT(error == 0);
 			cache_drop(ncp);
@@ -1086,7 +1086,7 @@ kern_chdir(struct nlookupdata *nd)
 
 	if ((error = nlookup(nd)) != 0)
 		return (error);
-	if ((vp = nd->nl_ncp->nc_vp) == NULL)
+	if ((vp = cache_grphead_l(nd->nl_ncp)->nc_vp) == NULL)
 		return (ENOENT);
 	if ((error = vget(vp, LK_SHARED, td)) != 0)
 		return (error);
@@ -1192,7 +1192,7 @@ kern_chroot(struct namecache *ncp)
 		if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
 			return (error);
 	}
-	if ((vp = ncp->nc_vp) == NULL)
+	if ((vp = cache_grphead_l(ncp)->nc_vp) == NULL)
 		return (ENOENT);
 
 	if ((error = vget(vp, LK_SHARED, td)) != 0)
@@ -1464,7 +1464,7 @@ kern_mknod(struct nlookupdata *nd, int m
 	if ((error = nlookup(nd)) != 0)
 		return (error);
 	ncp = nd->nl_ncp;
-	if (ncp->nc_vp)
+	if (cache_grphead_l(ncp)->nc_vp)
 		return (EEXIST);
 
 	VATTR_NULL(&vattr);
@@ -1536,7 +1536,7 @@ kern_mkfifo(struct nlookupdata *nd, int 
 	if ((error = nlookup(nd)) != 0)
 		return (error);
 	ncp = nd->nl_ncp;
-	if (ncp->nc_vp)
+	if (cache_grphead_l(ncp)->nc_vp)
 		return (EEXIST);
 
 	VATTR_NULL(&vattr);
@@ -1633,7 +1633,7 @@ kern_link(struct nlookupdata *nd, struct
 	bwillwrite();
 	if ((error = nlookup(nd)) != 0)
 		return (error);
-	vp = nd->nl_ncp->nc_vp;
+	vp = cache_grphead_l(nd->nl_ncp)->nc_vp;
 	KKASSERT(vp != NULL);
 	if (vp->v_type == VDIR)
 		return (EPERM);		/* POSIX */
@@ -1654,7 +1654,7 @@ kern_link(struct nlookupdata *nd, struct
 		vput(vp);
 		return (error);
 	}
-	if (linknd->nl_ncp->nc_vp) {
+	if (cache_grphead_l(linknd->nl_ncp)->nc_vp) {
 		vput(vp);
 		return (EEXIST);
 	}
@@ -1704,7 +1704,7 @@ kern_symlink(struct nlookupdata *nd, cha
 	if ((error = nlookup(nd)) != 0)
 		return (error);
 	ncp = nd->nl_ncp;
-	if (ncp->nc_vp)
+	if (cache_grphead_l(ncp)->nc_vp)
 		return (EEXIST);
 
 	VATTR_NULL(&vattr);
@@ -1922,7 +1922,7 @@ kern_stat(struct nlookupdata *nd, struct
 	if ((error = nlookup(nd)) != 0)
 		return (error);
 again:
-	if ((vp = nd->nl_ncp->nc_vp) == NULL)
+	if ((vp = cache_grphead_l(nd->nl_ncp)->nc_vp) == NULL)
 		return (ENOENT);
 
 	td = curthread;
@@ -2718,13 +2718,17 @@ kern_rename(struct nlookupdata *fromnd, 
 	 * Due to relocking of the source, fromnd->nl_ncp->nc_vp might have
 	 * become NULL.
 	 */
-	if (tond->nl_ncp->nc_vp) {
-		if (fromnd->nl_ncp->nc_vp == NULL) {
+	/*
+	 * XXX I was too lazy to find out exactly who is locked, so just
+	 * dumbly cache_grphead() the parties...
+	 */
+	if (cache_grphead(tond->nl_ncp)->nc_vp) {
+		if (cache_grphead(fromnd->nl_ncp)->nc_vp == NULL) {
 			error = ENOENT;
-		} else if (fromnd->nl_ncp->nc_vp->v_type == VDIR) {
-			if (tond->nl_ncp->nc_vp->v_type != VDIR)
+		} else if (cache_grphead(fromnd->nl_ncp)->nc_vp->v_type == VDIR) {
+			if (cache_grphead(tond->nl_ncp)->nc_vp->v_type != VDIR)
 				error = ENOTDIR;
-		} else if (tond->nl_ncp->nc_vp->v_type == VDIR) {
+		} else if (cache_grphead(tond->nl_ncp)->nc_vp->v_type == VDIR) {
 			error = EISDIR;
 		}
 	}
@@ -2753,7 +2757,7 @@ kern_rename(struct nlookupdata *fromnd, 
 	 * when we detect the situation.
 	 */
 	if (error == 0) {
-		if (fromnd->nl_ncp->nc_vp == tond->nl_ncp->nc_vp) {
+		if (cache_grphead(fromnd->nl_ncp)->nc_vp == cache_grphead(tond->nl_ncp)->nc_vp) {
 			error = VOP_NREMOVE(fromnd->nl_ncp, fromnd->nl_cred);
 		} else {
 			error = VOP_NRENAME(fromnd->nl_ncp, tond->nl_ncp, 
@@ -2802,7 +2806,7 @@ kern_mkdir(struct nlookupdata *nd, int m
 		return (error);
 
 	ncp = nd->nl_ncp;
-	if (ncp->nc_vp)
+	if (cache_grphead_l(ncp)->nc_vp)
 		return (EEXIST);
 
 	VATTR_NULL(&vattr);
diff -r 820a1f1d791e -r 6db92777ea99 sys/kern/vfs_vnops.c
--- a/sys/kern/vfs_vnops.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/kern/vfs_vnops.c	Mon Jan 23 05:16:56 2006 +0100
@@ -169,7 +169,7 @@ vn_open(struct nlookupdata *nd, struct f
 	 */
 again:
 	if (fmode & O_CREAT) {
-		if (ncp->nc_vp == NULL) {
+		if (cache_grphead_l(ncp)->nc_vp == NULL) {
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
diff -r 820a1f1d791e -r 6db92777ea99 sys/sys/namecache.h
--- a/sys/sys/namecache.h	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/sys/namecache.h	Mon Jan 23 05:16:56 2006 +0100
@@ -102,6 +102,7 @@ struct namecache {
     TAILQ_ENTRY(namecache) nc_vnode;	/* scan via vnode->v_namecache */
     struct namecache_list  nc_list;	/* list of children */
     struct namecache *nc_parent;	/* namecache entry for parent */
+    struct namecache *nc_shadowed;	/* lower layer entry in layered fs */
     struct	vnode *nc_vp;		/* vnode representing name or NULL */
     int		nc_refs;		/* ref count prevents deletion */
     u_short	nc_flag;
@@ -150,6 +151,10 @@ void	cache_lock(struct namecache *ncp);
 void	cache_lock(struct namecache *ncp);
 int	cache_lock_nonblock(struct namecache *ncp);
 void	cache_unlock(struct namecache *ncp);
+struct namecache *cache_grphead_l(struct namecache *ncp);
+struct namecache *cache_grphead(struct namecache *ncp);
+int	cache_shadow_attach(struct namecache *ncp, struct namecache *sncp);
+struct namecache *cache_shadow_detach(struct namecache *ncp);
 void	cache_setvp(struct namecache *ncp, struct vnode *vp);
 void	cache_settimeout(struct namecache *ncp, int nticks);
 void	cache_setunresolved(struct namecache *ncp);
diff -r 820a1f1d791e -r 6db92777ea99 sys/vfs/nfs/nfs_serv.c
--- a/sys/vfs/nfs/nfs_serv.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/vfs/nfs/nfs_serv.c	Mon Jan 23 05:16:56 2006 +0100
@@ -2183,8 +2183,8 @@ nfsrv_rename(struct nfsrv_descript *nfsd
 	}
 	fromnd.nl_flags |= NLC_NCPISLOCKED;
 
-	tvp = tond.nl_ncp->nc_vp;
-	fvp = fromnd.nl_ncp->nc_vp;
+	tvp = cache_grphead_l(tond.nl_ncp)->nc_vp;
+	fvp = cache_grphead_l(fromnd.nl_ncp)->nc_vp;
 
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
diff -r 820a1f1d791e -r 6db92777ea99 sys/vfs/nfs/nfs_subs.c
--- a/sys/vfs/nfs/nfs_subs.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/vfs/nfs/nfs_subs.c	Mon Jan 23 05:16:56 2006 +0100
@@ -1671,7 +1671,7 @@ nfs_namei(struct nlookupdata *nd, struct
 				error = ENXIO;
 			}
 		}
-		if (vpp && ncp->nc_vp) {
+		if (vpp && cache_grphead_l(ncp)->nc_vp) {
 			error = cache_vget(ncp, nd->nl_cred, LK_EXCLUSIVE, vpp);
 		}
 		if (error) {
diff -r 820a1f1d791e -r 6db92777ea99 sys/vfs/nfs/nfs_vnops.c
--- a/sys/vfs/nfs/nfs_vnops.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/vfs/nfs/nfs_vnops.c	Mon Jan 23 05:16:56 2006 +0100
@@ -883,8 +883,8 @@ nfs_nresolve(struct vop_nresolve_args *a
 	cred = ap->a_cred;
 	ncp = ap->a_ncp;
 
-	KKASSERT(ncp->nc_parent && ncp->nc_parent->nc_vp);
-	dvp = ncp->nc_parent->nc_vp;
+	KKASSERT(ncp->nc_parent && cache_grphead(ncp->nc_parent)->nc_vp);
+	dvp = cache_grphead(ncp->nc_parent)->nc_vp;
 	if ((error = vget(dvp, LK_SHARED, td)) != 0)
 		return (error);
 
diff -r 820a1f1d791e -r 6db92777ea99 sys/vfs/nullfs/null.h
--- a/sys/vfs/nullfs/null.h	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/vfs/nullfs/null.h	Mon Jan 23 05:16:56 2006 +0100
@@ -43,18 +43,19 @@ struct null_args {
 	char		*target;	/* Target of loopback  */
 };
 
-struct null_mount {
-	struct mount	*nullm_vfs;
-	struct vnode	*nullm_rootvp;	/* Reference to root null_node */
-};
-
 #ifdef _KERNEL
-#define	MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
 
 #ifdef NULLFS_DEBUG
-#define NULLFSDEBUG(format, args...) printf(format ,## args)
+#define NULLFSDEBUG(format, args...) \
+	printf("[nullfs] %s:%d: " format, __func__, __LINE__, ## args)
+#define	NULLNCDEBUG(ncp) \
+        NULLFSDEBUG(#ncp " %p: name %s, refs %d, exlocks %d, " \
+                    "nc_mount %p, nc_shadowed %p\n", \
+                    (ncp), (ncp)->nc_name, (ncp)->nc_refs, (ncp)->nc_exlocks, \
+                    (ncp)->nc_mount, (ncp)->nc_shadowed);
 #else
 #define NULLFSDEBUG(format, args...)
+#define NULLNCDEBUG(ncp)
 #endif /* NULLFS_DEBUG */
 
 #endif /* _KERNEL */
diff -r 820a1f1d791e -r 6db92777ea99 sys/vfs/nullfs/null_vfsops.c
--- a/sys/vfs/nullfs/null_vfsops.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/vfs/nullfs/null_vfsops.c	Mon Jan 23 05:16:56 2006 +0100
@@ -80,12 +80,10 @@ nullfs_mount(struct mount *mp, char *pat
 {
 	int error = 0;
 	struct null_args args;
-	struct vnode *rootvp;
-	struct null_mount *xmp;
 	u_int size;
 	struct nlookupdata nd;
 
-	NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
+	NULLFSDEBUG("mp %p\n", (void *)mp);
 
 	/*
 	 * Update is a no-op
@@ -98,118 +96,118 @@ nullfs_mount(struct mount *mp, char *pat
 	 * Get argument
 	 */
 	error = copyin(data, (caddr_t)&args, sizeof(struct null_args));
-	if (error)
-		return (error);
-
-	/*
-	 * Find lower node
-	 */
-	rootvp = NULL;
-	error = nlookup_init(&nd, args.target, UIO_USERSPACE, NLC_FOLLOW);
+
+	/*
+	 * Do a lookup just to see if things are not fundamentally broken...
+	 * but it's too early to make proper use of the result.
+	 */
+	if (error == 0)
+		error = nlookup_init(&nd, args.target, UIO_USERSPACE,
+				     NLC_FOLLOW);
 	if (error == 0)
 		error = nlookup(&nd);
-	if (error == 0) {
-		error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE, 
-					&rootvp);
-	}
-
-	xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
-				M_NULLFSMNT, M_WAITOK);	/* XXX */
-
-	/*
-	 * Save reference to underlying FS
-	 */
-        /*
-         * As lite stacking enters the scene, the old way of doing this
-	 * -- via the vnode -- is not good enough anymore...
-	 */
-	xmp->nullm_vfs = nd.nl_ncp->nc_mount;
+	if (error)
+		return(error);
+
 	nlookup_done(&nd);
 
-	vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, 
-			 null_vnodeop_entries, 0);
-
-	VOP_UNLOCK(rootvp, 0, td);
-
-	/*
-	 * Keep a held reference to the root vnode.
-	 * It is vrele'd in nullfs_unmount.
-	 */
-	xmp->nullm_rootvp = rootvp;
-	/*
-	 * XXX What's the proper safety condition for querying
-	 * the underlying mount? Is this flag tuning necessary
-	 * at all?
-	 */
-	if (xmp->nullm_vfs->mnt_flag & MNT_LOCAL)
-		mp->mnt_flag |= MNT_LOCAL;
-	mp->mnt_data = (qaddr_t) xmp;
-	vfs_getnewfsid(mp);
+	vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, null_vnodeop_entries, 0);
+
+	/*
+	 * Heck it, let it just be local. I bet I'd need only five minutes to
+	 * come up with a sound-sounding meaning for "local" by which null mounts
+	 * are always local.
+	 */
+	mp->mnt_flag |= MNT_LOCAL; vfs_getnewfsid(mp);
 
 	(void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
 	    &size);
 	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
-	(void)nullfs_statfs(mp, &mp->mnt_stat, td);
-	NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
-		mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromname);
+	NULLFSDEBUG("lower %s, alias at %s\n",
+	            mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
 	return (0);
 }
 
-/*
- * Free reference to null layer
- */
 static int
 nullfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
-	void *mntdata;
-	int flags = 0;
-
-	NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
-
-	if (mntflags & MNT_FORCE)
-		flags |= FORCECLOSE;
-
-	/*
-	 * Finally, throw away the null_mount structure
-	 */
-	mntdata = mp->mnt_data;
-	mp->mnt_data = 0;
-	free(mntdata, M_NULLFSMNT);
+	NULLFSDEBUG("mp %p\n", (void *)mp);
+
+	cache_lock(mp->mnt_ncp);
+	cache_put(cache_shadow_detach(mp->mnt_ncp));
+	cache_unlock(mp->mnt_ncp);
+
 	return 0;
 }
 
 static int
+nullfs_start(struct mount *mp, int flags, struct thread *td)
+{
+	int error;
+	struct nlookupdata nd;
+
+	NULLFSDEBUG("nlookup %s\n", mp->mnt_stat.f_mntfromname);
+
+	error = nlookup_init(&nd, mp->mnt_stat.f_mntfromname,
+		                     UIO_SYSSPACE, NLC_FOLLOW);
+	if (error == 0)
+		error = nlookup(&nd);
+	if (error)
+		return(error);
+
+	cache_hold(nd.nl_ncp);
+	cache_lock(mp->mnt_ncp);
+
+	error = mp->mnt_ncp->nc_shadowed ?
+	        EINVAL :
+	        cache_shadow_attach(mp->mnt_ncp, nd.nl_ncp);
+
+	nlookup_done(&nd);
+
+	NULLNCDEBUG(mp->mnt_ncp);
+#ifdef NULLFS_DEBUG
+	if (mp->mnt_ncp->nc_shadowed)
+		NULLNCDEBUG(mp->mnt_ncp->nc_shadowed);
+#endif
+
+	return (error);
+}	
+
+/*
+ * As the mount won't get aborted if VFS_START fails, we have to check in each 
+ * VFS call whether it has succeeded...
+ */ 
+
+static int
 nullfs_root(struct mount *mp, struct vnode **vpp)
 {
-	struct thread *td = curthread;	/* XXX */
-	struct vnode *vp;
-
-	NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", (void *)mp,
-	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
-
-	/*
-	 * Return locked reference to root.
-	 */
-	vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
-	vref(vp);
-
-#ifdef NULLFS_DEBUG
-	if (VOP_ISLOCKED(vp, NULL)) {
-		Debugger("root vnode is locked.\n");
-		vrele(vp);
-		return (EDEADLK);
-	}
-#endif
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-	*vpp = vp;
-	return 0;
+	int error;
+
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	error = cache_vget(mp->mnt_ncp, crhold(proc0.p_ucred),
+	                   LK_EXCLUSIVE | LK_RETRY, vpp);
+	crfree(proc0.p_ucred);
+
+	return (error);
+}
+
+static __inline
+struct mount *
+nullfs_lowermount_0(struct mount *mp)
+{
+	return (mp->mnt_ncp->nc_shadowed->nc_mount);
 }
 
 static int
 nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, caddr_t arg,
 		struct thread *td)
 {
-	return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	return VFS_QUOTACTL(nullfs_lowermount_0(mp), cmd, uid, arg, td);
 }
 
 static int
@@ -218,12 +216,15 @@ nullfs_statfs(struct mount *mp, struct s
 	int error;
 	struct statfs mstat;
 
-	NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p)\n", (void *)mp,
-	    (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	NULLFSDEBUG("mp %p, ncp %p, lower mp %p\n",
+	            mp, mp->mnt_ncp, nullfs_lowermount_0(mp));
 
 	bzero(&mstat, sizeof(mstat));
 
-	error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td);
+	error = VFS_STATFS(nullfs_lowermount_0(mp), &mstat, td);
 	if (error)
 		return (error);
 
@@ -248,23 +249,27 @@ nullfs_checkexp(struct mount *mp, struct
 nullfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
 		struct ucred **credanonp)
 {
-
-	return VFS_CHECKEXP(MOUNTTONULLMOUNT(mp)->nullm_vfs, nam, 
-		extflagsp, credanonp);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	return VFS_CHECKEXP(nullfs_lowermount_0(mp), nam, extflagsp, credanonp);
 }
 
 static int                        
 nullfs_extattrctl(struct mount *mp, int cmd, const char *attrname, caddr_t arg,
 		  struct thread *td)
 {
-	return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, attrname,
-	    arg, td);
+	if (! mp->mnt_ncp || ! mp->mnt_ncp->nc_shadowed)
+		return (ENXIO);
+
+	return VFS_EXTATTRCTL(nullfs_lowermount_0(mp), cmd, attrname, arg, td);
 }
 
 
 static struct vfsops null_vfsops = {
 	.vfs_mount =   	 	nullfs_mount,
 	.vfs_unmount =   	nullfs_unmount,
+	.vfs_start =     	nullfs_start,
 	.vfs_root =     	nullfs_root,
 	.vfs_quotactl =   	nullfs_quotactl,
 	.vfs_statfs =    	nullfs_statfs,
diff -r 820a1f1d791e -r 6db92777ea99 sys/vfs/nullfs/null_vnops.c
--- a/sys/vfs/nullfs/null_vnops.c	Mon Jan 23 02:56:43 2006 +0000
+++ b/sys/vfs/nullfs/null_vnops.c	Mon Jan 23 05:16:56 2006 +0100
@@ -109,6 +109,8 @@
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
+#include <sys/namecache.h>
+#include <sys/nlookup.h>
 #include "null.h"
 
 static int	null_nresolve(struct vop_nresolve_args *ap);
@@ -122,18 +124,93 @@ static int	null_nrmdir(struct vop_nrmdir
 static int	null_nrmdir(struct vop_nrmdir_args *ap);
 static int	null_nrename(struct vop_nrename_args *ap);
 
+static __inline
+struct mount *
+nullfs_lowermount_l(struct namecache *ncp)
+{
+	/*
+	 * The code in use below allows passing through lower mounts.
+	 * If we didn't want to do that, we could use
+	 *
+	 *   ncp->nc_mount->mnt_ncp->nc_shadowed->nc_mount
+	 *
+	 * Eventually, the choice might be configurable.
+	 *
+	 *                  -  -  -
+	 *
+	 * Matt says in
+	 * http://leaf.dragonflybsd.org/mailarchive/kernel/2006-01/msg00023.html
+	 * :
+	 
+    The ncp->nc_mount field was never meant to be used by the VFS code...
+    only to be used internally by cache_*().  It looks like I broke my own
+    rule... I have two references in NFS, but that's for later.
+
+	 * Note that both approaches still use nc_mount:
+	 *
+	 * - If we wanna pass through lower mounts, we do have to find
+	 *   the lower fs ncp-wise; we simply don't have a choice.
+	 *
+	 * - If we just work with a fixed lower fs, we are able to access
+	 *   that if we are willing to use nc_mount. Hence it just seems to be
+	 *   stupid to keep around a direct reference to the lower fs, but
+	 *   that's of course feasible.
+	 */
+	return (ncp->nc_shadowed->nc_mount);
+}
+
 static int
 null_nresolve(struct vop_nresolve_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
-	return vop_nresolve_ap(ap);
+	struct namecache *ncp = ap->a_ncp;
+	struct mount *lmp;
+
+	if (! ncp->nc_shadowed) {
+		struct nlcomponent nlc;
+		struct namecache *sncp;
+
+		nlc.nlc_nameptr = ncp->nc_name;
+		nlc.nlc_namelen = ncp->nc_nlen;
+
+		KKASSERT(ncp->nc_parent->nc_shadowed);
+		sncp = cache_nlookup(ncp->nc_parent->nc_shadowed, &nlc);
+
+		if ((ncp->nc_error = cache_shadow_attach(ncp, sncp))) {
+			cache_put(sncp);
+			cache_setvp(ncp, NULL);
+			return (ncp->nc_error);
+		}
+
+		NULLNCDEBUG(ncp);
+		NULLNCDEBUG(ncp->nc_shadowed);
+	}
+
+	/*
+	 * XXX Querying/ensuring the usability of the lower fs is still not
+	 * got right. As a quick hack, we do a simple test here, which will
+	 * do to avoid the most obvious pitfalls.
+	 */
+	if ((lmp = nullfs_lowermount_l(ncp)) &&
+	    (ap->a_head.a_ops = lmp->mnt_vn_use_ops)) {
+		/*
+		 * Moving down the shadow chain avoids a recursive loop
+		 * (which would end up exhausting the kernel stack).
+		 *
+		 * Otherwise it's the same whether we use ncp or
+		 * ncp->nc_shadowed -- we go for group-shared ncp attributes.
+	 	 */
+		ap->a_ncp = ncp->nc_shadowed;
+		return vop_nresolve_ap(ap);
+	}
+
+	return (ENXIO);
 }
 
 static int
 null_ncreate(struct vop_ncreate_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_ncreate_ap(ap);
 }
@@ -141,7 +218,8 @@ static int
 static int
 null_nmkdir(struct vop_nmkdir_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_nmkdir_ap(ap);
 }
@@ -149,7 +227,8 @@ static int
 static int
 null_nmknod(struct vop_nmknod_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_nmknod_ap(ap);
 }
@@ -157,7 +236,8 @@ static int
 static int
 null_nlink(struct vop_nlink_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_nlink_ap(ap);
 }
@@ -165,7 +245,8 @@ static int
 static int
 null_nsymlink(struct vop_nsymlink_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_nsymlink_ap(ap);
 }
@@ -173,7 +254,8 @@ static int
 static int
 null_nwhiteout(struct vop_nwhiteout_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_nwhiteout_ap(ap);
 }
@@ -181,7 +263,8 @@ static int
 static int
 null_nremove(struct vop_nremove_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_nremove_ap(ap);
 }
@@ -189,7 +272,8 @@ static int
 static int
 null_nrmdir(struct vop_nrmdir_args *ap)
 {
-	ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
+	ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops;
+	ap->a_ncp = ap->a_ncp->nc_shadowed;
 
 	return vop_nrmdir_ap(ap);
 }
@@ -197,15 +281,32 @@ static int
 static int
 null_nrename(struct vop_nrename_args *ap)
 {
+	struct namecache *fncp = ap->a_fncp;
+	struct namecache *tncp = ap->a_tncp;
 	struct mount *lmp;
-
-	lmp = MOUNTTONULLMOUNT(ap->a_fncp->nc_mount)->nullm_vfs;
-	if (lmp != MOUNTTONULLMOUNT(ap->a_tncp->nc_mount)->nullm_vfs)
-		return (EINVAL);
-
-	ap->a_head.a_ops = lmp->mnt_vn_norm_ops;
-
-	return vop_nrename_ap(ap);
+	int error;
+
+	lmp = nullfs_lowermount_l(fncp);
+	if (lmp != nullfs_lowermount_l(tncp))
+		return (EXDEV);
+
+	ap->a_head.a_ops = lmp->mnt_vn_use_ops;
+
+	/*
+	 * Parent-child relations are not subject to shadowing, therefore
+	 * we can't just dumbassly delegate; we have some work in both layers.
+	 * However, shadowing still saves us from locking or dealing with
+	 * vnodes.
+	 */
+	ap->a_fncp = fncp->nc_shadowed;
+	ap->a_tncp = tncp->nc_shadowed;
+
+	if ((error = vop_nrename_ap(ap)))
+		return (error);
+
+	cache_rename(fncp, tncp);
+
+	return (error);
 }
 
 /*
@@ -224,4 +325,3 @@ struct vnodeopv_entry_desc null_vnodeop_
 	{ &vop_nrename_desc,		(vnodeopv_entry_t) null_nrename },
 	{ NULL, NULL }
 };
-
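
P.S.: for reference, this is roughly how a layered filesystem is expected to
use the new shadow API, modeled on null_nresolve() above. Again a sketch, not
part of the patch: shadow_join_sketch() is a made-up name, error handling is
abbreviated, and the cache_* calls are the ones the patch adds or relies on:

    #include <sys/namecache.h>
    #include <sys/nlookup.h>

    static int
    shadow_join_sketch(struct namecache *ncp)
    {
    	struct nlcomponent nlc;
    	struct namecache *sncp;
    	int error;

    	nlc.nlc_nameptr = ncp->nc_name;
    	nlc.nlc_namelen = ncp->nc_nlen;

    	/*
    	 * Look up (or create) the lower-layer entry under our parent's
    	 * shadowed counterpart; it comes back referenced and locked.
    	 */
    	sncp = cache_nlookup(ncp->nc_parent->nc_shadowed, &nlc);

    	/* ncp must be locked and unresolved; fails with ELOOP on a cycle. */
    	if ((error = cache_shadow_attach(ncp, sncp)) != 0) {
    		cache_put(sncp);
    		return (error);
    	}

    	/* From here on the group head carries the shared state. */
    	return (0);
    }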




