namecache stage-2 patch for testing

Matthew Dillon dillon at apollo.backplane.com
Wed Sep 24 19:03:34 PDT 2003


    This is the stage-2 namecache patch.  It should work pretty much the 
    same as the namecache works now except I have reorgranized the namecache
    structure's topology the way it will need to be to move forward.

    Originally vnodes contained two lists: a list of namecache entries
    for which the vnode is the 'parent directory', and a list of namecache
    entries for which the vnode is the target (child in a directory).  The
    namecache structure would contain a reference to the two vnodes but
    was otherwise disconnected from other namecache structures.  The problem
    with the old methodology is that it is impossible to reverse-engineer
    multiple distinct paths flowing through the same vnode.

    The new topology is based around the namecache structure itself.  The
    structure contains a parent pointer, a list of children, and a reference
    to a single vnode.  A vnode then has a list of all the namecache
    structures flowing through it.

    --

    With the new topology we will be able to keep paths distinct by recording
    a pointer to the namecache entry instead of the vnode as the 'directory'
    being searched, and as the 'current directory', 'root directory' and
    'jail directory'.  This is what stage-3 is going to do.  Amoung other
    things it will allow you to 'mount' the same filesystem or a subdirectory
    at multiple points in the directory tree without having to use a null
    filesystem, and without causing the system to get confused vis-a-vie the
    use of "..". 

    There are many other uses once stage-3 is in... for example, you could
    literally cut-and-paste pieces of a filesystem together to form a new
    topology that shares the related vnodes but otherwise appears as a
    distinct directory tree in the system.  It will be possible to
    implement the concept of VFS 'environments' that I have been positing
    for the last few months simply by writing a VFS that does nothing more
    then resolve namecache issues, allowing actual operation within the
    'environment' to occur without any significant interaction with the
    environment once things are cached in the namecache.  *VERY* important.

    I would like to commit this stage-2 patch tomorrow evening so I can move
    on to stage-3.

						-Matt

Index: kern/init_main.c
===================================================================
RCS file: /cvs/src/sys/kern/init_main.c,v
retrieving revision 1.24
diff -u -r1.24 init_main.c
--- kern/init_main.c	24 Sep 2003 18:38:51 -0000	1.24
+++ kern/init_main.c	24 Sep 2003 18:38:59 -0000
@@ -459,6 +459,7 @@
 	VREF(p->p_fd->fd_cdir);
 	p->p_fd->fd_rdir = rootvnode;
 	VREF(p->p_fd->fd_rdir);
+	vfs_cache_setroot(rootvnode);
 	VOP_UNLOCK(rootvnode, 0, curthread);
 
 	/*
Index: kern/vfs_cache.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_cache.c,v
retrieving revision 1.8
diff -u -r1.8 vfs_cache.c
--- kern/vfs_cache.c	23 Sep 2003 05:03:51 -0000	1.8
+++ kern/vfs_cache.c	25 Sep 2003 01:45:26 -0000
@@ -1,6 +1,8 @@
 /*
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2003 Matthew Dillon <dillon at xxxxxxxxxxxxx>,
+ *	All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
@@ -35,7 +37,7 @@
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  * $FreeBSD: src/sys/kern/vfs_cache.c,v 1.42.2.6 2001/10/05 20:07:03 dillon Exp $
- * $DragonFly: src/sys/kern/vfs_cache.c,v 1.8 2003/09/23 05:03:51 dillon Exp $
+ * $DragonFly$
  */
 
 #include <sys/param.h>
@@ -52,16 +54,10 @@
 #include <sys/fnv_hash.h>
 
 /*
- * The namecache structure describes the elements in the cache of recent
- * names looked up by namei.  The cache is nominally LRU, but namecache
- * path element sequences to active vnodes and in-progress lookups are
- * always retained.  Non-inclusive of retained entries, the cache is
- * mananged in an LRU fashion.
- *
- * random lookups in the cache are accomplished with a hash atble using
+ * Random lookups in the cache are accomplished with a hash table using
  * a hash key of (nc_src_vp, name).
  *
- * Negative entries may exist and correspond to structures where nc_dst_vp
+ * Negative entries may exist and correspond to structures where nc_vp
  * is NULL.  In a negative entry, NCF_WHITEOUT will be set if the entry
  * corresponds to a whited-out directory entry (verses simply not finding the
  * entry at all).
@@ -70,16 +66,6 @@
  * or NOCACHE is set (rewrite), and the name is located in the cache, it
  * will be dropped.
  */
-struct	namecache {
-	LIST_ENTRY(namecache) nc_hash;	/* hash chain (nc_src_vp,name) */
-	LIST_ENTRY(namecache) nc_src;	/* scan via nc_src_vp based list */
-	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
-	struct	vnode *nc_src_vp;	/* directory containing name */
-	struct	vnode *nc_dst_vp;	/* vnode representing name or NULL */
-	u_char	nc_flag;
-	u_char	nc_nlen;		/* The length of the name, 255 max */
-	char	nc_name[0];		/* The segment name (embedded) */
-};
 
 /*
  * Structures associated with name cacheing.
@@ -87,7 +73,11 @@
 #define NCHHASH(hash)	(&nchashtbl[(hash) & nchash])
 
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
-static TAILQ_HEAD(, namecache) ncneg;			/* Hash Table */
+static struct namecache_list	ncneglist;		/* instead of vnode */
+static struct namecache		rootnamecache;		/* Dummy node */
+
+static int	nczapcheck;		/* panic on bad release */
+SYSCTL_INT(_debug, OID_AUTO, nczapcheck, CTLFLAG_RW, &nczapcheck, 0, "");
 
 static u_long	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
@@ -126,51 +116,140 @@
 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
 
 
-static void cache_zap (struct namecache *ncp);
+static void cache_zap(struct namecache *ncp);
 
 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 /*
- * Flags in namecache.nc_flag
- */
-#define NCF_WHITE	0x01	/* negative entry corresponds to whiteout */
+ * cache_hold() and cache_drop() prevent the premature deletion of a
+ * namecache entry but do not prevent operations (such as zapping) on
+ * that namecache entry.
+ */
+static __inline
+struct namecache *
+cache_hold(struct namecache *ncp)
+{
+	++ncp->nc_refs;
+	return(ncp);
+}
+
+static __inline
+void
+cache_drop(struct namecache *ncp)
+{
+	KKASSERT(ncp->nc_refs > 0);
+	if (ncp->nc_refs == 1 && (ncp->nc_flag & NCF_HASHED) == 0)
+		cache_zap(ncp);
+	else
+		--ncp->nc_refs;
+}
 
 /*
- * Delete an entry from its hash list and move it to the front
- * of the LRU list for immediate reuse.
+ * Try to destroy a namecache entry.  The entry is disassociated from its
+ * vnode or ncneglist and reverted to an UNRESOLVED state.
+ *
+ * Then, if there are no additional references to the ncp and we can
+ * successfully delete the children, the entry is also removed from the
+ * namecache hashlist / topology.
+ *
+ * References or undeletable children will prevent the entry from being
+ * removed from the topology.  The entry may be revalidated (typically
+ * by cache_enter()) at a later time.  Children remain because:
+ *
+ *	+ we have tried to delete a node rather then a leaf in the topology.
+ *	+ the presence of negative entries (we try to scrap these).
+ *	+ an entry or child has a non-zero ref count and cannot be scrapped.
+ *
+ * This function must be called with the ncp held and will drop the ref
+ * count during zapping.
  */
 static void
 cache_zap(struct namecache *ncp)
 {
-	LIST_REMOVE(ncp, nc_hash);
-	LIST_REMOVE(ncp, nc_src);
-	if (LIST_EMPTY(&ncp->nc_src_vp->v_cache_src)) 
-		vdrop(ncp->nc_src_vp);
-	if (ncp->nc_dst_vp) {
-		TAILQ_REMOVE(&ncp->nc_dst_vp->v_cache_dst, ncp, nc_dst);
-	} else {
-		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
-		numneg--;
+	struct namecache *par;
+	struct vnode *vp;
+
+	/*
+	 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
+	 */
+	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
+		ncp->nc_flag |= NCF_UNRESOLVED;
+		--numcache;
+		if ((vp = ncp->nc_vp) != NULL) {
+			ncp->nc_vp = NULL;	/* safety */
+			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
+			if (vp && !TAILQ_EMPTY(&ncp->nc_list))
+				vdrop(vp);
+		} else {
+			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
+			--numneg;
+		}
+	}
+
+	/*
+	 * Try to scrap the entry and possibly tail-recurse on its parent.
+	 * We only scrap unref'd (other then our ref) unresolved entries,
+	 * we do not scrap 'live' entries.
+	 */
+	while (ncp->nc_flag & NCF_UNRESOLVED) {
+		/*
+		 * Someone other then us has a ref, stop.
+		 */
+		if (ncp->nc_refs > 1)
+			goto done;
+
+		/*
+		 * We have children, stop.
+		 */
+		if (!TAILQ_EMPTY(&ncp->nc_list))
+			goto done;
+
+		/*
+		 * Ok, we can completely destroy and free this entry.  Sanity
+		 * check it against our static rootnamecache structure,
+		 * then remove it from the hash.
+		 */
+		KKASSERT(ncp != &rootnamecache);
+
+		if (ncp->nc_flag & NCF_HASHED) {
+			ncp->nc_flag &= ~NCF_HASHED;
+			LIST_REMOVE(ncp, nc_hash);
+		}
+
+		/*
+		 * Unlink from its parent and free, then loop on the
+		 * parent.
+		 */
+		par = cache_hold(ncp->nc_parent);
+		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
+		if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
+			vdrop(par->nc_vp);
+		ncp->nc_refs = -1;	/* safety */
+		ncp->nc_parent = NULL;	/* safety */
+		free(ncp, M_VFSCACHE);
+		ncp = par;
 	}
-	numcache--;
-	free(ncp, M_VFSCACHE);
+done:
+	--ncp->nc_refs;
 }
 
 /*
  * Lookup an entry in the cache
  *
- * We don't do this if the segment name is long, simply so the cache
- * can avoid holding long names (which would either waste space, or
- * add greatly to the complexity).
- *
  * Lookup is called with dvp pointing to the directory to search,
- * cnp pointing to the name of the entry being sought. If the lookup
- * succeeds, the vnode is returned in *vpp, and a status of -1 is
- * returned. If the lookup determines that the name does not exist
- * (negative cacheing), a status of ENOENT is returned. If the lookup
- * fails, a status of zero is returned.
+ * cnp pointing to the name of the entry being sought. 
+ *
+ * If the lookup succeeds, the vnode is returned in *vpp, and a
+ * status of -1 is returned.
+ *
+ * If the lookup determines that the name does not exist (negative cacheing),
+ * a status of ENOENT is returned. 
+ *
+ * If the lookup fails, a status of zero is returned.
+ *
+ * Note that UNRESOLVED entries are ignored.  They are not negative cache
+ * entries.
  */
-
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 {
@@ -183,10 +262,12 @@
 		if (cnp->cn_namelen == 1) {
 			*vpp = dvp;
 			dothits++;
+			numposhits++;	/* include in total statistics */
 			return (-1);
 		}
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			dotdothits++;
+			numposhits++;	/* include in total statistics */
 			if (dvp->v_dd->v_id != dvp->v_ddid ||
 			    (cnp->cn_flags & CNP_MAKEENTRY) == 0) {
 				dvp->v_ddid = 0;
@@ -201,13 +282,18 @@
 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		numchecks++;
-		if (ncp->nc_src_vp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
-		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
+		    ncp->nc_parent->nc_vp == dvp && 
+		    ncp->nc_nlen == cnp->cn_namelen &&
+		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
+		) {
+			cache_hold(ncp);
 			break;
+		}
 	}
 
 	/* We failed to find an entry */
-	if (ncp == 0) {
+	if (ncp == NULL) {
 		if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
 			nummisszap++;
 		} else {
@@ -226,10 +312,11 @@
 	}
 
 	/* We found a "positive" match, return the vnode */
-        if (ncp->nc_dst_vp) {
+	if (ncp->nc_vp) {
 		numposhits++;
 		nchstats.ncs_goodhits++;
-		*vpp = ncp->nc_dst_vp;
+		*vpp = ncp->nc_vp;
+		cache_drop(ncp);
 		return (-1);
 	}
 
@@ -242,29 +329,111 @@
 	}
 
 	numneghits++;
+
 	/*
 	 * We found a "negative" match, ENOENT notifies client of this match.
-	 * The nc_flag field records whether this is a whiteout.
+	 * The nc_flag field records whether this is a whiteout.  Since there
+	 * is no vnode we can use the vnode tailq link field with ncneglist.
 	 */
-	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
-	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+	TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
+	TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
 	nchstats.ncs_neghits++;
-	if (ncp->nc_flag & NCF_WHITE)
+	if (ncp->nc_flag & NCF_WHITEOUT)
 		cnp->cn_flags |= CNP_ISWHITEOUT;
+	cache_drop(ncp);
 	return (ENOENT);
 }
 
 /*
+ * Generate a special linkage between the mount point and the root of the 
+ * mounted filesystem in order to maintain the namecache topology across
+ * a mount point.  The special linkage has a 0-length name component
+ * and sets NCF_MOUNTPT.
+ */
+void
+cache_mount(struct vnode *dvp, struct vnode *tvp)
+{
+	struct namecache *ncp;
+	struct namecache *par;
+	struct nchashhead *ncpp;
+	u_int32_t hash;
+
+	/*
+	 * If a linkage already exists we do not have to do anything.
+	 */
+	hash = fnv_32_buf("", 0, FNV1_32_INIT);
+	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
+	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+		numchecks++;
+		if (ncp->nc_vp == tvp &&
+		    ncp->nc_parent->nc_vp == dvp && ncp->nc_nlen == 0) {
+			return;
+		}
+	}
+
+	/*
+	 * YYY temporary hack, the 'directory' dvp in the next stage
+	 * will simply be a namecache pointer to the parent.
+	 */
+	par = TAILQ_FIRST(&dvp->v_namecache);
+	if (par == NULL)
+		par = &rootnamecache;
+
+	/*
+	 * Otherwise create a new linkage.
+	 */
+	ncp = malloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK);
+	bzero(ncp, sizeof(*ncp));
+	numcache++;
+
+	TAILQ_INIT(&ncp->nc_list);
+	ncp->nc_flag = NCF_MOUNTPT;
+	ncp->nc_parent = par;
+	if (TAILQ_EMPTY(&par->nc_list)) {
+		if (par->nc_vp)
+			vhold(par->nc_vp);
+	}
+	TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
+
+	/*
+	 * Linkup the target vnode.  The target vnode is NULL if this is
+	 * to be a negative cache entry.
+	 */
+	ncp->nc_vp = tvp;
+	TAILQ_INSERT_HEAD(&tvp->v_namecache, ncp, nc_vnode);
+#if 0
+	if (tvp->v_type == VDIR) {
+		vp->v_dd = dvp;
+		vp->v_ddid = dvp->v_id;
+	}
+#endif
+
+	/*
+	 * Hash table
+	 */
+	hash = fnv_32_buf("", 0, FNV1_32_INIT);
+	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
+	ncpp = NCHHASH(hash);
+	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
+	ncp->nc_flag |= NCF_HASHED;
+}
+
+/*
  * Add an entry to the cache.
  */
 void
 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct namecache *ncp;
+	struct namecache *par;
 	struct nchashhead *ncpp;
 	u_int32_t hash;
-	int len;
 
+	/*
+	 * "." and ".." are degenerate cases, they are not added to the
+	 * cache.
+	 */
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			return;
@@ -280,63 +449,152 @@
 			return;
 		}
 	}
-	 
-	ncp = (struct namecache *)
-		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
-	bzero((char *)ncp, sizeof *ncp);
-	numcache++;
-	if (!vp) {
-		numneg++;
-		ncp->nc_flag = cnp->cn_flags & CNP_ISWHITEOUT ? NCF_WHITE : 0;
-	} else if (vp->v_type == VDIR) {
-		vp->v_dd = dvp;
-		vp->v_ddid = dvp->v_id;
-	}
 
 	/*
-	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
-	 * For negative entries, we have to record whether it is a whiteout.
-	 * the whiteout flag is stored in the nc_flag field.
-	 */
-	ncp->nc_dst_vp = vp;
-	ncp->nc_src_vp = dvp;
-	len = ncp->nc_nlen = cnp->cn_namelen;
-	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
-	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
+	 * Attempt to locate an unresolved entry in the cache and 
+	 * reassociate it, otherwise allocate a new entry.  Unresolved
+	 * entries can exist due to normal vnode reuse and/or cache
+	 * purges issued by filesystems for various reasons.
+	 */
+
+	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
-	ncpp = NCHHASH(hash);
-	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
-	if (LIST_EMPTY(&dvp->v_cache_src))
-		vhold(dvp);
-	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
-	if (vp) {
-		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
+	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+		numchecks++;
+		if ((ncp->nc_flag & NCF_UNRESOLVED) &&
+		    ncp->nc_parent->nc_vp == dvp && 
+		    ncp->nc_nlen == cnp->cn_namelen &&
+		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
+		) {
+			printf("Reresolve %*.*s\n", (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
+			break;
+		}
+	}
+
+	if (ncp == NULL) {
+		ncp = malloc(sizeof(*ncp) + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
+		bzero(ncp, sizeof(*ncp));
+		TAILQ_INIT(&ncp->nc_list);
+
+		/*
+		 * Linkup the parent pointer, bump the parent vnode's hold
+		 * count when we go from 0->1 children.  The calculation
+		 * of 'par' is YYY a hack temporarily.  par will eventually
+		 * be passed as an argument.
+		 */
+		par = TAILQ_FIRST(&dvp->v_namecache);
+		if (par == NULL) {
+			par = &rootnamecache;
+			if (dvp != rootvnode)
+				printf("cache_enter: parent has no namecache entry entering '%*.*s'\n", (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
+		}
+		ncp->nc_parent = par;
+		if (TAILQ_EMPTY(&par->nc_list)) {
+			if (par->nc_vp)
+				vhold(par->nc_vp);
+		}
+		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
+
+		/*
+		 * Add to the hash table
+		 */
+		ncp->nc_nlen = cnp->cn_namelen;
+		bcopy(cnp->cn_nameptr, ncp->nc_name, cnp->cn_namelen);
+		ncpp = NCHHASH(hash);
+		LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
+		ncp->nc_flag |= NCF_HASHED;
+	}
+
+	/*
+	 * Linkup the target vnode.  The target vnode is NULL if this is
+	 * to be a negative cache entry.
+	 */
+	numcache++;
+	ncp->nc_vp = vp;
+	ncp->nc_flag &= ~NCF_UNRESOLVED;
+	if (vp == NULL) {
+		++numneg;
+		TAILQ_INSERT_HEAD(&ncneglist, ncp, nc_vnode);
+		ncp->nc_flag &= ~NCF_WHITEOUT;
+		if (cnp->cn_flags & CNP_ISWHITEOUT)
+			ncp->nc_flag |= NCF_WHITEOUT;
 	} else {
-		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+		if (!TAILQ_EMPTY(&ncp->nc_list))
+			vhold(vp);
+		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
+		if (vp->v_type == VDIR) {
+			vp->v_dd = dvp;
+			vp->v_ddid = dvp->v_id;
+		}
 	}
+
+	/*
+	 * Don't cache too many negative hits
+	 */
 	if (numneg * ncnegfactor > numcache) {
-		ncp = TAILQ_FIRST(&ncneg);
+		ncp = TAILQ_FIRST(&ncneglist);
+		KKASSERT(ncp != NULL);
 		cache_zap(ncp);
 	}
 }
 
 /*
  * Name cache initialization, from vfs_init() when we are booting
+ *
+ * rootnamecache is initialized such that it cannot be recursively deleted.
  */
 void
 nchinit(void)
 {
-	TAILQ_INIT(&ncneg);
+	TAILQ_INIT(&ncneglist);
 	nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
+	TAILQ_INIT(&rootnamecache.nc_list);
+	rootnamecache.nc_flag |= NCF_HASHED | NCF_ROOT | NCF_UNRESOLVED;
+	rootnamecache.nc_refs = 1;
 }
 
 /*
- * Invalidate all entries to a particular vnode.
+ * vfs_cache_setroot()
  *
- * Remove all entries in the namecache relating to this vnode and
- * change the v_id.  We take the v_id from a global counter, since
- * it becomes a handy sequence number in crash-dumps that way.
- * No valid vnode will ever have (v_id == 0).
+ *	Create an association between the root of our namecache and
+ *	the root vnode.  This routine may be called several times during
+ *	booting.
+ */
+void
+vfs_cache_setroot(struct vnode *nvp)
+{
+	KKASSERT(rootnamecache.nc_refs > 0);	/* don't accidently free */
+	cache_zap(cache_hold(&rootnamecache));
+
+	rootnamecache.nc_vp = nvp;
+	rootnamecache.nc_flag &= ~NCF_UNRESOLVED;
+	++numcache;
+	if (nvp) {
+		if (!TAILQ_EMPTY(&rootnamecache.nc_list))
+			vhold(nvp);
+		TAILQ_INSERT_HEAD(&nvp->v_namecache, &rootnamecache, nc_vnode);
+	} else {
+		++numneg;
+		TAILQ_INSERT_HEAD(&ncneglist, &rootnamecache, nc_vnode);
+		rootnamecache.nc_flag &= ~NCF_WHITEOUT;
+	}
+}
+
+/*
+ * Invalidate all namecache entries to a particular vnode as well as 
+ * any direct children of that vnode in the namecache.  This is a 
+ * 'catch all' purge used by filesystems that do not know any better.
+ *
+ * A new vnode v_id is generated.  Note that no vnode will ever have a
+ * v_id of 0.
+ *
+ * Note that the linkage between the vnode and its namecache entries will
+ * be removed, but the namecache entries themselves might stay put due to
+ * active references from elsewhere in the system or due to the existance of
+ * the children.   The namecache topology is left intact even if we do not
+ * know what the vnode association is.  Such entries will be marked
+ * NCF_UNRESOLVED.
  *
  * XXX: Only time and the size of v_id prevents this from failing:
  * XXX: In theory we should hunt down all (struct vnode*, v_id)
@@ -345,20 +603,36 @@
  * XXX: by incrementing each vnodes v_id individually instead of
  * XXX: using the global v_id.
  */
-
 void
 cache_purge(struct vnode *vp)
 {
 	static u_long nextid;
+	struct namecache *ncp;
+	struct namecache *scan;
 
-	while (!LIST_EMPTY(&vp->v_cache_src)) 
-		cache_zap(LIST_FIRST(&vp->v_cache_src));
-	while (!TAILQ_EMPTY(&vp->v_cache_dst)) 
-		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
+	/*
+	 * Disassociate the vnode from its namecache entries along with
+	 * (for historical reasons) any direct children.
+	 */
+	while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
+		cache_hold(ncp);
 
+restart: /* YYY hack, fix me */
+		TAILQ_FOREACH(scan, &ncp->nc_list, nc_entry) {
+			if ((scan->nc_flag & NCF_UNRESOLVED) == 0) {
+				cache_zap(cache_hold(scan));
+				goto restart;
+			}
+		}
+		cache_zap(ncp);
+	}
+
+	/*
+	 * Calculate a new unique id for ".." handling
+	 */
 	do {
 		nextid++;
-	} while (nextid == vp->v_id || !nextid);
+	} while (nextid == vp->v_id || nextid == 0);
 	vp->v_id = nextid;
 	vp->v_dd = vp;
 	vp->v_ddid = 0;
@@ -376,13 +650,22 @@
 	struct nchashhead *ncpp;
 	struct namecache *ncp, *nnp;
 
-	/* Scan hash tables for applicable entries */
+	/*
+	 * Scan hash tables for applicable entries.
+	 */
 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
-		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
+		ncp = LIST_FIRST(ncpp);
+		if (ncp)
+			cache_hold(ncp);
+		while (ncp) {
 			nnp = LIST_NEXT(ncp, nc_hash);
-			if (ncp->nc_src_vp->v_mount == mp) {
+			if (nnp)
+				cache_hold(nnp);
+			if (ncp->nc_vp && ncp->nc_vp->v_mount == mp)
 				cache_zap(ncp);
-			}
+			else
+				cache_drop(ncp);
+			ncp = nnp;
 		}
 	}
 }
@@ -390,24 +673,22 @@
 /*
  * cache_leaf_test()
  *
- *	Test whether this (directory) vnode's namei cache entry contains
- *	subdirectories or not.  Used to determine whether the directory is
- *	a leaf in the namei cache or not.  Note: the directory may still
- *	contain files in the namei cache.
+ *	Test whether the vnode is at a leaf in the nameicache tree.
  *
- *	Returns 0 if the directory is a leaf, -1 if it isn't.
+ *	Returns 0 if it is a leaf, -1 if it isn't.
  */
 int
 cache_leaf_test(struct vnode *vp)
 {
-	struct namecache *ncpc;
+	struct namecache *scan;
+	struct namecache *ncp;
 
-	for (ncpc = LIST_FIRST(&vp->v_cache_src);
-	     ncpc != NULL;
-	     ncpc = LIST_NEXT(ncpc, nc_src)
-	) {
-		if (ncpc->nc_dst_vp != NULL && ncpc->nc_dst_vp->v_type == VDIR)
-			return(-1);
+	TAILQ_FOREACH(scan, &vp->v_namecache, nc_vnode) {
+		TAILQ_FOREACH(ncp, &scan->nc_list, nc_entry) {
+			/* YYY && ncp->nc_vp->v_type == VDIR ? */
+			if (ncp->nc_vp != NULL)
+				return(-1);
+		}
 	}
 	return(0);
 }
@@ -443,8 +724,9 @@
                 return (ENOTDIR);
 
 	if ((flags & CNP_ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
-	    (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME))
+	    (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)) {
 		return (EROFS);
+	}
 
 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
 
@@ -548,17 +830,15 @@
 			free(buf, M_TEMP);
 			return (ENOTDIR);
 		}
-		ncp = TAILQ_FIRST(&vp->v_cache_dst);
-		if (!ncp) {
+		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
+			if (ncp->nc_parent->nc_vp == vp->v_dd)
+				break;
+		}
+		if (ncp == NULL) {
 			numcwdfail2++;
 			free(buf, M_TEMP);
 			return (ENOENT);
 		}
-		if (ncp->nc_src_vp != vp->v_dd) {
-			numcwdfail3++;
-			free(buf, M_TEMP);
-			return (EBADF);
-		}
 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
 			if (bp == buf) {
 				numcwdfail4++;
@@ -644,17 +924,17 @@
 			free(buf, M_TEMP);
 			return (ENOTDIR);
 		}
-		ncp = TAILQ_FIRST(&vp->v_cache_dst);
-		if (!ncp) {
+		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
+			if (vp == textvp)
+				break;
+			if (ncp->nc_parent->nc_vp == vp->v_dd)
+				break;
+		}
+		if (ncp == NULL) {
 			numfullpathfail2++;
 			free(buf, M_TEMP);
 			return (ENOENT);
 		}
-		if (vp != textvp && ncp->nc_src_vp != vp->v_dd) {
-			numfullpathfail3++;
-			free(buf, M_TEMP);
-			return (EBADF);
-		}
 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
 			if (bp == buf) {
 				numfullpathfail4++;
@@ -670,7 +950,7 @@
 		}
 		*--bp = '/';
 		slash_prefixed = 1;
-		vp = ncp->nc_src_vp;
+		vp = vp->v_dd;
 	}
 	if (!slash_prefixed) {
 		if (bp == buf) {
Index: kern/vfs_lookup.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_lookup.c,v
retrieving revision 1.6
diff -u -r1.6 vfs_lookup.c
--- kern/vfs_lookup.c	23 Sep 2003 05:03:51 -0000	1.6
+++ kern/vfs_lookup.c	24 Sep 2003 05:07:18 -0000
@@ -510,6 +510,7 @@
 			dpunlocked = 1;
 			goto bad2;
 		}
+		cache_mount(dp, tdp);
 		vrele(dp);
 		ndp->ni_vp = dp = tdp;
 	}
Index: kern/vfs_subr.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.20
diff -u -r1.20 vfs_subr.c
--- kern/vfs_subr.c	23 Sep 2003 05:03:51 -0000	1.20
+++ kern/vfs_subr.c	24 Sep 2003 05:07:18 -0000
@@ -37,7 +37,7 @@
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
- * $DragonFly: src/sys/kern/vfs_subr.c,v 1.20 2003/09/23 05:03:51 dillon Exp $
+ * $DragonFly$
  */
 
 /*
@@ -117,8 +117,6 @@
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
 static int reassignbufmethod = 1;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
-static int nameileafonly = 0;
-SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
 
 #ifdef ENABLE_VFS_IOOPT
 int vfs_ioopt = 0;
@@ -673,33 +671,13 @@
 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 			KKASSERT(vp->v_flag & VFREE);
 
-			if (LIST_FIRST(&vp->v_cache_src)) {
-				/*
-				 * note: nameileafonly sysctl is temporary,
-				 * for debugging only, and will eventually be
-				 * removed.
-				 */
-				if (nameileafonly > 0) {
-					/*
-					 * Do not reuse namei-cached directory
-					 * vnodes that have cached
-					 * subdirectories.
-					 */
-					if (cache_leaf_test(vp) < 0) {
-						lwkt_reltoken(&vp->v_interlock);
-						TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
-						vp = NULL;
-						continue;
-					}
-				} else if (nameileafonly < 0 || 
-					    vmiodirenable == 0) {
-					/*
-					 * Do not reuse namei-cached directory
-					 * vnodes if nameileafonly is -1 or
-					 * if VMIO backing for directories is
-					 * turned off (otherwise we reuse them
-					 * too quickly).
-					 */
+			/*
+			 * If we have children in the namecache we cannot
+			 * reuse the vnode yet because it will break the
+			 * namecache chain (YYY use nc_refs for the check?)
+			 */
+			if (TAILQ_FIRST(&vp->v_namecache)) {
+				if (cache_leaf_test(vp) < 0) {
 					lwkt_reltoken(&vp->v_interlock);
 					TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 					vp = NULL;
@@ -749,8 +727,7 @@
 		lwkt_inittoken(&vp->v_interlock);
 		vp->v_dd = vp;
 		cache_purge(vp);
-		LIST_INIT(&vp->v_cache_src);
-		TAILQ_INIT(&vp->v_cache_dst);
+		TAILQ_INIT(&vp->v_namecache);
 		numvnodes++;
 	}
 
Index: kern/vfs_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.17
diff -u -r1.17 vfs_syscalls.c
--- kern/vfs_syscalls.c	23 Sep 2003 05:03:51 -0000	1.17
+++ kern/vfs_syscalls.c	24 Sep 2003 05:07:18 -0000
@@ -394,6 +394,7 @@
 		vrele(rootvnode);
 		VREF(newdp);
 		rootvnode = newdp;
+		vfs_cache_setroot(rootvnode);
 	}
 	vput(newdp);
 }
@@ -511,7 +512,7 @@
 	}
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
-		coveredvp->v_mountedhere = (struct mount *)0;
+		coveredvp->v_mountedhere = NULL;
 		vrele(coveredvp);
 	}
 	mp->mnt_vfc->vfc_refcount--;
Index: sys/namecache.h
===================================================================
RCS file: sys/namecache.h
diff -N sys/namecache.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/namecache.h	24 Sep 2003 05:13:52 -0000
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2003 Matthew Dillon <dillon at xxxxxxxxxxxxx>,
+ *	All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $DragonFly$
+ */
+
+#ifndef _SYS_NAMECACHE_H_
+#define	_SYS_NAMECACHE_H_
+
+struct vnode;
+
+TAILQ_HEAD(namecache_list, namecache);
+
+/*
+ * The namecache structure is used to manage the filesystem namespace.  Most
+ * vnodes cached by the system will reference one or more associated namecache
+ * structures.
+ */
+struct namecache {
+    LIST_ENTRY(namecache) nc_hash;	/* hash chain (nc_parent,name) */
+    TAILQ_ENTRY(namecache) nc_entry;	/* scan via nc_parent->nc_list */
+    TAILQ_ENTRY(namecache) nc_vnode;	/* scan via vnode->v_namecache */
+    struct namecache_list  nc_list;	/* list of children */
+    struct namecache *nc_parent;	/* namecache entry for parent */
+    struct	vnode *nc_vp;		/* vnode representing name or NULL */
+    int		nc_refs;		/* ref count prevents deletion */
+    u_char	nc_flag;
+    u_char	nc_nlen;		/* The length of the name, 255 max */
+    char	nc_name[0];		/* Separate allocated seg name */
+};
+
+typedef struct namecache *namecache_t;
+
+/*
+ * Flags in namecache.nc_flag (u_char)
+ */
+#define NCF_NEGATIVE	0x01	/* negative entry */
+#define NCF_WHITEOUT	0x02	/* negative entry corresponds to whiteout */
+#define NCF_UNRESOLVED	0x04	/* invalid or unresolved entry */
+#define NCF_MOUNTPT	0x08	/* mount point */
+#define NCF_ROOT	0x10	/* namecache root (static) */
+#define NCF_HASHED	0x20	/* namecache entry in hash table */
+
+#define CINV_SELF	0x0001	/* invalidate a specific (dvp,vp) entry */
+#define CINV_CHILDREN	0x0002	/* invalidate all children of vp */
+
+#ifdef _KERNEL
+
+struct vop_lookup_args;
+struct componentname;
+struct mount;
+
+int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
+			struct componentname *cnp);
+void	cache_mount(struct vnode *dvp, struct vnode *tvp);
+void	cache_enter(struct vnode *dvp, struct vnode *vp,
+			struct componentname *cnp);
+void	vfs_cache_setroot(struct vnode *vp);
+void	cache_purge(struct vnode *vp);
+void	cache_purgevfs (struct mount *mp);
+int	cache_leaf_test (struct vnode *vp);
+int	vfs_cache_lookup(struct vop_lookup_args *ap);
+int	textvp_fullpath (struct proc *p, char **retbuf, char **retfreebuf);
+
+#endif
+
+#endif
Index: sys/vnode.h
===================================================================
RCS file: /cvs/src/sys/sys/vnode.h,v
retrieving revision 1.7
diff -u -r1.7 vnode.h
--- sys/vnode.h	20 Aug 2003 07:31:22 -0000	1.7
+++ sys/vnode.h	24 Sep 2003 05:10:42 -0000
@@ -32,7 +32,7 @@
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD: src/sys/sys/vnode.h,v 1.111.2.19 2002/12/29 18:19:53 dillon Exp $
- * $DragonFly: src/sys/sys/vnode.h,v 1.7 2003/08/20 07:31:22 rob Exp $
+ * $DragonFly$
  */
 
 #ifndef _SYS_VNODE_H_
@@ -43,6 +43,7 @@
 #include <sys/select.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
+#include <sys/namecache.h>
 
 #include <machine/lock.h>
 
@@ -76,7 +77,6 @@
 TAILQ_HEAD(buflists, buf);
 
 typedef	int 	vop_t (void *);
-struct namecache;
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
@@ -120,8 +120,12 @@
 	struct	lock *v_vnlock;			/* used for non-locking fs's */
 	enum	vtagtype v_tag;			/* type of underlying data */
 	void 	*v_data;			/* private data for fs */
-	LIST_HEAD(, namecache) v_cache_src;	/* Cache entries from us */
-	TAILQ_HEAD(, namecache) v_cache_dst;	/* Cache entries to us */
+
+	/*
+	 * YYY note: v_dd, v_ddid will become obsolete once the namecache
+	 * code is finished.
+	 */
+	struct namecache_list v_namecache;	/* associated nc entries */
 	struct	vnode *v_dd;			/* .. vnode */
 	u_long	v_ddid;				/* .. capability identifier */
 	struct	{
@@ -313,8 +317,8 @@
 	  !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)))
 
 #define VMIGHTFREE(vp) \
-        (!((vp)->v_flag & (VFREE|VDOOMED|VXLOCK)) &&   \
-	 LIST_EMPTY(&(vp)->v_cache_src) && !(vp)->v_usecount)
+        (((vp)->v_flag & (VFREE|VDOOMED|VXLOCK)) == 0 &&   \
+	 cache_leaf_test(vp) == 0 && (vp)->v_usecount == 0)
 
 #define VSHOULDBUSY(vp)	\
 	(((vp)->v_flag & VFREE) && \
@@ -538,7 +542,6 @@
 /*
  * Public vnode manipulation functions.
  */
-struct componentname;
 struct file;
 struct mount;
 struct nameidata;
@@ -558,14 +561,6 @@
 void	addalias (struct vnode *vp, dev_t nvp_rdev);
 void	addaliasu (struct vnode *vp, udev_t nvp_rdev);
 int 	bdevvp (dev_t dev, struct vnode **vpp);
-/* cache_* may belong in namei.h. */
-void	cache_enter (struct vnode *dvp, struct vnode *vp,
-	    struct componentname *cnp);
-int	cache_lookup (struct vnode *dvp, struct vnode **vpp,
-	    struct componentname *cnp);
-void	cache_purge (struct vnode *vp);
-void	cache_purgevfs (struct mount *mp);
-int	cache_leaf_test (struct vnode *vp);
 void	cvtstat (struct stat *st, struct ostat *ost);
 void	cvtnstat (struct stat *sb, struct nstat *nsb);
 int 	getnewvnode (enum vtagtype tag,
@@ -573,7 +568,6 @@
 int	lease_check (struct vop_lease_args *ap);
 int	spec_vnoperate (struct vop_generic_args *);
 int	speedup_syncer (void);
-int	textvp_fullpath (struct proc *p, char **retbuf, char **retfreebuf);
 void 	vattr_null (struct vattr *vap);
 int 	vcount (struct vnode *vp);
 void	vdrop (struct vnode *);
@@ -612,7 +606,6 @@
 	    struct ucred *cred, int *aresid, struct thread *td);
 int	vn_stat (struct vnode *vp, struct stat *sb, struct thread *td);
 dev_t	vn_todev (struct vnode *vp);
-int	vfs_cache_lookup (struct vop_lookup_args *ap);
 int	vfs_object_create (struct vnode *vp, struct thread *td);
 void	vfs_timestamp (struct timespec *);
 int 	vn_writechk (struct vnode *vp);





More information about the Kernel mailing list