namecache coherency 3rd turn
Csaba Henk
csaba.henk at creo.hu
Wed Mar 29 01:58:00 PST 2006
Hi!
This time the following changes were made:
- Namecache locks are kept by shadowinfo structures which
can be embedded into the namecache structure itself (singleton
groups) or fetched from a pool (for non-singleton groups).
- Shadow group data structure: I ended up with what Matt suggested
earlier: shadow group entries form a circular list, doubly linked
in order to support O(1) node deletion and augmented with a height
counter to retain tree semantics.
- cache_setunresolved(ncp) now also breaks up the shadow subtree above ncp,
so that broken topologies get dissolved.
- The deadlock avoidance techniques of the previous patch have been kept.
- The "struct namecache *nc_shadowed" field of namecache structures
still exists, but it is no longer referenced anywhere in the cache code.
It is now sort of a private field, much like the "void *" fields in vnodes,
specinfo structures, etc. It could easily be ditched and replaced by
a per-mount hash for those filesystems that would use it. I just kept it
as is -- I didn't want to do anything about it without having a
consensus.
- Nullfs has been adjusted to this API.
Regards,
Csaba
diff -r e1d135a8f666 sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/kern/vfs_cache.c Wed Mar 29 10:23:26 2006 +0200
@@ -108,6 +108,10 @@
#define NCHHASH(hash) (&nchashtbl[(hash) & nchash])
#define MINNEG 1024
+/* Modes for shadow group traversal */
+#define SG_ALL 0 /* traverse whole group */
+#define SG_SUBTREE 1 /* traverse only subtree */
+
MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
@@ -170,6 +174,15 @@ static u_long numneghits; STATNODE(CTLFL
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
struct nchstats nchstats[SMP_MAXCPU];
+
+static STAILQ_HEAD(, shadowinfo) shadowinfo_freeq;
+static u_long numshadowinfo = 0;
+STATNODE(CTLFLAG_RD, numshadowinfo, &numshadowinfo);
+static long maxnumshadowinfo = -1;
+SYSCTL_LONG(_vfs_cache, OID_AUTO, maxnumshadowinfo, CTLFLAG_RW,
+ &maxnumshadowinfo, 0, "");
+MALLOC_DEFINE(M_SHADOWINFO, "shadowinfo", "VFS name cache shadowinfo");
+
/*
* Export VFS cache effectiveness statistics to user-land.
*
@@ -196,6 +209,62 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchsta
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
+/* XXX stubs for later MPSAFE work */
+#define shadowinfo_freeq_lock()
+#define shadowinfo_freeq_unlock()
+
+static struct shadowinfo *
+shadowinfo_fetch(void)
+{
+ struct shadowinfo *shinf = STAILQ_FIRST(&shadowinfo_freeq);
+ /*
+ * Hand out a shadowinfo: reuse one parked on shadowinfo_freeq when
+ * there is any, otherwise allocate a zeroed one and count it in
+ * numshadowinfo. The first look at the queue is done without the
+ * queue lock; an empty queue goes straight to allocation.
+ */
+ if (! shinf)
+ goto alloc;
+ /* Re-read the head under the queue lock (a stub for now) and dequeue it. */
+ shadowinfo_freeq_lock();
+ if ((shinf = STAILQ_FIRST(&shadowinfo_freeq)))
+ STAILQ_REMOVE_HEAD(&shadowinfo_freeq, sh_entry);
+ shadowinfo_freeq_unlock();
+
+ if (shinf)
+ return (shinf);
+
+alloc:
+ shinf = malloc(sizeof(*shinf), M_SHADOWINFO, M_WAITOK|M_ZERO);
+ numshadowinfo++;
+
+ return (shinf);
+}
+
+static __inline
+struct shadowinfo*
+shadowinfo_ref(struct shadowinfo *shinf)
+{
+ shinf->sh_refs++; /* take one more reference on shinf */
+ /* The argument is handed back so the call can sit on the right of an assignment. */
+ return (shinf);
+}
+
+static void
+shadowinfo_put(struct shadowinfo *shinf)
+{
+ if (--shinf->sh_refs > 0) /* drop one reference; done unless it was the last */
+ return;
+ /*
+ * Last reference gone. When a cap is configured (maxnumshadowinfo >= 0)
+ * and more shadowinfos than that are allocated, release the memory
+ * instead of recycling the structure.
+ */
+ if (maxnumshadowinfo >= 0 && numshadowinfo > maxnumshadowinfo) {
+ free(shinf, M_SHADOWINFO);
+ numshadowinfo--;
+ return;
+ }
+ /* Otherwise clear the lock state and park it on the free queue for shadowinfo_fetch(). */
+ shinf->sh_exlocks = 0;
+ shinf->sh_locktd = NULL;
+
+ shadowinfo_freeq_lock();
+ STAILQ_INSERT_TAIL(&shadowinfo_freeq, shinf, sh_entry);
+ shadowinfo_freeq_unlock();
+}
+
static void cache_zap(struct namecache *ncp);
/*
@@ -225,7 +294,7 @@ _cache_drop(struct namecache *ncp)
(ncp->nc_flag & NCF_UNRESOLVED) &&
TAILQ_EMPTY(&ncp->nc_list)
) {
- KKASSERT(ncp->nc_exlocks == 0);
+ KKASSERT(ncp->nc_shadowinfo->sh_exlocks == 0);
cache_lock(ncp);
cache_zap(ncp);
} else {
@@ -295,6 +364,10 @@ cache_alloc(int nlen)
ncp->nc_error = ENOTCONN; /* needs to be resolved */
ncp->nc_refs = 1;
ncp->nc_fsmid = 1;
+ ncp->nc_shadowinfo = &ncp->nc_shadowinfo_internal;
+ ncp->nc_shadowinfo_internal.sh_refs = 2;
+ ncp->nc_shadow_prev = NULL;
+ ncp->nc_shadow_next = NULL;
TAILQ_INIT(&ncp->nc_list);
cache_lock(ncp);
return(ncp);
@@ -303,7 +376,7 @@ static void
static void
cache_free(struct namecache *ncp)
{
- KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1);
+ KKASSERT(ncp->nc_refs == 1 && ncp->nc_shadowinfo->sh_exlocks == 1);
if (ncp->nc_name)
free(ncp->nc_name, M_VFSCACHE);
free(ncp, M_VFSCACHE);
@@ -322,6 +395,188 @@ cache_drop(struct namecache *ncp)
cache_drop(struct namecache *ncp)
{
_cache_drop(ncp);
+}
+
+/*
+ * Iterate an "updater" function over a shadow group, starting at ncp and
+ * following the nc_shadow_next links.
+ *
+ * With SG_ALL the walk ends when the links run out (NULL) or lead back to
+ * ncp. With SG_SUBTREE it also ends in front of the first entry whose
+ * nc_shadowheight is not greater than that of ncp.
+ *
+ * The successor of an entry is read before the updater is called on it.
+ * A non-zero return value from the updater ends the walk at once.
+ *
+ * Returns the last entry the updater was called on.
+ */
+static struct namecache *
+cache_group_walk(struct namecache *ncp,
+ int (*updater)(struct namecache *xncp, void *param),
+ int flags, void *param)
+{
+ struct namecache *xncp = ncp, *yncp;
+
+ for (;;) {
+ yncp = xncp->nc_shadow_next;
+ if (updater(xncp, param))
+ break;
+ if (! yncp || yncp == ncp ||
+ (flags & SG_SUBTREE &&
+ yncp->nc_shadowheight <= ncp->nc_shadowheight))
+ break;
+ xncp = yncp;
+ }
+
+ return(xncp);
+}
+
+struct migrate_param { /* parameter block for migrate_updater() */
+ int heightdelta; /* added to nc_shadowheight of each visited entry */
+ int exlocks; /* if >= 0, stored into the target's sh_exlocks */
+ struct shadowinfo *shadowinfo; /* target; NULL selects the entry's embedded shadowinfo */
+};
+
+static int
+migrate_updater(struct namecache *ncp, void *param)
+{
+ struct migrate_param *mpm = param;
+ struct shadowinfo *shinf = mpm->shadowinfo;
+ struct shadowinfo *oldshinf = ncp->nc_shadowinfo;
+ /*
+ * Updater that switches ncp over to the shadowinfo named in the
+ * migrate_param block and adjusts its height (and, optionally, the
+ * lock count of the target). No target given means falling back to
+ * the shadowinfo embedded in ncp itself.
+ */
+ if (! shinf)
+ shinf = &ncp->nc_shadowinfo_internal;
+ /* Already on the target: only the height / lock count adjustments remain. */
+ if (shinf == oldshinf)
+ goto out;
+ /* Carry the lock owner over, reference the new shadowinfo, release the old one. */
+ shinf->sh_locktd = oldshinf->sh_locktd;
+
+ ncp->nc_shadowinfo = shadowinfo_ref(shinf);
+ shadowinfo_put(oldshinf);
+
+out:
+ ncp->nc_shadowheight += mpm->heightdelta;
+ if (mpm->exlocks >= 0)
+ shinf->sh_exlocks = mpm->exlocks;
+ /* Always 0, so a cache_group_walk() driven by this updater is never cut short. */
+ return (0);
+}
+
+static __inline
+void
+cache_shadow_link(struct namecache *sncp, struct namecache *ncp)
+{
+ struct namecache *pncp;
+ struct namecache *nsncp;
+ /*
+ * Splice the list ncp is on into the list of sncp, right after sncp.
+ * An entry with NULL links counts as a list of one and stands in for
+ * its own neighbour ("?:" with the middle operand left out is the
+ * GNU C shorthand for "x ? x : y").
+ */
+ pncp = ncp->nc_shadow_prev ?: ncp;
+ nsncp = sncp->nc_shadow_next ?: sncp;
+ /* The entry in front of ncp now leads on to the old successor of sncp ... */
+ pncp->nc_shadow_next = nsncp;
+ nsncp->nc_shadow_prev = pncp;
+ /* ... and sncp leads directly to ncp. */
+ sncp->nc_shadow_next = ncp;
+ ncp->nc_shadow_prev = sncp;
+}
+
+static __inline
+void
+cache_shadow_unlink(struct namecache *ncp)
+{
+ if (! ncp->nc_shadow_next) /* NULL next link: ncp is on no list, nothing to do */
+ return;
+ /* From here on ncp is linked into a list; take it out and clear its links. */
+ KKASSERT(ncp->nc_shadow_prev);
+ /*
+ * If both neighbours are the same entry, that entry is left alone and
+ * gets NULL links; otherwise the neighbours are joined to each other.
+ */
+ if (ncp->nc_shadow_prev == ncp->nc_shadow_next) {
+ ncp->nc_shadow_prev->nc_shadow_next = NULL;
+ ncp->nc_shadow_next->nc_shadow_prev = NULL;
+ } else {
+ ncp->nc_shadow_prev->nc_shadow_next = ncp->nc_shadow_next;
+ ncp->nc_shadow_next->nc_shadow_prev = ncp->nc_shadow_prev;
+ }
+
+ ncp->nc_shadow_prev = ncp->nc_shadow_next = NULL;
+}
+
+/*
+ * Join ncp into the shadow group of sncp.
+ *
+ * ncp must be unlocked on entry, while sncp must be locked on entry.
+ *
+ * The routine will fail and return ELOOP if the intended shadowing association
+ * doesn't make sense (currently this boils down to ncp being the same as
+ * sncp).
+ * It will fail with EEXIST if ncp gets resolved or acquires a shadow
+ * association from elsewhere during the attach attempt (it is possible due to
+ * the fact that ncp is unlocked).
+ *
+ * - On success ncp will be a representative of the joint shadow group, which
+ * then will be locked.
+ * - On failure the namecache entries will exist separately just as they did
+ * before; both entries will be locked.
+ */
+int
+cache_shadow_attach(struct namecache *ncp, struct namecache *sncp)
+{
+ struct migrate_param mpm;
+
+ if (ncp == sncp)
+ return(ELOOP);
+ /* Assert the locking state required above: sncp held by us, ncp not. */
+ KKASSERT(ncp->nc_shadowinfo->sh_locktd != curthread);
+ KKASSERT(sncp->nc_shadowinfo->sh_locktd == curthread);
+
+ cache_lock_two(ncp, sncp);
+ /* ncp was not locked until now: it must still be unresolved and at height 0. */
+ if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 || ncp->nc_shadowheight != 0)
+ return(EEXIST);
+ /*
+ * A sncp still on its embedded shadowinfo is first moved to one fetched
+ * from the pool; the new one takes over the current lock count.
+ */
+ if (sncp->nc_shadowinfo == &sncp->nc_shadowinfo_internal) {
+ mpm.heightdelta = 0;
+ mpm.shadowinfo = shadowinfo_fetch();
+ mpm.exlocks = sncp->nc_shadowinfo->sh_exlocks;
+ migrate_updater(sncp, &mpm);
+ }
+ /*
+ * Move every entry of ncp's group to the shadowinfo of sncp, raising
+ * each height by sncp's height + 1 and leaving the lock count alone
+ * (exlocks = -1), then splice the two lists together.
+ */
+ mpm.heightdelta = sncp->nc_shadowheight + 1;
+ mpm.shadowinfo = sncp->nc_shadowinfo;
+ mpm.exlocks = -1;
+
+ cache_group_walk(ncp, &migrate_updater, SG_ALL, &mpm);
+ cache_shadow_link(sncp, ncp);
+
+ return(0);
+}
+
+/*
+ * Take a namecache entry out of its shadow group.
+ *
+ * The shadow group must be locked upon entry.
+ *
+ * On return both the entry and its former group remain locked.
+ *
+ * The entry is put back on its embedded shadowinfo, which receives the
+ * group's lock count, and its height is reset to 0. If that leaves the
+ * group with a single entry, that entry gets the same treatment.
+ */
+void
+cache_shadow_detach(struct namecache *ncp)
+{
+ struct namecache *pncp, *nncp;
+ struct migrate_param mpm;
+ /* A NULL target makes migrate_updater() choose the embedded shadowinfo. */
+ mpm.shadowinfo = NULL;
+again:
+ mpm.heightdelta = -ncp->nc_shadowheight;
+ mpm.exlocks = ncp->nc_shadowinfo->sh_exlocks;
+ pncp = ncp->nc_shadow_prev;
+ nncp = ncp->nc_shadow_next;
+
+ migrate_updater(ncp, &mpm);
+ cache_shadow_unlink(ncp);
+ /* Both former neighbours being one and the same entry means it is alone now: detach it too. */
+ if (nncp && nncp == pncp) {
+ ncp = nncp;
+ goto again;
+ }
+}
+
+static int
+vref_updater(struct namecache *ncp, void *param)
+{ /* group walk updater; param points to an int: > 0 means vhold, anything else vdrop */
+ if (ncp->nc_vp) /* entries without a vnode are left alone */
+ *(int *)param > 0 ? vhold(ncp->nc_vp) : vdrop(ncp->nc_vp);
+ /* Returning 0 keeps the walk going. */
+ return(0);
}
/*
@@ -349,15 +604,21 @@ cache_lock(struct namecache *ncp)
{
thread_t td;
int didwarn;
+ struct shadowinfo *shinf;
KKASSERT(ncp->nc_refs != 0);
didwarn = 0;
td = curthread;
for (;;) {
- if (ncp->nc_exlocks == 0) {
- ncp->nc_exlocks = 1;
- ncp->nc_locktd = td;
+ shinf = ncp->nc_shadowinfo;
+ KKASSERT(shinf);
+ KKASSERT(shinf->sh_refs != 0);
+ if (shinf->sh_exlocks == 0) {
+ int ref = 1;
+
+ shinf->sh_exlocks = 1;
+ shinf->sh_locktd = td;
/*
* The vp associated with a locked ncp must be held
* to prevent it from being recycled (which would
@@ -365,16 +626,15 @@ cache_lock(struct namecache *ncp)
*
* XXX loop on race for later MPSAFE work.
*/
- if (ncp->nc_vp)
- vhold(ncp->nc_vp);
+ cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
break;
}
- if (ncp->nc_locktd == td) {
- ++ncp->nc_exlocks;
+ if (shinf->sh_locktd == td) {
+ ++shinf->sh_exlocks;
break;
}
- ncp->nc_flag |= NCF_LOCKREQ;
- if (tsleep(ncp, 0, "clock", nclockwarn) == EWOULDBLOCK) {
+ shinf->sh_lockreq = 1;
+ if (tsleep(shinf, 0, "clock", nclockwarn) == EWOULDBLOCK) {
if (didwarn)
continue;
didwarn = 1;
@@ -398,12 +658,17 @@ cache_lock_nonblock(struct namecache *nc
cache_lock_nonblock(struct namecache *ncp)
{
thread_t td;
+ struct shadowinfo *shinf = ncp->nc_shadowinfo;
KKASSERT(ncp->nc_refs != 0);
+ KKASSERT(shinf);
+ KKASSERT(shinf->sh_refs != 0);
td = curthread;
- if (ncp->nc_exlocks == 0) {
- ncp->nc_exlocks = 1;
- ncp->nc_locktd = td;
+ if (shinf->sh_exlocks == 0) {
+ int ref = 1;
+
+ shinf->sh_exlocks = 1;
+ shinf->sh_locktd = td;
/*
* The vp associated with a locked ncp must be held
* to prevent it from being recycled (which would
@@ -411,8 +676,7 @@ cache_lock_nonblock(struct namecache *nc
*
* XXX loop on race for later MPSAFE work.
*/
- if (ncp->nc_vp)
- vhold(ncp->nc_vp);
+ cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
return(0);
} else {
return(EWOULDBLOCK);
@@ -423,17 +687,45 @@ cache_unlock(struct namecache *ncp)
cache_unlock(struct namecache *ncp)
{
thread_t td = curthread;
+ struct shadowinfo *shinf = ncp->nc_shadowinfo;
KKASSERT(ncp->nc_refs > 0);
- KKASSERT(ncp->nc_exlocks > 0);
- KKASSERT(ncp->nc_locktd == td);
- if (--ncp->nc_exlocks == 0) {
- if (ncp->nc_vp)
- vdrop(ncp->nc_vp);
- ncp->nc_locktd = NULL;
- if (ncp->nc_flag & NCF_LOCKREQ) {
- ncp->nc_flag &= ~NCF_LOCKREQ;
- wakeup(ncp);
+ KKASSERT(shinf);
+ KKASSERT(shinf->sh_refs > 0);
+ KKASSERT(shinf->sh_exlocks > 0);
+ KKASSERT(shinf->sh_locktd == td);
+ if (shinf->sh_exlocks == 1) {
+ int ref = -1;
+ cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
+ }
+ if (--shinf->sh_exlocks == 0) {
+ shinf->sh_locktd = NULL;
+ if (shinf->sh_lockreq) {
+ shinf->sh_lockreq = 0;
+ wakeup(shinf);
+ }
+ }
+}
+
+/*
+ * Obtain the lock on both uncp and lncp.
+ *
+ * On entry, uncp is assumed to be unlocked, and lncp is assumed to be
+ * locked.
+ *
+ * A non-blocking attempt on uncp is made first. If that fails and uncp
+ * has the higher address, uncp is simply waited for while lncp stays
+ * locked; otherwise lncp is released and the two are locked in the
+ * order uncp, lncp.
+ *
+ * After this function returns, caller is responsible for checking
+ * the state of lncp which might have got unlocked temporarily.
+ */
+void
+cache_lock_two(struct namecache *uncp, struct namecache *lncp)
+{
+ if (cache_lock_nonblock(uncp) != 0) {
+ if (uncp > lncp)
+ cache_lock(uncp);
+ else {
+ cache_unlock(lncp);
+ cache_lock(uncp);
+ cache_lock(lncp);
}
}
}
@@ -453,7 +745,8 @@ cache_get_nonblock(struct namecache *ncp
cache_get_nonblock(struct namecache *ncp)
{
/* XXX MP */
- if (ncp->nc_exlocks == 0 || ncp->nc_locktd == curthread) {
+ if (ncp->nc_shadowinfo->sh_exlocks == 0 ||
+ ncp->nc_shadowinfo->sh_locktd == curthread) {
_cache_hold(ncp);
cache_lock(ncp);
return(0);
@@ -487,7 +780,7 @@ cache_setvp(struct namecache *ncp, struc
if (!TAILQ_EMPTY(&ncp->nc_list))
vhold(vp);
TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
- if (ncp->nc_exlocks)
+ if (ncp->nc_shadowinfo->sh_exlocks)
vhold(vp);
/*
@@ -521,6 +814,8 @@ cache_settimeout(struct namecache *ncp,
ncp->nc_timeout = 1;
}
+static int unresolver_updater(struct namecache *ncp, void *param);
+
/*
* Disassociate the vnode or negative-cache association and mark a
* namecache entry as unresolved again. Note that the ncp is still
@@ -541,7 +836,25 @@ void
void
cache_setunresolved(struct namecache *ncp)
{
+ struct namecache *nncp;
+
+ cache_group_walk(ncp, &unresolver_updater, SG_SUBTREE, ncp);
+
+ nncp = ncp->nc_shadow_next;
+ if (nncp)
+ cache_hold(nncp);
+ unresolver_updater(ncp, NULL);
+ if (nncp)
+ cache_put(nncp);
+}
+
+static int
+unresolver_updater(struct namecache *ncp, void *param)
+{
struct vnode *vp;
+
+ if (ncp == param)
+ return(0);
if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
ncp->nc_flag |= NCF_UNRESOLVED;
@@ -563,13 +876,23 @@ cache_setunresolved(struct namecache *nc
*/
if (!TAILQ_EMPTY(&ncp->nc_list))
vdrop(vp);
- if (ncp->nc_exlocks)
+ if (ncp->nc_shadowinfo->sh_exlocks)
vdrop(vp);
} else {
TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
--numneg;
}
- }
+
+ cache_shadow_detach(ncp);
+ }
+
+ if (ncp->nc_refs == 0) {
+ cache_hold(ncp);
+ cache_put(ncp);
+ }
+
+
+ return(0);
}
/*
@@ -619,7 +942,7 @@ cache_inval(struct namecache *ncp, int f
struct namecache *nextkid;
int rcnt = 0;
- KKASSERT(ncp->nc_exlocks);
+ KKASSERT(ncp->nc_shadowinfo->sh_exlocks);
cache_setunresolved(ncp);
if (flags & CINV_DESTROY)
@@ -715,6 +1038,7 @@ restart:
* XXX the disconnection could pose a problem, check code paths to make
* sure any code that blocks can handle the parent being changed out from
* under it. Maybe we should lock the children (watch out for deadlocks) ?
+ * [UPDATE: attempt made to lock children, see in situ explanation]
*
* After we return the caller has the option of calling cache_setvp() if
* the vnode of the new target ncp is known.
@@ -726,26 +1050,62 @@ cache_rename(struct namecache *fncp, str
cache_rename(struct namecache *fncp, struct namecache *tncp)
{
struct namecache *scan;
- int didwarn = 0;
-
+ int didwarn[] = { 0, 0 };
+
+ /* XXX should we rather make here a non-equality assertion? */
+ if (fncp == tncp)
+ return;
+
+again:
cache_setunresolved(fncp);
cache_setunresolved(tncp);
+
+ /*
+ * It seems we need to unlock fncp before calling cache_inval():
+ * cache_inval() does a lot of lock/unlock/relock-ing (with tncp
+ * and its children), therefore keeping fncp locked might be
+ * deadlocky...
+ */
+ cache_unlock(fncp);
+
while (cache_inval(tncp, CINV_CHILDREN) != 0) {
- if (didwarn++ % 10 == 0) {
- printf("Warning: cache_rename: race during "
+ if (didwarn[0]++ % 10 == 0) {
+ printf("Warning: cache_rename: race #1 during "
"rename %s->%s\n",
fncp->nc_name, tncp->nc_name);
}
tsleep(tncp, 0, "mvrace", hz / 10);
cache_setunresolved(tncp);
}
+
+ cache_unlock(tncp);
+ cache_lock(fncp);
+
while ((scan = TAILQ_FIRST(&fncp->nc_list)) != NULL) {
- cache_hold(scan);
+ cache_unlock(fncp);
+ /*
+ * We have to lock fncp's kids in order to unresolve
+ * their shadow kids...
+ */
+ cache_get(scan);
cache_unlink_parent(scan);
+ cache_group_walk(scan, &unresolver_updater, SG_SUBTREE, scan);
cache_link_parent(scan, tncp);
if (scan->nc_flag & NCF_HASHED)
cache_rehash(scan);
- cache_drop(scan);
+ cache_put(scan);
+ cache_lock(fncp);
+ }
+
+ cache_lock_two(tncp, fncp);
+
+ if ((fncp->nc_flag & tncp->nc_flag & NCF_UNRESOLVED) == 0) {
+ if (didwarn[1]++ % 10 == 0) {
+ printf("Warning: cache_rename: race #2 during "
+ "rename %s->%s\n",
+ fncp->nc_name, tncp->nc_name);
+ }
+ goto again;
}
}
@@ -1321,7 +1681,7 @@ cache_zap(struct namecache *ncp)
cache_drop(ncp);
return;
}
- KKASSERT(par->nc_exlocks == 0);
+ KKASSERT(par->nc_shadowinfo->sh_exlocks == 0);
cache_lock(ncp);
}
done:
@@ -1417,7 +1777,7 @@ restart:
if (ncp->nc_timeout &&
(int)(ncp->nc_timeout - ticks) < 0 &&
(ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
- ncp->nc_exlocks == 0
+ ncp->nc_shadowinfo->sh_exlocks == 0
) {
cache_zap(cache_get(ncp));
goto restart;
@@ -1738,6 +2098,7 @@ nchinit(void)
gd->gd_nchstats = &nchstats[i];
}
TAILQ_INIT(&ncneglist);
+ STAILQ_INIT(&shadowinfo_freeq);
nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
nclockwarn = 1 * hz;
}
diff -r e1d135a8f666 sys/sys/namecache.h
--- a/sys/sys/namecache.h Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/sys/namecache.h Wed Mar 29 10:23:26 2006 +0200
@@ -70,7 +70,20 @@
struct vnode;
+/*
+ * Auxiliary structure for locking namecache entries,
+ * either on their own or grouped into "shadow groups".
+ *
+ * A namecache entry reaches its lock state through nc_shadowinfo, which
+ * points either to the nc_shadowinfo_internal embedded in the entry or
+ * to a separately allocated structure shared by the entries of a group.
+ */
+struct shadowinfo {
+ STAILQ_ENTRY(shadowinfo) sh_entry; /* entry for free list */
+ int sh_exlocks; /* lock count; the owning thread may lock repeatedly */
+ struct thread *sh_locktd; /* thread holding the lock, NULL when unlocked */
+ int sh_refs; /* reference count */
+ uint8_t sh_lockreq :1; /* set by a waiter before sleeping; unlock then issues a wakeup */
+};
+
TAILQ_HEAD(namecache_list, namecache);
+LIST_HEAD(namecache_shadow_list, namecache);
/*
* The namecache structure is used to manage the filesystem namespace. Most
@@ -110,8 +123,12 @@ struct namecache {
char *nc_name; /* Separately allocated seg name */
int nc_error;
int nc_timeout; /* compared against ticks, or 0 */
- int nc_exlocks; /* namespace locking */
- struct thread *nc_locktd; /* namespace locking */
+ struct shadowinfo *nc_shadowinfo; /* namespace locking */
+ struct shadowinfo nc_shadowinfo_internal; /* private locking information */
+ struct namecache *nc_shadow_prev; /* previous entry in shadow group */
+ struct namecache *nc_shadow_next; /* next entry in shadow group */
+ int nc_shadowheight; /* measure within shadow group */
+ struct namecache *nc_shadowed; /* lower layer entry in layered fs */
struct mount *nc_mount; /* associated mount for vopops */
int64_t nc_fsmid; /* filesystem modified id */
};
@@ -127,7 +144,7 @@ typedef struct namecache *namecache_t;
#define NCF_MOUNTPT 0x0008 /* mount point */
#define NCF_ROOT 0x0010 /* namecache root (static) */
#define NCF_HASHED 0x0020 /* namecache entry in hash table */
-#define NCF_LOCKREQ 0x0040
+#define NCF_UNUSED040 0x0040
#define NCF_UNUSED080 0x0080
#define NCF_ISSYMLINK 0x0100 /* represents a symlink */
#define NCF_ISDIR 0x0200 /* represents a directory */
@@ -150,6 +167,9 @@ void cache_lock(struct namecache *ncp);
void cache_lock(struct namecache *ncp);
int cache_lock_nonblock(struct namecache *ncp);
void cache_unlock(struct namecache *ncp);
+void cache_lock_two(struct namecache *uncp, struct namecache *lncp);
+int cache_shadow_attach(struct namecache *ncp, struct namecache *sncp);
+void cache_shadow_detach(struct namecache *ncp);
void cache_setvp(struct namecache *ncp, struct vnode *vp);
void cache_settimeout(struct namecache *ncp, int nticks);
void cache_setunresolved(struct namecache *ncp);
diff -r e1d135a8f666 sys/vfs/nullfs/null.h
--- a/sys/vfs/nullfs/null.h Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null.h Wed Mar 29 10:29:08 2006 +0200
@@ -44,17 +44,26 @@ struct null_args {
};
struct null_mount {
- struct mount *nullm_vfs;
- struct vnode *nullm_rootvp; /* Reference to root null_node */
+ struct namecache *nullm_ncp;
};
#ifdef _KERNEL
+
#define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
-
#ifdef NULLFS_DEBUG
-#define NULLFSDEBUG(format, args...) printf(format ,## args)
+#define NULLFSDEBUG(format, args...) \
+ printf(" [nullfs] %s:%d: " format, __func__, __LINE__, ## args)
+#define NULLNCDEBUG(ncp) \
+ NULLFSDEBUG(#ncp " %p: name %s, refs %d, exlocks %d, nc_flag 0x%x, " \
+ "nc_mount %p, nc_shadowed %p, nc_shadowinfo %p, " \
+ "nc_shadowheight %d, nc_vp %p\n", \
+ (ncp), (ncp)->nc_name, (ncp)->nc_refs, \
+ (ncp)->nc_shadowinfo->sh_exlocks, (ncp)->nc_flag, \
+ (ncp)->nc_mount, (ncp)->nc_shadowed, \
+ (ncp)->nc_shadowinfo, (ncp)->nc_shadowheight, (ncp)->nc_vp)
#else
#define NULLFSDEBUG(format, args...)
+#define NULLNCDEBUG(ncp)
#endif /* NULLFS_DEBUG */
#endif /* _KERNEL */
diff -r e1d135a8f666 sys/vfs/nullfs/null_vfsops.c
--- a/sys/vfs/nullfs/null_vfsops.c Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null_vfsops.c Wed Mar 29 10:29:08 2006 +0200
@@ -53,6 +53,7 @@
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/nlookup.h>
+#include <sys/namecache.h>
#include "null.h"
extern struct vnodeopv_entry_desc null_vnodeop_entries[];
@@ -80,12 +81,10 @@ nullfs_mount(struct mount *mp, char *pat
{
int error = 0;
struct null_args args;
- struct vnode *rootvp;
struct null_mount *xmp;
u_int size;
- struct nlookupdata nd;
-
- NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);
+
+ NULLFSDEBUG("mp %p\n", (void *)mp);
/*
* Update is a no-op
@@ -98,118 +97,79 @@ nullfs_mount(struct mount *mp, char *pat
* Get argument
*/
error = copyin(data, (caddr_t)&args, sizeof(struct null_args));
- if (error)
+
+ xmp = malloc(sizeof(*xmp), M_NULLFSMNT, M_WAITOK | M_ZERO);
+
+ NULLFSDEBUG("nlookup %s\n", args.target);
+
+ xmp->nullm_ncp = nlookup_simple(args.target,
+ UIO_SYSSPACE, NLC_FOLLOW, &error);
+
+ if (! xmp->nullm_ncp) {
+ free(xmp, M_NULLFSMNT);
return (error);
-
- /*
- * Find lower node
- */
- rootvp = NULL;
- error = nlookup_init(&nd, args.target, UIO_USERSPACE, NLC_FOLLOW);
- if (error == 0)
- error = nlookup(&nd);
- if (error == 0) {
- error = cache_vget(nd.nl_ncp, nd.nl_cred, LK_EXCLUSIVE,
- &rootvp);
- }
-
- xmp = (struct null_mount *) malloc(sizeof(struct null_mount),
- M_NULLFSMNT, M_WAITOK); /* XXX */
-
- /*
- * Save reference to underlying FS
- */
- /*
- * As lite stacking enters the scene, the old way of doing this
- * -- via the vnode -- is not good enough anymore...
- */
- xmp->nullm_vfs = nd.nl_ncp->nc_mount;
- nlookup_done(&nd);
-
- vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops,
- null_vnodeop_entries, 0);
-
- VOP_UNLOCK(rootvp, 0, td);
-
- /*
- * Keep a held reference to the root vnode.
- * It is vrele'd in nullfs_unmount.
- */
- xmp->nullm_rootvp = rootvp;
- /*
- * XXX What's the proper safety condition for querying
- * the underlying mount? Is this flag tuning necessary
- * at all?
- */
- if (xmp->nullm_vfs->mnt_flag & MNT_LOCAL)
+ }
+
+ cache_unlock(xmp->nullm_ncp);
+
+ vfs_add_vnodeops(mp, &mp->mnt_vn_norm_ops, null_vnodeop_entries, 0);
+
+ if (xmp->nullm_ncp->nc_mount->mnt_flag & MNT_LOCAL)
mp->mnt_flag |= MNT_LOCAL;
- mp->mnt_data = (qaddr_t) xmp;
+ mp->mnt_data = (void *)xmp;
vfs_getnewfsid(mp);
(void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
&size);
bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
- (void)nullfs_statfs(mp, &mp->mnt_stat, td);
- NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
- mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromname);
- return (0);
-}
-
-/*
- * Free reference to null layer
- */
+ NULLFSDEBUG("lower %s, alias at %s\n",
+ mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
+ return (0);
+}
+
static int
nullfs_unmount(struct mount *mp, int mntflags, struct thread *td)
{
- void *mntdata;
- int flags = 0;
-
- NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
-
- if (mntflags & MNT_FORCE)
- flags |= FORCECLOSE;
-
- /*
- * Finally, throw away the null_mount structure
- */
- mntdata = mp->mnt_data;
- mp->mnt_data = 0;
- free(mntdata, M_NULLFSMNT);
- return 0;
+ NULLNCDEBUG(mp->mnt_ncp);
+
+ cache_drop(MOUNTTONULLMOUNT(mp)->nullm_ncp);
+ free(mp->mnt_data, M_NULLFSMNT);
+
+ return (0);
+}
+
+static int
+nullfs_start(struct mount *mp, int flags, struct thread *td)
+{ /* flags and td are not used */
+ mp->mnt_ncp->nc_shadowed = MOUNTTONULLMOUNT(mp)->nullm_ncp; /* lower entry stored by nullfs_mount() */
+ /* Nothing above can fail. */
+ return (0);
}
static int
nullfs_root(struct mount *mp, struct vnode **vpp)
{
- struct thread *td = curthread; /* XXX */
- struct vnode *vp;
-
- NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", (void *)mp,
- (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
-
- /*
- * Return locked reference to root.
- */
- vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp;
- vref(vp);
-
-#ifdef NULLFS_DEBUG
- if (VOP_ISLOCKED(vp, NULL)) {
- Debugger("root vnode is locked.\n");
- vrele(vp);
- return (EDEADLK);
- }
-#endif
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
- *vpp = vp;
- return 0;
+ int error;
+
+ error = cache_vget(MOUNTTONULLMOUNT(mp)->nullm_ncp,
+ crhold(proc0.p_ucred), LK_EXCLUSIVE | LK_RETRY, vpp);
+ crfree(proc0.p_ucred);
+
+ return (error);
+}
+
+static __inline
+struct mount *
+nullfs_lowermount_0(struct mount *mp)
+{ /* lower mount = nc_mount of the namecache entry stored in the null_mount by nullfs_mount() */
+ return (MOUNTTONULLMOUNT(mp)->nullm_ncp->nc_mount);
}
static int
nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, caddr_t arg,
struct thread *td)
{
- return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, td);
+ return VFS_QUOTACTL(nullfs_lowermount_0(mp), cmd, uid, arg, td);
}
static int
@@ -218,12 +178,12 @@ nullfs_statfs(struct mount *mp, struct s
int error;
struct statfs mstat;
- NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p)\n", (void *)mp,
- (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp);
+ NULLFSDEBUG("mp %p, ncp %p, lower mp %p\n",
+ mp, mp->mnt_ncp, nullfs_lowermount_0(mp));
bzero(&mstat, sizeof(mstat));
- error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, td);
+ error = VFS_STATFS(nullfs_lowermount_0(mp), &mstat, td);
if (error)
return (error);
@@ -248,23 +208,21 @@ nullfs_checkexp(struct mount *mp, struct
nullfs_checkexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
struct ucred **credanonp)
{
-
- return VFS_CHECKEXP(MOUNTTONULLMOUNT(mp)->nullm_vfs, nam,
- extflagsp, credanonp);
+ return VFS_CHECKEXP(nullfs_lowermount_0(mp), nam, extflagsp, credanonp);
}
static int
nullfs_extattrctl(struct mount *mp, int cmd, const char *attrname, caddr_t arg,
struct thread *td)
{
- return VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, attrname,
- arg, td);
+ return VFS_EXTATTRCTL(nullfs_lowermount_0(mp), cmd, attrname, arg, td);
}
static struct vfsops null_vfsops = {
.vfs_mount = nullfs_mount,
.vfs_unmount = nullfs_unmount,
+ .vfs_start = nullfs_start,
.vfs_root = nullfs_root,
.vfs_quotactl = nullfs_quotactl,
.vfs_statfs = nullfs_statfs,
diff -r e1d135a8f666 sys/vfs/nullfs/null_vnops.c
--- a/sys/vfs/nullfs/null_vnops.c Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/vfs/nullfs/null_vnops.c Wed Mar 29 10:29:08 2006 +0200
@@ -98,7 +98,7 @@
* might be able to get on with a hybrid solution: overlay some vnodes, and rely
* on namecache API for the rest.
*/
-
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@@ -109,6 +109,8 @@
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/buf.h>
+#include <sys/namecache.h>
+#include <sys/nlookup.h>
#include "null.h"
static int null_nresolve(struct vop_nresolve_args *ap);
@@ -122,90 +124,190 @@ static int null_nrmdir(struct vop_nrmdir
static int null_nrmdir(struct vop_nrmdir_args *ap);
static int null_nrename(struct vop_nrename_args *ap);
+static __inline
+struct mount *
+nullfs_lowermount_l(struct namecache *ncp)
+{
+ /*
+ * Return the mount of the entry recorded in ncp->nc_shadowed.
+ * nc_shadowed is dereferenced without a check, so it must be set.
+ *
+ * The code in use below allows passing through lower mounts.
+ * If we didn't want to do that, we could use
+ *
+ * MOUNTTONULLMOUNT(ncp->nc_mount)->nullm_ncp->nc_mount
+ *
+ * Eventually, the choice might be configurable.
+ */
+ return (ncp->nc_shadowed->nc_mount);
+}
+
+
+static __inline
+int
+nullfs_check(struct namecache *ncp)
+{ /* returns 0 if ncp passes all three checks below, otherwise an errno value */
+ if (ncp->nc_mount->mnt_ncp == ncp) /* ncp is the mnt_ncp entry of its own mount */
+ return (EPERM);
+
+ if (!ncp->nc_shadowed) /* no lower entry recorded for ncp */
+ return (ENOENT);
+ /* cache_shadow_attach() puts the upper entry at height >= 1; 0 means ncp is not attached that way. */
+ if (ncp->nc_shadowheight == 0)
+ return (EINVAL);
+
+ return (0);
+}
+
static int
null_nresolve(struct vop_nresolve_args *ap)
{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nresolve_ap(ap);
-}
-
-static int
-null_ncreate(struct vop_ncreate_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_ncreate_ap(ap);
-}
-
-static int
-null_nmkdir(struct vop_nmkdir_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nmkdir_ap(ap);
-}
-
-static int
-null_nmknod(struct vop_nmknod_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nmknod_ap(ap);
-}
-
-static int
-null_nlink(struct vop_nlink_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nlink_ap(ap);
-}
-
-static int
-null_nsymlink(struct vop_nsymlink_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nsymlink_ap(ap);
-}
-
-static int
-null_nwhiteout(struct vop_nwhiteout_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nwhiteout_ap(ap);
-}
-
-static int
-null_nremove(struct vop_nremove_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nremove_ap(ap);
-}
-
-static int
-null_nrmdir(struct vop_nrmdir_args *ap)
-{
- ap->a_head.a_ops = MOUNTTONULLMOUNT(ap->a_ncp->nc_mount)->nullm_vfs->mnt_vn_norm_ops;
-
- return vop_nrmdir_ap(ap);
-}
+ struct namecache *ncp = ap->a_ncp;
+ struct nlcomponent nlc;
+ struct namecache *sncp, *psncp;
+ int error = 0;
+
+ sncp = ncp->nc_shadow_next;
+ if (sncp) {
+ cache_hold(sncp);
+ cache_setunresolved(sncp);
+ cache_put(sncp);
+ }
+
+ cache_unlock(ncp);
+ cache_lock(ncp->nc_parent);
+ psncp = ncp->nc_parent->nc_shadowed;
+ if (psncp)
+ cache_hold(psncp);
+ cache_unlock(ncp->nc_parent);
+
+ if (! psncp) {
+ cache_lock(ncp);
+ if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
+ cache_setvp(ncp, NULL);
+ return (ncp->nc_error);
+ }
+
+ nlc.nlc_nameptr = ncp->nc_name;
+ nlc.nlc_namelen = ncp->nc_nlen;
+ sncp = cache_nlookup(psncp, &nlc);
+ cache_drop(psncp);
+
+ if ((sncp->nc_flag & NCF_UNRESOLVED) == 0)
+ goto postdowncall;
+
+ ap->a_head.a_ops = sncp->nc_mount->mnt_vn_use_ops;
+ ap->a_ncp = sncp;
+ /*
+ * According to cache_resolve(), the primary place for
+ * VOP_NRESOLVE calls, the caller of the nresolve method
+ * is the one who should take care about ncp->nc_error.
+ */
+ ap->a_ncp->nc_error = vop_nresolve_ap(ap);
+
+postdowncall:
+
+ error = cache_shadow_attach(ncp, sncp);
+
+ NULLNCDEBUG(ncp);
+ NULLNCDEBUG(sncp);
+ NULLFSDEBUG("attach error %d\n", error);
+
+ if (error) {
+ cache_put(sncp);
+ if (ncp->nc_flag & NCF_UNRESOLVED) {
+ cache_setvp(ncp, NULL);
+ error = ENOENT;
+ } else if (error == EEXIST)
+ error = ncp->nc_error;
+ } else {
+ error = sncp->nc_error;
+ cache_setvp(ncp, sncp->nc_vp);
+ ncp->nc_shadowed = sncp;
+ cache_drop(sncp);
+ }
+
+ NULLFSDEBUG("error %d\n", error);
+ return (error);
+}
+
+#define NULL_NVOP_TEMPLATE(OP) /* defines null_<OP>(): passes the vop_<OP> call on to the lower layer */ \
+static int \
+null_ ## OP(struct vop_ ## OP ## _args *ap) \
+{ \
+ struct namecache *ncp = ap->a_ncp; \
+ struct namecache *sncp = ncp->nc_shadowed; \
+ int error; \
+ \
+ NULLNCDEBUG(ap->a_ncp); \
+ /* Entries failing nullfs_check() are refused; otherwise the lower entry is held for the call. */ \
+ if ((error = nullfs_check(ncp))) \
+ return (error); \
+ cache_hold(sncp); \
+ \
+ NULLNCDEBUG(ap->a_ncp->nc_shadowed); \
+ /* Redirect the arguments: ops vector of the lower mount, lower entry as a_ncp. */ \
+ ap->a_head.a_ops = nullfs_lowermount_l(ap->a_ncp)->mnt_vn_use_ops; \
+ ap->a_ncp = ncp->nc_shadowed; \
+ \
+ error = vop_ ## OP ## _ap(ap); \
+ NULLNCDEBUG(ncp); \
+ NULLNCDEBUG(sncp); \
+ sncp->nc_shadowinfo == ncp->nc_shadowinfo ? /* lower entry shares ncp's shadowinfo? */ \
+ cache_drop(sncp) : \
+ cache_put(sncp); \
+ \
+ return (error); \
+}
+
+NULL_NVOP_TEMPLATE(ncreate)
+NULL_NVOP_TEMPLATE(nmkdir)
+NULL_NVOP_TEMPLATE(nmknod)
+NULL_NVOP_TEMPLATE(nlink)
+NULL_NVOP_TEMPLATE(nsymlink)
+NULL_NVOP_TEMPLATE(nwhiteout)
+NULL_NVOP_TEMPLATE(nremove)
+NULL_NVOP_TEMPLATE(nrmdir)
static int
null_nrename(struct vop_nrename_args *ap)
{
+ struct namecache *fncp = ap->a_fncp;
+ struct namecache *tncp = ap->a_tncp;
+ struct namecache *sfncp = fncp->nc_shadowed;
+ struct namecache *stncp = tncp->nc_shadowed;
struct mount *lmp;
-
- lmp = MOUNTTONULLMOUNT(ap->a_fncp->nc_mount)->nullm_vfs;
- if (lmp != MOUNTTONULLMOUNT(ap->a_tncp->nc_mount)->nullm_vfs)
- return (EINVAL);
-
- ap->a_head.a_ops = lmp->mnt_vn_norm_ops;
-
- return vop_nrename_ap(ap);
+ int error;
+
+ NULLNCDEBUG(ap->a_fncp);
+ NULLNCDEBUG(ap->a_tncp);
+
+ if ((error = nullfs_check(fncp)))
+ return (error);
+ if ((error = nullfs_check(tncp)))
+ return (error);
+
+ lmp = nullfs_lowermount_l(fncp);
+ if (lmp != nullfs_lowermount_l(tncp))
+ return (EXDEV);
+
+ cache_hold(sfncp);
+ cache_hold(stncp);
+
+ NULLNCDEBUG(ap->a_fncp->nc_shadowed);
+ NULLNCDEBUG(ap->a_tncp->nc_shadowed);
+
+ ap->a_head.a_ops = lmp->mnt_vn_use_ops;
+ ap->a_fncp = fncp->nc_shadowed;
+ ap->a_tncp = tncp->nc_shadowed;
+
+ error = vop_nrename_ap(ap);
+
+ sfncp->nc_shadowinfo == fncp->nc_shadowinfo ?
+ cache_drop(sfncp) :
+ cache_put(sfncp);
+ stncp->nc_shadowinfo == tncp->nc_shadowinfo ?
+ cache_drop(stncp) :
+ cache_put(stncp);
+
+ return (error);
}
/*
@@ -224,4 +326,3 @@ struct vnodeopv_entry_desc null_vnodeop_
{ &vop_nrename_desc, (vnodeopv_entry_t) null_nrename },
{ NULL, NULL }
};
-
More information about the Submit
mailing list