Tentitive UFS patch to narrow down dup alloc panics

Matthew Dillon dillon at apollo.backplane.com
Thu Jan 13 13:22:42 PST 2005


    I would appreciate it if everyone who regularly gets 'dup alloc' inode
    panics would apply this patch, install a new kernel, reboot into single
    user, and fsck (to get a 'clean slate'), then reboot normally and see if
    the dup alloc panic still occurs.

    I am trying to narrow down the possible sources of the bug.  The
    dup alloc (inode) panic also occurs in FreeBSD 5.3 and FreeBSD 4.x,
    and I think they are all related.  But finding the problem is proving
    to be difficult due to its delayed-action nature.

    This patch prevents the UFS filesystem from reallocating an inode
    that has not yet been completely disposed of by the system.  There
    is a possible race in the vnode disposal code where the inode hash
    table is inconsistent with the vnode state.  If an inode is reallocated
    at that point it is entirely possible that the on-disk inode structure
    could become inconsistent with the actual state of the system.

    If you get any 'conflict with vnode ...' messages on the console/dmesg
    using this patch, please report them immediately along with what you
    were doing at the time.

						-Matt


Index: ffs_alloc.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ffs_alloc.c,v
retrieving revision 1.11
diff -u -r1.11 ffs_alloc.c
--- ffs_alloc.c	9 Aug 2004 19:41:04 -0000	1.11
+++ ffs_alloc.c	13 Jan 2005 21:09:01 -0000
@@ -1282,6 +1282,15 @@
  *   1) allocate the requested inode.
  *   2) allocate the next available inode after the requested
  *      inode in the specified cylinder group.
+ *   3) the inode must not already be in the inode hash table.  We
+ *	can encounter such a case because the vnode reclamation sequence
+ *	frees the bit
+ *   3) the inode must not already be in the inode hash, otherwise it
+ *	may be in the process of being deallocated.  This can occur
+ *	because the bitmap is updated before the inode is removed from
+ *	hash.  If we were to reallocate the inode the caller could wind
+ *	up returning a vnode/inode combination which is in an indeterminate
+ *	state.
  */
 static ino_t
 ffs_nodealloccg(struct inode *ip, int cg, ufs_daddr_t ipref, int mode)
@@ -1290,7 +1299,9 @@
 	struct cg *cgp;
 	struct buf *bp;
 	uint8_t *inosused;
-	int error, start, len, loc, map, i;
+	int error, len, map, i;
+	int icheckmiss;
+	ufs_daddr_t ibase;
 
 	fs = ip->i_fs;
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
@@ -1306,43 +1317,87 @@
 		brelse(bp);
 		return (0);
 	}
-	bp->b_xflags |= BX_BKGRDWRITE;
-	cgp->cg_time = time_second;
 	inosused = cg_inosused(cgp);
+	icheckmiss = 0;
+
+	/*
+	 * Quick check, reuse the most recently free inode or continue
+	 * a scan from where we left off the last time.
+	 */
+	ibase = cg * fs->fs_ipg;
 	if (ipref) {
 		ipref %= fs->fs_ipg;
-		if (isclr(inosused, ipref))
-			goto gotit;
-	}
-	start = cgp->cg_irotor / NBBY;
-	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
-	loc = skpc(0xff, len, &inosused[start]);
-	if (loc == 0) {
-		len = start + 1;
-		start = 0;
-		loc = skpc(0xff, len, &inosused[0]);
-		if (loc == 0) {
-			printf("cg = %d, irotor = %ld, fs = %s\n",
-			    cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
-			panic("ffs_nodealloccg: map corrupted");
-			/* NOTREACHED */
+		if (isclr(inosused, ipref)) {
+			if (ufs_ihashcheck(ip->i_dev, ibase + ipref) == 0)
+				goto gotit;
 		}
 	}
-	i = start + len - loc;
-	map = inosused[i];
-	ipref = i * NBBY;
-	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
-		if ((map & i) == 0) {
-			cgp->cg_irotor = ipref;
-			goto gotit;
+
+	/*
+	 * Scan the inode bitmap starting at irotor, be sure to handle
+	 * the edge case by going back to the beginning of the array.
+	 * Note that 'len' can go negative.  Start the scan at bit 0 to
+	 * simplify the code.
+	 */
+	ipref = (cgp->cg_irotor % fs->fs_ipg) & ~7;
+	map = inosused[ipref >> 3];
+	len = fs->fs_ipg;
+
+	while (len > 0) {
+		if (map != (1 << NBBY) - 1) {
+			for (i = 0; i < NBBY; ++i) {
+				/*
+				 * If we find a free bit we have to make sure
+				 * that the inode is not in the middle of
+				 * being destroyed.  The inode should not exist
+				 * in the inode hash.
+				 *
+				 * Adjust the rotor to try to hit the 
+				 * quick-check up above.
+				 */
+				if ((map & (1 << i)) == 0) {
+					KKASSERT(ipref + i < fs->fs_ipg);
+					if (ufs_ihashcheck(ip->i_dev, ibase + ipref + i) == 0) {
+						ipref += i;
+						cgp->cg_irotor = (ipref + 1) % fs->fs_ipg;
+						goto gotit;
+					}
+					++icheckmiss;
+				}
+			}
+		}
+
+		/*
+		 * Setup for the next byte.  If we hit the edge make sure to
+		 * adjust the remaining length only by the number of bits in
+		 * the last byte.
+		 */
+		ipref += NBBY;
+		if (ipref >= fs->fs_ipg) {
+			ipref = 0;
+			len -= fs->fs_ipg & 7;
+		} else {
+			len -= NBBY;
 		}
+		map = inosused[ipref >> 3];
+	}
+	if (icheckmiss == cgp->cg_cs.cs_nifree) {
+		brelse(bp);
+		return(0);
 	}
 	printf("fs = %s\n", fs->fs_fsmnt);
-	panic("ffs_nodealloccg: block not in map");
+	panic("ffs_nodealloccg: block not in map, icheckmiss/nfree %d/%d",
+		icheckmiss, cgp->cg_cs.cs_nifree);
 	/* NOTREACHED */
 gotit:
+	if (icheckmiss) {
+		printf("Warning: inode free race avoided %d times\n",
+			icheckmiss);
+	}
+	bp->b_xflags |= BX_BKGRDWRITE;
+	cgp->cg_time = time_second;
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
+		softdep_setup_inomapdep(bp, ip, ibase + ipref);
 	setbit(inosused, ipref);
 	cgp->cg_cs.cs_nifree--;
 	fs->fs_cstotal.cs_nifree--;
@@ -1354,7 +1409,7 @@
 		fs->fs_cs(fs, cg).cs_ndir++;
 	}
 	bdwrite(bp);
-	return (cg * fs->fs_ipg + ipref);
+	return (ibase + ipref);
 }
 
 /*
Index: ufs_extern.h
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ufs_extern.h,v
retrieving revision 1.10
diff -u -r1.10 ufs_extern.h
--- ufs_extern.h	12 Nov 2004 00:09:52 -0000	1.10
+++ ufs_extern.h	8 Jan 2005 01:22:55 -0000
@@ -78,6 +78,7 @@
 int	 ufs_getlbns(struct vnode *, ufs_daddr_t, struct indir *, int *);
 struct vnode *
 	 ufs_ihashget(dev_t, ino_t);
+int	 ufs_ihashcheck(dev_t, ino_t);
 void	 ufs_ihashinit(void);
 int	 ufs_ihashins(struct inode *);
 struct vnode *
Index: ufs_ihash.c
===================================================================
RCS file: /cvs/src/sys/vfs/ufs/ufs_ihash.c,v
retrieving revision 1.14
diff -u -r1.14 ufs_ihash.c
--- ufs_ihash.c	12 Oct 2004 19:21:12 -0000	1.14
+++ ufs_ihash.c	8 Jan 2005 04:25:58 -0000
@@ -137,6 +137,33 @@
 }
 
 /*
+ * Check to see if an inode is in the hash table.  This is used to interlock
+ * file free operations to ensure that the vnode is not reused due to a
+ * reallocate of its inode number before we have had a chance to recycle it.
+ */
+int
+ufs_ihashcheck(dev_t dev, ino_t inum)
+{
+	lwkt_tokref ilock;
+	struct inode *ip;
+
+	lwkt_gettoken(&ilock, &ufs_ihash_token);
+	for (ip = *INOHASH(dev, inum); ip; ip = ip->i_next) {
+		if (inum == ip->i_number && dev == ip->i_dev) {
+			if (ip->i_vnode) {
+			    printf("conflict with vnode %p", ip->i_vnode);
+			    if (TAILQ_FIRST(&ip->i_vnode->v_namecache))
+				printf(" ncp %s", TAILQ_FIRST(&ip->i_vnode->v_namecache)->nc_name);
+			    printf("\n");
+			}
+			break;
+		}
+	}
+	lwkt_reltoken(&ilock);
+	return(ip ? 1 : 0);
+}
+
+/*
  * Insert the inode into the hash table, and return it locked.
  */
 int





More information about the Bugs mailing list