ccd on NATA problem - ata0: FAILURE - oversized DMA transfer attempt 73728 > 65536

Matthew Dillon dillon at apollo.backplane.com
Wed Jul 18 21:24:04 PDT 2007


    Please test this patch.  It survived a couple of buildworld runs
    on my test box with the chunk size forced to 4K, but it does mess
    with low-level disk I/O, so make sure anything important is backed
    up first.

						-Matt

Index: kern/subr_diskgpt.c
===================================================================
RCS file: /cvs/src/sys/kern/subr_diskgpt.c,v
retrieving revision 1.3
diff -u -p -r1.3 subr_diskgpt.c
--- kern/subr_diskgpt.c	19 Jun 2007 06:07:57 -0000	1.3
+++ kern/subr_diskgpt.c	19 Jul 2007 02:42:50 -0000
@@ -136,6 +136,9 @@ 		error = EINVAL;
 		goto done;
 	}
 
+	/*
+	 * XXX subject to device dma size limitations
+	 */
 	bp2 = geteblk((int)(table_blocks * info->d_media_blksize));
 	bp2->b_bio1.bio_offset = (off_t)table_lba * info->d_media_blksize;
 	bp2->b_bcount = table_blocks * info->d_media_blksize;
Index: kern/subr_disklabel64.c
===================================================================
RCS file: /cvs/src/sys/kern/subr_disklabel64.c,v
retrieving revision 1.4
diff -u -p -r1.4 subr_disklabel64.c
--- kern/subr_disklabel64.c	19 Jun 2007 06:39:06 -0000	1.4
+++ kern/subr_disklabel64.c	19 Jul 2007 02:43:56 -0000
@@ -118,6 +118,9 @@ 	size_t dlpcrcsize;
 	size_t bpsize;
 	int secsize;
 
+	/*
+	 * XXX I/O size is subject to device DMA limitations
+	 */
 	secsize = info->d_media_blksize;
 	bpsize = (sizeof(*dlp) + secsize - 1) & ~(secsize - 1);
 
@@ -289,6 +292,9 @@ 	int secsize;
 
 	lp = lpx.lab64;
 
+	/*
+	 * XXX I/O size is subject to device DMA limitations
+	 */
 	secsize = ssp->dss_secsize;
 	bpsize = (sizeof(*lp) + secsize - 1) & ~(secsize - 1);
 
Index: kern/vfs_aio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_aio.c,v
retrieving revision 1.41
diff -u -p -r1.41 vfs_aio.c
--- kern/vfs_aio.c	29 Jun 2007 21:54:08 -0000	1.41
+++ kern/vfs_aio.c	19 Jul 2007 02:49:35 -0000
@@ -944,8 +944,12 @@ 	bp->b_error = 0;
 
 	crit_exit();
 	
-	/* Perform transfer. */
-	dev_dstrategy(vp->v_rdev, &bp->b_bio1);
+	/*
+	 * Perform the transfer.  vn_strategy must be used even though we
+	 * know we have a device in order to deal with requests which exceed
+	 * device DMA limitations.
+	 */
+	vn_strategy(vp, &bp->b_bio1);
 
 	notify = 0;
 	crit_enter();
Index: vfs/specfs/spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/specfs/spec_vnops.c,v
retrieving revision 1.51
diff -u -p -r1.51 spec_vnops.c
--- vfs/specfs/spec_vnops.c	9 May 2007 00:53:36 -0000	1.51
+++ vfs/specfs/spec_vnops.c	19 Jul 2007 02:39:25 -0000
@@ -60,6 +60,15 @@ #include <sys/buf2.h>
 
 #include <sys/thread2.h>
 
+/*
+ * Specfs chained debugging (bitmask)
+ *
+ * 0 - disable debugging
+ * 1 - report chained I/Os
+ * 2 - force 4K chained I/Os
+ */
+#define SPEC_CHAIN_DEBUG	0
+
 static int	spec_advlock (struct vop_advlock_args *);  
 static int	spec_bmap (struct vop_bmap_args *);
 static int	spec_close (struct vop_close_args *);
@@ -75,6 +84,7 @@ 
 static int	spec_read (struct vop_read_args *);  
 static int	spec_strategy (struct vop_strategy_args *);
 static int	spec_write (struct vop_write_args *);
+static void	spec_strategy_done(struct bio *nbio);
 
 struct vop_ops spec_vnode_vops = {
 	.vop_default =		vop_defaultop,
@@ -438,7 +448,9 @@ 	return (0);
 }
 
 /*
- * Just call the device strategy routine
+ * Convert a vnode strategy call into a device strategy call.  Vnode strategy
+ * calls are not subject to device DMA limits, so we have to handle the
+ * case where a request exceeds them.
  *
  * spec_strategy(struct vnode *a_vp, struct bio *a_bio)
  */
@@ -447,8 +459,11 @@  */
 {
 	struct bio *bio = ap->a_bio;
 	struct buf *bp = bio->bio_buf;
+	struct buf *nbp;
 	struct vnode *vp;
 	struct mount *mp;
+	int chunksize;
+	int maxiosize;
 
 	if (bp->b_cmd != BUF_CMD_READ &&
 	    (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start) {
@@ -474,11 +489,151 @@ 			else
 				mp->mnt_stat.f_syncwrites++;
 		}
 	}
-	dev_dstrategy_chain(vp->v_rdev, bio);
+
+	/*
+	 * Device iosize limitations only apply to read and write.  Shortcut
+	 * the I/O if it fits.
+	 */
+	maxiosize = vp->v_rdev->si_iosize_max;
+#if SPEC_CHAIN_DEBUG & 2
+	maxiosize = 4096;
+#endif
+	if (bp->b_bcount <= maxiosize ||
+	    (bp->b_cmd != BUF_CMD_READ && bp->b_cmd != BUF_CMD_WRITE)) {
+		dev_dstrategy_chain(vp->v_rdev, bio);
+		return (0);
+	}
+
+	/*
+	 * Clone the buffer and set up an I/O chain to chunk up the I/O.
+	 */
+	nbp = kmalloc(sizeof(*bp), M_DEVBUF, M_INTWAIT|M_ZERO);
+	initbufbio(nbp);
+	LIST_INIT(&nbp->b_dep);
+	BUF_LOCKINIT(nbp);
+	BUF_LOCK(nbp, LK_EXCLUSIVE);
+	BUF_KERNPROC(nbp);
+	nbp->b_vp = vp;
+	nbp->b_flags = B_PAGING | (bp->b_flags & B_BNOCLIP);
+	nbp->b_data = bp->b_data;
+	nbp->b_bio1.bio_done = spec_strategy_done;
+	nbp->b_bio1.bio_offset = bio->bio_offset;
+	nbp->b_bio1.bio_caller_info1.ptr = bio;
+
+	/*
+	 * Start the first transfer
+	 */
+	if (vn_isdisk(vp, NULL))
+		chunksize = vp->v_rdev->si_bsize_phys;
+	else
+		chunksize = DEV_BSIZE;
+	chunksize = maxiosize / chunksize * chunksize;
+#if SPEC_CHAIN_DEBUG & 1
+	kprintf("spec_strategy chained I/O chunksize=%d\n", chunksize);
+#endif
+	nbp->b_cmd = bp->b_cmd;
+	nbp->b_bcount = chunksize;
+	nbp->b_bufsize = chunksize;	/* used to detect a short I/O */
+	nbp->b_bio1.bio_caller_info2.index = chunksize;
+
+#if SPEC_CHAIN_DEBUG & 1
+	kprintf("spec_strategy: chain %p offset %d/%d bcount %d\n",
+		bp, 0, bp->b_bcount, nbp->b_bcount);
+#endif
+
+	dev_dstrategy(vp->v_rdev, &nbp->b_bio1);
 	return (0);
 }
 
 /*
+ * Chunked up transfer completion routine - chain transfers until done
+ */
+static
+void
+spec_strategy_done(struct bio *nbio)
+{
+	struct buf *nbp = nbio->bio_buf;
+	struct bio *bio = nbio->bio_caller_info1.ptr;	/* original bio */
+	struct buf *bp = bio->bio_buf;			/* original bp */
+	int chunksize = nbio->bio_caller_info2.index;	/* chunking */
+	int boffset = nbp->b_data - bp->b_data;
+
+	if (nbp->b_flags & B_ERROR) {
+		/*
+		 * An error terminates the chain, propagate the error back
+		 * to the original bp
+		 */
+		bp->b_flags |= B_ERROR;
+		bp->b_error = nbp->b_error;
+		bp->b_resid = bp->b_bcount - boffset +
+			      (nbp->b_bcount - nbp->b_resid);
+#if SPEC_CHAIN_DEBUG & 1
+		kprintf("spec_strategy: chain %p error %d bcount %d/%d\n",
+			bp, bp->b_error, bp->b_bcount,
+			bp->b_bcount - bp->b_resid);
+#endif
+		kfree(nbp, M_DEVBUF);
+		biodone(bio);
+	} else if (nbp->b_resid) {
+		/*
+		 * A short read or write terminates the chain
+		 */
+		bp->b_error = nbp->b_error;
+		bp->b_resid = bp->b_bcount - boffset +
+			      (nbp->b_bcount - nbp->b_resid);
+#if SPEC_CHAIN_DEBUG & 1
+		kprintf("spec_strategy: chain %p short read(1) bcount %d/%d\n",
+			bp, bp->b_bcount - bp->b_resid, bp->b_bcount);
+#endif
+		kfree(nbp, M_DEVBUF);
+		biodone(bio);
+	} else if (nbp->b_bcount != nbp->b_bufsize) {
+		/*
+		 * A short read or write can also occur if the device truncates b_bcount
+		 */
+#if SPEC_CHAIN_DEBUG & 1
+		kprintf("spec_strategy: chain %p short read(2) bcount %d/%d\n",
+			bp, nbp->b_bcount + boffset, bp->b_bcount);
+#endif
+		bp->b_error = 0;
+		bp->b_bcount = nbp->b_bcount + boffset; 
+		bp->b_resid = nbp->b_resid;
+		kfree(nbp, M_DEVBUF);
+		biodone(bio);
+	} else if (nbp->b_bcount + boffset == bp->b_bcount) {
+		/*
+		 * No more data terminates the chain
+		 */
+#if SPEC_CHAIN_DEBUG & 1
+		kprintf("spec_strategy: chain %p finished bcount %d\n",
+			bp, bp->b_bcount);
+#endif
+		bp->b_error = 0;
+		bp->b_resid = 0;
+		kfree(nbp, M_DEVBUF);
+		biodone(bio);
+	} else {
+		/*
+		 * Continue the chain
+		 */
+		boffset += nbp->b_bcount;
+		nbp->b_data = bp->b_data + boffset;
+		nbp->b_bcount = bp->b_bcount - boffset;
+		if (nbp->b_bcount > chunksize)
+			nbp->b_bcount = chunksize;
+		nbp->b_bio1.bio_done = spec_strategy_done;
+		nbp->b_bio1.bio_offset = bio->bio_offset + boffset;
+
+#if SPEC_CHAIN_DEBUG & 1
+		kprintf("spec_strategy: chain %p offset %d/%d bcount %d\n",
+			bp, boffset, bp->b_bcount, nbp->b_bcount);
+#endif
+
+		dev_dstrategy(nbp->b_vp->v_rdev, &nbp->b_bio1);
+	}
+}
+
+/*
  * spec_freeblks(struct vnode *a_vp, daddr_t a_addr, daddr_t a_length)
  */
 static int
Index: vm/vm_swap.c
===================================================================
RCS file: /cvs/src/sys/vm/vm_swap.c,v
retrieving revision 1.35
diff -u -p -r1.35 vm_swap.c
--- vm/vm_swap.c	15 May 2007 22:44:21 -0000	1.35
+++ vm/vm_swap.c	19 Jul 2007 02:47:43 -0000
@@ -142,8 +142,9 @@ 	 * Issue a strategy call on the appropr
 	 * bp->b_vp is not modified.  Strategy code is always supposed to
 	 * use the passed vp.
 	 *
-	 * XXX do a dev_dstrategy() call on sp->sw_device instead of on
-	 * sp->sw_vp ?
+	 * We have to use vn_strategy() here even if we know we have a
+	 * device in order to properly break up requests which exceed the
+	 * device's DMA limits.
 	 */
 	vn_strategy(sp->sw_vp, nbio);
 	return 0;