GSOC: Device mapper mirror target

Sun Apr 10 15:11:54 PDT 2011

>> Each of the write bufs could be totally written, partially written, or
>> not written at all to each of the disks. More importantly, each disk
>> could have seen and completed (or not completed) the requests in a
>> different order. And this reorder can happen after the buf has been
>> declared done and biodone() has been called (and we've reported
>> success to userland). This could be because of reordering or
>> coalescing and the drive controller or the drive, for example.

> On thing I learned from writing the flash block driver: never biodone
> i/o that is not really finished. You can do it, but its a PITA to handle.
> Also, its the simplest to do write syncronously, we can report success
> when the fastest write is done or wait for the slower.

Just to be clear, I am not advocating calling biodone() when the request is
issued. I'm advocating calling biodone() when the backing device completes
the request. In the case of writes, this is not when data has hit the disk
platter; there are WB caches in drives that buffer writes.

Concretely:
/*
 * dmirror_strategy()
 *
 *   Initiate I/O on a dmirror VNODE.
 *
 *   READ:  disk_issue_read -> disk_read_bio_done -> (disk_issue_read)
 *
 *   The read-path uses push_bio to get a new BIO structure linked to
 *   the BUF and ties the new BIO to the disk and mirror it is issued
 *   on behalf of. The callback is set to disk_read_bio_done.
 *   In disk_read_bio_done, if the request succeeded, biodone() is called;
 *   if the request failed, the BIO is reinitialized with a new disk
 *   in the mirror and reissued till we get a success or run out of disks.
 *
 *   WRITE: disk_issue_write -> disk_write_bio_done(..) -> disk_write_tx_done
 *	
 *   The write path allocates a write group and transaction structures for
 *   each backing disc. It then sets up each transaction and issues them
 *   to the backing devices. When all of the devices have reported in,
 *   disk_write_tx_done finalizes the original BIO and deallocates the
 *   write group.
 */
static int
dmirror_strategy(dm_table_entry_t *table_en, struct buf *bp)
{
	struct bio *bio;
	struct dmirror_pdev *pdev;
	struct dmirror_dev *dmcfg;
	struct dmirror_write_group *wg;
	struct dmirror_write_tx *tx;
	int ndevs;
	int i;	

	dmcfg = table_en->target_config;

	switch (bp->b_cmd) {
	case (BUF_CMD_READ):
		pdev = dmirror_read_pick(dmcfg, &bp->b_bio1);	
		if (pdev == NULL) {
			bp->b_flags |= B_ERROR | B_INVAL;
			bp->b_resid = 0;
			biodone(&bp->b_bio1);
			break;
		}

		disk_issue_read(dmcfg, pdev, &bp->b_bio1);
		break;
	case (BUF_CMD_WRITE):
		/* Allocate a write group */
		wg = kmalloc(sizeof(struct dmirror_write_group), M_DMIRROR_WG,
			     M_WAITOK);
		dmirror_wg_ctor(dmcfg, &bp->b_bio1, wg);

		/* Allocate one transaction per device */
		for (i = 0; i < ndevs; i++) {
			tx = kmalloc(sizeof(struct dmirror_write_tx),
					M_DMIRROR_TX, M_WAITOK);
			dmirror_write_tx_ctor(wg, tx);

		}	

		/* Setup transactions */

		/* Issue requests */
		// do some stuff, then call vn_strategy() on each tx->bio

		break;
	case (BUF_CMD_FLUSH):
	default:
		break;
	}
}

static void
dmirror_write_tx_ctor(struct dmirror_write_group *wg,
		      struct dmirror_write_tx *write_tx,
		      struct dmirror_pdev *pdev)
{
	write_tx->write_group = wg;
	write_tx->state = DMIRROR_START;
	initbufbio(&write_tx->bio);
        write_tx->bio->bio_caller_info1.ptr = pdev;
        write_tx->bio->bio_done = disk_write_bio_done;
}

/* This is called in an interrrupt thread, when the drive has
 * completed an I/O; the I/O may not be stable on disk, but as
 * far as we are concerned, the I/O is done. We only call biodone()
 * when all disks have checked in. */
static void
disk_write_bio_done(struct bio *bio)
{
	struct dmirror_write_tx *write_tx;
	struct dmirror_pdev *pdev;
	struct bio *obio;

	pdev = bio->bio_caller_info1.ptr;
	write_tx = bio->bio_caller_info2.ptr;

	if (bio->bio_buf->bp_flags & B_ERROR) {
		// do some stuff.
	}

	switch (write_tx->state) {
	case (DMIRROR_JOURNAL_WRITE):
		write_tx->state = DMIRROR_JOURNAL_FLUSH;
		disk_issue_flush(bio);
		break;
	case (DMIRROR_JOURNAL_FLUSH):
		write_tx->state = DMIRROR_DATA_WRITE;
		break;
	case (DMIRROR_DATA_WRITE):
		write_tx->state = DMIRROR_DATA_FLUSH;
		disk_issue_flush(bio);
		break;
	case (DMIRROR_DATA_FLUSH):
		write_tx->state = DMIRROR_SUPER_WRITE;
		break;
	case (DMIRROR_SUPER_WRITE):
		write_tx->state = DMIRROR_SUPER_FLUSH;
		disk_issue_flush(bio);
		break;
	case (DMIRROR_SUPER_FLUSH):
		disk_write_tx_done(write_tx);
		break;
	default:
		break;
	}
}

static void
disk_write_tx_done(struct dmirror_write_tx *write_tx)
{
	struct dmirror_write_group *wg;
	struct bio *obio;
	int nfree_wg;

	wg = write_tx->write_group;

	do {
		lockmgr(&wg->lk, LK_EXCLUSIVE);

		nfree_wg = kref_dec(&wg->ref, dmirror_wg_dtor, wg, NULL);
		if (nfree_wg == 0)
			break;

		lockmgr(&wg->lk, LK_RELEASE);
	} while (0);

	wg = NULL;

	dmirror_write_tx_dtor(write_tx);
}

void
dmirror_wg_dtor(struct dmirror_write_group *wg)
{
        struct bio *obio;
        enum dmirror_wg_state state;

        obio = wg->obio;
        state = wg->state;

        lockmgr(&wg->lk, LK_RELEASE);

        lockuninit(&wg->lk);

        kfree(wg, M_DMIRROR_WG);

        biodone(obio);
}

If you want to only biodone() when the i/o has hit platter, you will need
to flush caches after ever write. Performance will me miserable. And you can
do better in terms of reliability.

I'm replying to the comments from the Google Melange page here:

>>> Adam Hoka wrote:
>>> "In case of a write, when the chunk is in sync, the write goes to the
>>> primary and the sync job gets added to mirror thread's queue and we wake
>>> the thread, we also increment a pending synchronization counter."

>> On April 8, 2011, 3:26p Venkatesh Srinivas wrote:
>> Wait, if I read your proposal correctly, you will declare a write as
>> done when it hits the primary mirror alone? Interesting...

> On April 9, 2011, 11:57a Adam Hoka write:
> Do you think that data safety should be allways before performance? It
> could be synchronous of course. Can you explain your concernes?

I don't like a design like this:

mirror_strategy(struct buf *bp) {
	switch(bp->cmd) {
	case (READ):
		vn_strategy(bp, disk0);
	case (WRITE):
		vn_strategy(bp, disk0);
		bp_copy := bufcopy(bp);
		workq_queue(vn_strategy, bp_copy, bp_copy, disk1);
	}
	biodone(bp);
}

A design like this leaves many windows for degraded states; at any point,
a failure of the primary disk leaves you with an unsynchronized secondary
disk, even though we'd returned success to the upper FS.

A degraded array is not _just_ a robustness concern; in a degraded array,
without tracking the outstanding I/O requests for sync, you can't serve
reads from other disks.

Also, eew.

-- vs