[DRBD-user] DRBD and Direct Block IO [SOLVED]

Ross S. W. Walker rwalker at medallion.com
Mon Jan 22 04:27:43 CET 2007

Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.


Fixed the performance issue. Since drbd 0.7.X processes 4K at a time, I
was sending only 4K (the max_sectors size) at a time and waiting for each
request to finish, so I wasn't using the queue to my advantage and was
seriously degrading performance. By issuing all requests for the full I/O
onto the queue and then kicking the queue I was able to get acceptable
performance, though still quite a bit less than LVM or SD.

I had to make significant changes for this, but I even get higher
throughput on large blocks to LVM and SD now so it was worth it.

-Ross
 

> -----Original Message-----
> From: drbd-user-bounces at lists.linbit.com 
> [mailto:drbd-user-bounces at lists.linbit.com] On Behalf Of Ross 
> S. W. Walker
> Sent: Thursday, January 11, 2007 7:48 PM
> To: drbd-user at lists.linbit.com
> Subject: [DRBD-user] DRBD and Direct Block IO
> 
> 
> I am having some serious performance issues with a kernel module for
> iSCSI Enterprise Target and drbd 0.7.21 on CentOS 4.4.
> 
> The kernel module (source code listed below) basically takes a data
> request issued over iSCSI and translates it into a bio request that is
> then carried out synchronously against the device below.
> 
> When run against an MD/LVM/SD device I don't have any problems and get
> the performance I would expect, but when run against a drbd 0.7.21
> device it chokes down hard. For example, when doing sequential 64K block
> writes directly to the device I can get 112 MB/s sustained; when I have
> drbd in the middle, that throughput drops to 10 MB/s.
> 
> Can anybody help explain this, or point out a serious flaw in the code
> below that would cause it? I would rather solve the problem (if it can
> be solved) than try to run version 8 beta in production (if that would
> even solve my problem).
> 
> Thanks,
> 
> ---------------- block-io.c
> /*
>  * Target device block I/O.
>  *
>  * Based on file I/O driver from FUJITA Tomonori
>  * (C) 2004 - 2005 FUJITA Tomonori <tomof at acm.org>
>  * (C) 2006 Andre Brinkmann <brinkman at hni.upb.de>
>  * This code is licenced under the GPL.
>  */
> 
> #include <linux/blkdev.h>
> #include <linux/writeback.h>
> #include <linux/parser.h>
> 
> #include <linux/blkdev.h>
> #include <linux/buffer_head.h>
> #include <linux/version.h>
> #include <linux/kernel.h>
> #include <linux/proc_fs.h>
> #include <linux/genhd.h>
> #include <linux/fs.h>
> #include <linux/stat.h>
> #include <linux/ctype.h>
> #include <linux/delay.h>
> 
> #include "iscsi.h"
> #include "iscsi_dbg.h"
> #include "iotype.h"
> 
> /* Per-volume private state of the blockio backend (lu->private). */
> struct blockio_data {
> 	/* copy of the path string handed to open_path() */
> 	char *path;
> 	/* block device the bios are submitted to */
> 	struct block_device *device;
> 	/* open file on path, closed again on detach */
> 	struct file *filp;
> 	/* backing_dev_info values saved at attach, restored at detach */
> 	unsigned long old_ra_pages;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 	unsigned int old_capabilities;
> #else
> 	int old_memory_backed;
> #endif
> };
> 
> /*
>  * Completion callback for the bios built by blockio_make_request().
>  *
>  * The block layer may call a bio's end_io handler more than once while
>  * the bio completes piecewise; bio->bi_size then holds the number of
>  * bytes still outstanding.  The waiter is woken and the bio reference
>  * dropped only on the final call (bi_size == 0); doing so earlier would
>  * let the submitter continue too soon and leave later calls working on
>  * a released bio.
>  *
>  * bio->bi_private is expected to point at the struct completion the
>  * submitter sleeps on; it may be NULL.
>  *
>  * Returns 1 while the bio is still in flight and 0 once it is done.
>  */
> static int
> blockio_bio_endio(struct bio *bio, unsigned int bytes_done, int error)
> {
> 	struct completion *wait = bio->bi_private;
> 
> 	if (error)
> 		eprintk("I/O error: Error %d occured \n", error);
> 
> 	/* partial completion, further calls follow for this bio */
> 	if (bio->bi_size)
> 		return 1;
> 
> 	if (wait)
> 		complete(wait);
> 
> 	/* release bio structure */
> 	bio_put(bio);
> 
> 	return 0;
> }
> 
> /**
>  * blockio_make_request(): translate an iSCSI request into one or more
>  * bios for the backing block device and wait for each one to finish.
>  *
>  * @lu:  volume; lu->private must be a blockio_data with a device set
>  * @tio: request data: page vector (pvec, pg_cnt), index of the first
>  *       page (idx), byte offset into the first page and total size
>  * @rw:  request flags handed to submit_bio(); BIO_RW_SYNC is added
>  *
>  * Returns 0 on success, -ENOMEM if a bio cannot be allocated and -EIO
>  * if a page cannot be added to a bio or the device reports an error.
>  **/
> int
> blockio_make_request(struct iet_volume *lu, struct tio *tio, int rw)
> {
> 	struct blockio_data *p;
> 	struct block_device *target_device;
> 	struct request_queue *target_queue;
> 	struct bio *target_bio;
> 	struct page *page;
> 	int pg_number;		/* pages put into one bio */
> 	int page_count = 0;	/* pages consumed from tio->pvec */
> 	int counter;		/* pages not yet submitted */
> 	int uptodate;
> 	int i;
> 	u32 offset, size, len;
> 	u32 bio_bytes;		/* payload of the current bio */
> 	loff_t ppos;
> 	ssize_t ret;
> 	DECLARE_COMPLETION(work);
> 
> 	p = (struct blockio_data *) lu->private;
> 	assert(p);
> 
> 	target_device = p->device;
> 	assert(target_device);
> 
> 	size = tio->size;
> 	offset = tio->offset;
> 	ppos = ((loff_t) tio->idx << PAGE_SHIFT) + offset;
> 
> 	/* All IO is to be synchronous */
> 	rw |= (1 << BIO_RW_SYNC);
> 
> 	/*
> 	 * Limit the pages per bio to what the target queue accepts in a
> 	 * single request.  Never go below one page: a count of zero
> 	 * would make the submit loop below spin without progress.
> 	 */
> 	pg_number = tio->pg_cnt;
> 	target_queue = target_device->bd_disk->queue;
> 	if (target_queue && target_queue->max_sectors > 0) {
> 		int max_pages;
> 
> 		max_pages = (target_queue->max_sectors
> 			     << SECTOR_SIZE_BITS) >> PAGE_SHIFT;
> 		if (max_pages < 1)
> 			max_pages = 1;
> 		if (pg_number > max_pages)
> 			pg_number = max_pages;
> 	}
> 
> 	counter = tio->pg_cnt;
> 	while (counter > 0) {
> 		if (pg_number > counter)
> 			pg_number = counter;
> 
> 		/* get new bio-structure */
> 		target_bio = bio_alloc(GFP_NOIO, pg_number);
> 		if (!target_bio) {
> 			eprintk("I/O error:  %d\n", page_count);
> 			return -ENOMEM;
> 		}
> 
> 		/* Initialize bio */
> 		target_bio->bi_sector = ppos >> SECTOR_SIZE_BITS;
> 		target_bio->bi_bdev = target_device;
> 		target_bio->bi_rw = rw;
> 		target_bio->bi_end_io =
> 			(bio_end_io_t *) blockio_bio_endio;
> 		target_bio->bi_private = &work;
> 
> 		bio_bytes = 0;
> 		for (i = 0; i < pg_number; i++) {
> 			page = tio->pvec[page_count];
> 			assert(page);
> 
> 			/* calc access length for this page */
> 			len = PAGE_SIZE - offset;
> 			if (size < len)
> 				len = size;
> 
> 			/* bio_add_page returns len if successful */
> 			ret = bio_add_page(target_bio, page, len, offset);
> 			if (!ret) {
> 				eprintk("I/O error:  %ld\n", (long) ret);
> 				/* never submitted, drop it here */
> 				bio_put(target_bio);
> 				return -EIO;
> 			}
> 
> 			/* offset valid only once */
> 			offset = 0;
> 			size -= len;
> 			bio_bytes += len;
> 			page_count++;
> 		}
> 
> 		counter -= pg_number;
> 		/*
> 		 * Advance by the bytes actually queued; the first page
> 		 * contributes less than PAGE_SIZE when it starts at a
> 		 * non-zero offset.
> 		 */
> 		ppos += bio_bytes;
> 
> 		/*
> 		 * The end_io handler drops the allocation reference, so
> 		 * hold one more to inspect the result after completion.
> 		 */
> 		bio_get(target_bio);
> 
> 		/* send bio to generic_make_request */
> 		submit_bio(rw, target_bio);
> 		wait_for_completion(&work);
> 
> 		uptodate = test_bit(BIO_UPTODATE, &target_bio->bi_flags);
> 		bio_put(target_bio);
> 		if (!uptodate)
> 			return -EIO;
> 	}
> 	assert(!size);
> 
> 	return 0;
> }
> 
> /*
>  * Remember @path in the volume's blockio_data and open it.
>  *
>  * The file is opened O_LARGEFILE | O_SYNC | O_DIRECT, read-only when
>  * LUReadonly(volume) is set and read-write otherwise.  If a path was
>  * already recorded (the option was given more than once) the earlier
>  * string and file are released first, so the last one wins without
>  * leaking the previous one.
>  *
>  * Returns 0 on success, -ENOMEM if the path cannot be copied, or the
>  * error reported by filp_open(); in the latter case filp stays NULL.
>  */
> static int
> open_path(struct iet_volume *volume, const char *path)
> {
> 	struct blockio_data *info =
> 		(struct blockio_data *) volume->private;
> 	size_t path_len = strlen(path);
> 	struct file *filp;
> 	mm_segment_t oldfs;
> 	int flags;
> 	int err = 0;
> 
> 	if (info->filp) {
> 		filp_close(info->filp, NULL);
> 		info->filp = NULL;
> 	}
> 	kfree(info->path);
> 
> 	info->path = kmalloc(path_len + 1, GFP_KERNEL);
> 	if (!info->path)
> 		return -ENOMEM;
> 	/* copies the terminating NUL as well */
> 	memcpy(info->path, path, path_len + 1);
> 
> 	flags = (LUReadonly(volume) ? O_RDONLY : O_RDWR)
> 		| O_LARGEFILE | O_SYNC | O_DIRECT;
> 
> 	oldfs = get_fs();
> 	set_fs(get_ds());
> 	filp = filp_open(path, flags, 0);
> 	set_fs(oldfs);
> 
> 	if (IS_ERR(filp)) {
> 		err = PTR_ERR(filp);
> 		eprintk("Can't open %s %d\n", path, err);
> 		return err;
> 	}
> 
> 	info->filp = filp;
> 	return 0;
> }
> 
> /*
>  * Copy a user supplied SCSI id into volume->scsi_id, directly behind
>  * the VENDOR_ID_LEN bytes reserved for the vendor prefix.
>  *
>  * Returns -EINVAL if @id is longer than SCSI_ID_LEN - VENDOR_ID_LEN,
>  * 0 otherwise.
>  */
> static int
> set_scsiid(struct iet_volume *volume, const char *id)
> {
> 	size_t id_len = strlen(id);
> 	size_t room = sizeof(volume->scsi_id) - VENDOR_ID_LEN;
> 
> 	if (id_len > SCSI_ID_LEN - VENDOR_ID_LEN) {
> 		eprintk("too long SCSI ID %lu\n", (unsigned long) id_len);
> 		return -EINVAL;
> 	}
> 
> 	/* never write past the end of the scsi_id array */
> 	if (id_len > room)
> 		id_len = room;
> 	memcpy(volume->scsi_id + VENDOR_ID_LEN, id, id_len);
> 
> 	return 0;
> }
> 
> /*
>  * Write the vendor prefix into volume->scsi_id and, unless some byte
>  * behind the prefix is already non-zero, fill the four 32-bit words
>  * that follow it with the target type, the target id, the inode
>  * number and the device number of the inode's superblock.
>  */
> static void
> gen_scsiid(struct iet_volume *volume, struct inode *inode)
> {
> 	u32 *words;
> 	int pos = VENDOR_ID_LEN;
> 
> 	strlcpy(volume->scsi_id, VENDOR_ID, VENDOR_ID_LEN);
> 
> 	/* leave an id alone that has already been filled in */
> 	while (pos < SCSI_ID_LEN) {
> 		if (volume->scsi_id[pos])
> 			return;
> 		pos++;
> 	}
> 
> 	words = (u32 *) (volume->scsi_id + VENDOR_ID_LEN);
> 	words[0] = volume->target->trgt_param.target_type;
> 	words[1] = volume->target->tid;
> 	words[2] = (unsigned int) inode->i_ino;
> 	words[3] = (unsigned int) inode->i_sb->s_dev;
> }
> 
> /*
>  * Copy a user supplied SCSI serial number into volume->scsi_sn.
>  *
>  * Returns -EINVAL if @sn is longer than SCSI_SN_LEN, 0 otherwise.
>  * The terminating NUL of @sn is not copied.
>  */
> static int
> set_scsisn(struct iet_volume *volume, const char *sn)
> {
> 	size_t sn_len = strlen(sn);
> 
> 	if (sn_len > SCSI_SN_LEN) {
> 		eprintk("too long SCSI SN %lu\n", (unsigned long) sn_len);
> 		return -EINVAL;
> 	}
> 
> 	memcpy(volume->scsi_sn, sn, sn_len);
> 
> 	return 0;
> }
> 
> /* Option identifiers returned by match_token() for the table below. */
> enum
> {
> 	Opt_scsiid,
> 	Opt_scsisn,
> 	Opt_path,
> 	Opt_ignore,
> 	Opt_err,
> };
> 
> /*
>  * Patterns accepted in the volume parameter string.  Type= and IOMode=
>  * are recognised but not acted upon by this backend.
>  */
> static match_table_t tokens = {
> 	{ Opt_scsiid, "ScsiId=%s" },
> 	{ Opt_scsisn, "ScsiSN=%s" },
> 	{ Opt_path,   "Path=%s"   },
> 	{ Opt_ignore, "Type=%s"   },
> 	{ Opt_ignore, "IOMode=%s" },
> 	{ Opt_err,    NULL        },
> };
> 
> /*
>  * Walk the comma separated option string @params and apply every
>  * option to @volume.  Empty items and ignored options are skipped.
>  *
>  * Returns 0 on success, -ENOMEM if an option value cannot be
>  * duplicated, -EINVAL for an unknown option, or the negative error of
>  * the handler that rejected its value.  Parsing stops at the first
>  * error.  @params is modified by strsep().
>  */
> static int
> parse_blockio_params(struct iet_volume *volume, char *params)
> {
> 	int (*apply)(struct iet_volume *, const char *);
> 	char *opt, *value;
> 	int err = 0;
> 
> 	for (opt = strsep(&params, ","); opt != NULL;
> 	     opt = strsep(&params, ",")) {
> 		substring_t args[MAX_OPT_ARGS];
> 
> 		if (!*opt)
> 			continue;
> 
> 		/* pick the handler that consumes this option's value */
> 		switch (match_token(opt, tokens, args)) {
> 		case Opt_scsiid:
> 			apply = set_scsiid;
> 			break;
> 		case Opt_scsisn:
> 			apply = set_scsisn;
> 			break;
> 		case Opt_path:
> 			apply = open_path;
> 			break;
> 		case Opt_ignore:
> 			continue;
> 		default:
> 			eprintk("Unknown %s\n", opt);
> 			return -EINVAL;
> 		}
> 
> 		value = match_strdup(&args[0]);
> 		if (!value)
> 			return -ENOMEM;
> 
> 		err = apply(volume, value);
> 		kfree(value);
> 		if (err < 0)
> 			return err;
> 	}
> 
> 	return err;
> }
> 
> /*
>  * Undo blockio_attach(): restore the backing_dev_info settings saved
>  * at attach time, close the backing file and free the private data.
>  *
>  * Safe to call on a partially set up volume: the settings are only
>  * restored when a device was recorded, because only then were they
>  * saved and changed, and a missing private structure is a no-op.
>  */
> static void
> blockio_detach(struct iet_volume *lu)
> {
> 	struct blockio_data *p = (struct blockio_data *) lu->private;
> 	struct backing_dev_info *bdi;
> 
> 	if (!p)
> 		return;
> 
> 	if (p->device) {
> 		bdi = p->device->bd_inode->i_mapping->backing_dev_info;
> 
> 		bdi->ra_pages = p->old_ra_pages;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 		bdi->capabilities = p->old_capabilities;
> #else
> 		bdi->memory_backed = p->old_memory_backed;
> #endif
> 	}
> 
> 	kfree(p->path);
> 	if (p->filp)
> 		filp_close(p->filp, NULL);
> 	kfree(p);
> 	lu->private = NULL;
> }
> 
> /*
>  * Attach a block device to the volume @lu.
>  *
>  * Allocates the private data, parses the option string @args, checks
>  * that the opened path is a block device, saves the device's current
>  * backing_dev_info settings and then switches read-ahead off and
>  * disables dirty accounting / writeback (memory_backed before 2.6.12).
>  * Finally the volume's block shift and block count are set from the
>  * device size.
>  *
>  * Returns 0 on success, -EBUSY if the volume already has private data,
>  * -ENOMEM on allocation failure, -EINVAL if no path was opened or the
>  * path is not a block device, or the error from option parsing.  On
>  * failure everything allocated here is released again and lu->private
>  * is reset to NULL; the device settings have not been touched by then.
>  */
> static int
> blockio_attach(struct iet_volume *lu, char *args)
> {
> 	struct blockio_data *p;
> 	struct backing_dev_info *bdi;
> 	struct request_queue *queue;
> 	struct inode *inode;
> 	int err;
> 
> 	if (lu->private) {
> 		printk("already attached ? %d\n", lu->lun);
> 		return -EBUSY;
> 	}
> 
> 	p = kmalloc(sizeof(*p), GFP_KERNEL);
> 	if (!p)
> 		return -ENOMEM;
> 	memset(p, 0, sizeof(*p));
> 	lu->private = p;
> 
> 	err = parse_blockio_params(lu, args);
> 	if (err < 0) {
> 		eprintk("%d\n", err);
> 		goto fail;
> 	}
> 
> 	/* parsing succeeds without a Path= option, but filp stays NULL */
> 	if (!p->filp) {
> 		eprintk("no path given for lun %d\n", lu->lun);
> 		err = -EINVAL;
> 		goto fail;
> 	}
> 	inode = p->filp->f_dentry->d_inode;
> 
> 	gen_scsiid(lu, inode);
> 
> 	/* Only block devices are allowed here */
> 	if (!S_ISBLK(inode->i_mode)) {
> 		err = -EINVAL;
> 		goto fail;
> 	}
> 
> 	/* continue with the inode of the block device itself */
> 	p->device = inode->i_bdev;
> 	inode = p->device->bd_inode;
> 
> 	queue = p->device->bd_disk->queue;
> 	if (queue)
> 		printk(KERN_INFO "Max queue length: %d \n",
> 		       queue->max_sectors);
> 
> 	bdi = inode->i_mapping->backing_dev_info;
> 
> 	/* remember the current settings so detach can restore them */
> 	p->old_ra_pages = bdi->ra_pages;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 	p->old_capabilities = bdi->capabilities;
> #else
> 	p->old_memory_backed = bdi->memory_backed;
> #endif
> 
> 	bdi->ra_pages = 0;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 	bdi->capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK;
> #else
> 	bdi->memory_backed = 1;
> #endif
> 
> 	/* get sector size of the block device */
> 	lu->blk_shift = SECTOR_SIZE_BITS;
> 	lu->blk_cnt = inode->i_size >> lu->blk_shift;
> 
> 	return 0;
> 
>   fail:
> 	/* nothing on the device was changed yet, only free our state */
> 	if (p->filp)
> 		filp_close(p->filp, NULL);
> 	kfree(p->path);
> 	kfree(p);
> 	lu->private = NULL;
> 	return err;
> }
> 
> /* Print the path recorded for the volume to the seq_file @seq. */
> void
> blockio_show(struct iet_volume *lu, struct seq_file *seq)
> {
> 	struct blockio_data *data;
> 
> 	data = (struct blockio_data *) lu->private;
> 	seq_printf(seq, " path:%s\n", data->path);
> }
> 
> /* Operations table that exposes this backend under the name "blockio". */
> struct iotype blockio = {
> 	.name		= "blockio",
> 	.attach		= blockio_attach,
> 	.make_request	= blockio_make_request,
> 	.detach		= blockio_detach,
> 	.show		= blockio_show,
> };
> 
> ______________________________________________________________________
> This e-mail, and any attachments thereto, is intended only for use by
> the addressee(s) named herein and may contain legally privileged
> and/or confidential information. If you are not the intended recipient
> of this e-mail, you are hereby notified that any dissemination,
> distribution or copying of this e-mail, and any attachments thereto,
> is strictly prohibited. If you have received this e-mail in error,
> please immediately notify the sender and permanently delete the
> original and any copy or printout thereof.
> 
> _______________________________________________
> drbd-user mailing list
> drbd-user at lists.linbit.com
> http://lists.linbit.com/mailman/listinfo/drbd-user
> 

______________________________________________________________________
This e-mail, and any attachments thereto, is intended only for use by
the addressee(s) named herein and may contain legally privileged
and/or confidential information. If you are not the intended recipient
of this e-mail, you are hereby notified that any dissemination,
distribution or copying of this e-mail, and any attachments thereto,
is strictly prohibited. If you have received this e-mail in error,
please immediately notify the sender and permanently delete the
original and any copy or printout thereof.




More information about the drbd-user mailing list