Note: "permalinks" may not be as permanent as we would like;
direct links to old sources may well be a few messages off.
Fixed the performance issue. Since drbd 0.7.X processes 4K at a time, by only sending 4K, max_sector size, at a time and waiting I wasn't using the queue to my advantage and was seriously degrading performance. By issuing all requests onto the queue for the full io and then kicking the queue I was able to get acceptable performance, though still quite a bit less than LVM or SD. I had to make significant changes for this, but I even get higher throughput on large blocks to LVM and SD now so it was worth it. -Ross > -----Original Message----- > From: drbd-user-bounces at lists.linbit.com > [mailto:drbd-user-bounces at lists.linbit.com] On Behalf Of Ross > S. W. Walker > Sent: Thursday, January 11, 2007 7:48 PM > To: drbd-user at lists.linbit.com > Subject: [DRBD-user] DRBD and Direct Block IO > > > I am having some serious performance issues with a kernel module for > iSCSI Enterprise Target and drbd 0.7.21 on CentOS 4.4. > > The kernel module (source code listed below) basically takes a data > request issued over iSCSI translates it to a bio request that is then > carried out synchronously to the device below. > > When run against an MD/LVM/SD device I don't have any problems, I get > performance to be expected, but when run against a drbd > 0.7.21 device it > chokes down hard. For example when doing seq 64K block write direct to > device I can get 112 MB/s sustained, when I have drbd in the > middle that > throughput drops to 10 MB/s. > > Can anybody help explain this or point out a serious flaw in the code > below that would cause this. I would rather solve the problem (if it > can) than try to run version 8 beta in production (if it will > even solve > my problem). > > Thanks, > > ---------------- block-io.c > /* > * Target device block I/O. > * > * Based on file I/O driver from FUJITA Tomonori > * (C) 2004 - 2005 FUJITA Tomonori <tomof at acm.org> > * (C) 2006 Andre Brinkmann <brinkman at hni.upb.de> > * This code is licenced under the GPL. 
> */ > > #include <linux/blkdev.h> > #include <linux/writeback.h> > #include <linux/parser.h> > > #include <linux/blkdev.h> > #include <linux/buffer_head.h> > #include <linux/version.h> > #include <linux/kernel.h> > #include <linux/proc_fs.h> > #include <linux/genhd.h> > #include <linux/fs.h> > #include <linux/stat.h> > #include <linux/ctype.h> > #include <linux/delay.h> > > #include "iscsi.h" > #include "iscsi_dbg.h" > #include "iotype.h" > > struct blockio_data > { > char *path; > struct block_device *device; > struct file *filp; > unsigned long old_ra_pages; > #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)) > unsigned int old_capabilities; > #else > int old_memory_backed; > #endif > }; > > static void > blockio_bio_endio (struct bio *bio, unsigned int bytes_done, > int error) > { > > struct completion *wait = bio->bi_private; > > /* check if all bytes have been written */ > if (bio->bi_size) > eprintk ("I/O error %d Not all bytes written for > bio\n", > bio->bi_size); > > if (error) > eprintk ("I/O error: Error %d occured \n", error); > > if (bio->bi_private) { > wait = (struct completion *) bio->bi_private; > complete (wait); > } > > /* release bio structure */ > bio_put (bio); > > return; > } > > /** > * blockio_make_request(): The function translates an > iscsi-request into > > * a number of requests to the corresponding block device. 
> **/ > int > blockio_make_request (struct iet_volume *lu, struct tio *tio, int rw) > { > struct blockio_data *p; > struct block_device *target_device; > struct request_queue *target_queue; > struct bio *target_bio; > int max_sectors; > int pg_number; > int page_count; > int counter; > struct page *page; > mm_segment_t oldfs; > > u32 offset, size; > u32 len; > > loff_t ppos; > int i; > ssize_t ret; > DECLARE_COMPLETION (work); > > p = (struct blockio_data *) lu->private; > assert (p); > > target_device = p->device; > assert (target_device); > > size = tio->size; > offset = tio->offset; > > ppos = (loff_t) tio->idx << PAGE_SHIFT; > ppos += offset; > > /* All IO is to be synchronous */ > rw |= (1 << BIO_RW_SYNC); > > /* Get maximum number of sectors / pages that could be sent to > target > * block device within a single bio-structure */ > > target_queue = target_device->bd_disk->queue; > if (target_queue) { > max_sectors = target_queue->max_sectors; > if (max_sectors > 0) { > pg_number = (max_sectors << SECTOR_SIZE_BITS) >> > PAGE_SHIFT; > if (pg_number > tio->pg_cnt) > pg_number = tio->pg_cnt; > } > else > pg_number = tio->pg_cnt; > } > else { > max_sectors = 0; > pg_number = tio->pg_cnt; > } > > page_count = 0; > counter = tio->pg_cnt; > > while (counter > 0) { > /* get new bio-structure */ > target_bio = bio_alloc (GFP_NOIO, pg_number); > if (!target_bio) { > eprintk ("I/O error: %d\n", page_count); > return -ENOMEM; > } > > /* Initialize bio */ > target_bio->bi_sector = ppos >> SECTOR_SIZE_BITS; > target_bio->bi_bdev = target_device; > target_bio->bi_rw = rw; > target_bio->bi_end_io = (bio_end_io_t *) > blockio_bio_endio; > target_bio->bi_private = &work; > > for (i = 0; i < pg_number; i++) { > page = tio->pvec[page_count]; > assert (page); > > /* calc access length for this page */ > len = PAGE_SIZE; > if (offset) > len -= offset; > if (size < len) > len = size; > > /* bio_add_page returns len if successful */ > ret = bio_add_page (target_bio, page, len, > 
offset); > if (!ret) { > eprintk ("I/O error: %ld\n", (long) > ret); > return -EIO; > } > /* offset valid only once */ > offset = 0; > size -= len; > page_count++; > } > > counter -= pg_number; > ppos += (pg_number << PAGE_SHIFT); > > if (pg_number > counter) > pg_number = counter; > > oldfs = get_fs (); > set_fs (get_ds ()); > > /* send bio to generic_make_request */ > submit_bio (rw, target_bio); > > wait_for_completion (&work); > > set_fs (oldfs); > > } > assert (!size); > > return 0; > } > > static int > open_path (struct iet_volume *volume, const char *path) > { > int err = 0; > struct blockio_data *info = (struct blockio_data *) > volume->private; > struct file *filp; > mm_segment_t oldfs; > int flags; > > info->path = kmalloc (strlen (path) + 1, GFP_KERNEL); > if (!info->path) > return -ENOMEM; > strcpy (info->path, path); > info->path[strlen (path)] = '\0'; > > oldfs = get_fs (); > set_fs (get_ds ()); > flags = (LUReadonly (volume) ? O_RDONLY : O_RDWR) | O_LARGEFILE > | O_SYNC | O_DIRECT; > filp = filp_open (path, flags, 0); > set_fs (oldfs); > > if (IS_ERR (filp)) { > err = PTR_ERR (filp); > eprintk ("Can't open %s %d\n", path, err); > info->filp = NULL; > } > else > info->filp = filp; > > return err; > } > > static int > set_scsiid (struct iet_volume *volume, const char *id) > { > size_t len; > > if ((len = strlen (id)) > SCSI_ID_LEN - VENDOR_ID_LEN) { > eprintk ("too long SCSI ID %lu\n", (unsigned long) len); > return -EINVAL; > } > > len = min (sizeof (volume->scsi_id) - VENDOR_ID_LEN, len); > memcpy (volume->scsi_id + VENDOR_ID_LEN, id, len); > > return 0; > } > > static void > gen_scsiid (struct iet_volume *volume, struct inode *inode) > { > int i; > u32 *p; > > strlcpy (volume->scsi_id, VENDOR_ID, VENDOR_ID_LEN); > > for (i = VENDOR_ID_LEN; i < SCSI_ID_LEN; i++) > if (volume->scsi_id[i]) > return; > > p = (u32 *) (volume->scsi_id + VENDOR_ID_LEN); > *(p + 0) = volume->target->trgt_param.target_type; > *(p + 1) = volume->target->tid; > *(p + 2) = 
(unsigned int) inode->i_ino; > *(p + 3) = (unsigned int) inode->i_sb->s_dev; > } > > static int > set_scsisn(struct iet_volume *volume, const char *sn) > { > size_t len; > > if ((len = strlen(sn)) > SCSI_SN_LEN) { > eprintk("too long SCSI SN %lu\n", (unsigned long) len); > return -EINVAL; > } > memcpy(volume->scsi_sn, sn, len); > return 0; > } > > enum > { > Opt_scsiid, Opt_scsisn, Opt_path, Opt_ignore, Opt_err, > }; > > static match_table_t tokens = { > {Opt_scsiid, "ScsiId=%s"}, > {Opt_scsisn, "ScsiSN=%s"}, > {Opt_path, "Path=%s"}, > {Opt_ignore, "Type=%s"}, > {Opt_ignore, "IOMode=%s"}, > {Opt_err, NULL}, > }; > > static int > parse_blockio_params (struct iet_volume *volume, char *params) > { > int err = 0; > char *p, *q; > > while ((p = strsep (&params, ",")) != NULL) { > substring_t args[MAX_OPT_ARGS]; > int token; > if (!*p) > continue; > token = match_token (p, tokens, args); > switch (token) { > case Opt_scsiid: > if (!(q = match_strdup (&args[0]))) { > err = -ENOMEM; > goto out; > } > err = set_scsiid (volume, q); > kfree (q); > if (err < 0) > goto out; > break; > case Opt_scsisn: > if (!(q = match_strdup(&args[0]))) { > err = -ENOMEM; > goto out; > } > err = set_scsisn(volume, q); > kfree(q); > if (err < 0) > goto out; > break; > case Opt_path: > if (!(q = match_strdup (&args[0]))) { > err = -ENOMEM; > goto out; > } > err = open_path (volume, q); > kfree (q); > if (err < 0) > goto out; > break; > case Opt_ignore: > break; > default: > eprintk ("Unknown %s\n", p); > return -EINVAL; > } > } > > out: > return err; > } > > static void > blockio_detach (struct iet_volume *lu) > { > struct inode *inode; > struct blockio_data *p = (struct blockio_data *) lu->private; > > inode = p->device->bd_inode; > > inode->i_mapping->backing_dev_info->ra_pages = p->old_ra_pages; > #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)) > inode->i_mapping->backing_dev_info->capabilities = > p->old_capabilities; > #else > inode->i_mapping->backing_dev_info->memory_backed = > 
p->old_memory_backed; > #endif > > kfree (p->path); > if (p->filp) > filp_close (p->filp, NULL); > kfree (p); > lu->private = NULL; > } > > static int > blockio_attach (struct iet_volume *lu, char *args) > { > int err = 0; > struct blockio_data *p; > struct inode *inode; > > if (lu->private) { > printk ("already attached ? %d\n", lu->lun); > return -EBUSY; > } > > if (!(p = kmalloc (sizeof (*p), GFP_KERNEL))) > return -ENOMEM; > memset (p, 0, sizeof (*p)); > lu->private = p; > > if ((err = parse_blockio_params (lu, args)) < 0) { > eprintk ("%d\n", err); > goto out; > } > inode = p->filp->f_dentry->d_inode; > > gen_scsiid (lu, inode); > > /* Only block devices are allowed here */ > if (S_ISBLK (inode->i_mode)) { > inode = inode->i_bdev->bd_inode; > p->device = inode->i_bdev; > printk (KERN_INFO "Max queue length: %d \n", > p->device->bd_disk->queue->max_sectors); > } > else { > err = -EINVAL; > goto out; > } > > p->old_ra_pages = inode->i_mapping->backing_dev_info->ra_pages; > #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)) > p->old_capabilities = > inode->i_mapping->backing_dev_info->capabilities; > #else > p->old_memory_backed = > inode->i_mapping->backing_dev_info->memory_backed; > #endif > > inode->i_mapping->backing_dev_info->ra_pages = 0; > #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)) > inode->i_mapping->backing_dev_info->capabilities = > BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK; > #else > inode->i_mapping->backing_dev_info->memory_backed = 1; > #endif > > /* get sector size of the block device */ > lu->blk_shift = SECTOR_SIZE_BITS; > lu->blk_cnt = inode->i_size >> lu->blk_shift; > > out: > if (err < 0) > blockio_detach (lu); > return err; > } > > void > blockio_show (struct iet_volume *lu, struct seq_file *seq) > { > struct blockio_data *p = (struct blockio_data *) lu->private; > seq_printf (seq, " path:%s\n", p->path); > } > > struct iotype blockio = { > .name = "blockio", > .attach = blockio_attach, > .make_request = blockio_make_request, > 
.detach = blockio_detach, > .show = blockio_show, > }; > > ______________________________________________________________________ > This e-mail, and any attachments thereto, is intended only for use by > the addressee(s) named herein and may contain legally privileged > and/or confidential information. If you are not the intended recipient > of this e-mail, you are hereby notified that any dissemination, > distribution or copying of this e-mail, and any attachments thereto, > is strictly prohibited. If you have received this e-mail in error, > please immediately notify the sender and permanently delete the > original and any copy or printout thereof. > > _______________________________________________ > drbd-user mailing list > drbd-user at lists.linbit.com > http://lists.linbit.com/mailman/listinfo/drbd-user > ______________________________________________________________________ This e-mail, and any attachments thereto, is intended only for use by the addressee(s) named herein and may contain legally privileged and/or confidential information. If you are not the intended recipient of this e-mail, you are hereby notified that any dissemination, distribution or copying of this e-mail, and any attachments thereto, is strictly prohibited. If you have received this e-mail in error, please immediately notify the sender and permanently delete the original and any copy or printout thereof.