Note: "permalinks" may not be as permanent as we would like;
direct links to old sources may well be a few messages off.
Fixed the performance issue. Since drbd 0.7.X processes 4K at a time, by
only sending 4K (the max_sectors size) at a time and waiting, I wasn't using
the queue to my advantage and was seriously degrading performance. By
issuing all requests for the full I/O onto the queue and then kicking the
queue I was able to get acceptable performance, though still quite a bit
less than LVM or SD.
I had to make significant changes for this, but I even get higher
throughput on large blocks to LVM and SD now so it was worth it.
-Ross
> -----Original Message-----
> From: drbd-user-bounces at lists.linbit.com
> [mailto:drbd-user-bounces at lists.linbit.com] On Behalf Of Ross
> S. W. Walker
> Sent: Thursday, January 11, 2007 7:48 PM
> To: drbd-user at lists.linbit.com
> Subject: [DRBD-user] DRBD and Direct Block IO
>
>
> I am having some serious performance issues with a kernel module for
> iSCSI Enterprise Target and drbd 0.7.21 on CentOS 4.4.
>
> The kernel module (source code listed below) basically takes a data
> request issued over iSCSI and translates it into a bio request that is
> then carried out synchronously on the device below.
>
> When run against an MD/LVM/SD device I don't have any problems and I get
> the performance I would expect, but when run against a drbd 0.7.21 device
> it chokes down hard. For example, when doing sequential 64K block writes
> direct to the device I can get 112 MB/s sustained; when I have drbd in
> the middle that throughput drops to 10 MB/s.
>
> Can anybody help explain this or point out a serious flaw in the code
> below that would cause this? I would rather solve the problem (if it
> can be solved) than try to run version 8 beta in production (if it will
> even solve my problem).
>
> Thanks,
>
> ---------------- block-io.c
> /*
> * Target device block I/O.
> *
> * Based on file I/O driver from FUJITA Tomonori
> * (C) 2004 - 2005 FUJITA Tomonori <tomof at acm.org>
> * (C) 2006 Andre Brinkmann <brinkman at hni.upb.de>
> * This code is licenced under the GPL.
> */
>
> #include <linux/blkdev.h>
> #include <linux/writeback.h>
> #include <linux/parser.h>
>
> #include <linux/blkdev.h>
> #include <linux/buffer_head.h>
> #include <linux/version.h>
> #include <linux/kernel.h>
> #include <linux/proc_fs.h>
> #include <linux/genhd.h>
> #include <linux/fs.h>
> #include <linux/stat.h>
> #include <linux/ctype.h>
> #include <linux/delay.h>
>
> #include "iscsi.h"
> #include "iscsi_dbg.h"
> #include "iotype.h"
>
> /* Per-volume state stored in iet_volume->private by the blockio handler. */
> struct blockio_data {
> 	/* kmalloc'ed copy of the Path= argument (see open_path) */
> 	char *path;
> 	/* backing block device, filled in by blockio_attach */
> 	struct block_device *device;
> 	/* handle returned by filp_open(); NULL when the open failed */
> 	struct file *filp;
> 	/* backing_dev_info->ra_pages as found at attach, put back at detach */
> 	unsigned long old_ra_pages;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 	/* backing_dev_info->capabilities as found at attach */
> 	unsigned int old_capabilities;
> #else
> 	/* backing_dev_info->memory_backed as found at attach */
> 	int old_memory_backed;
> #endif
> };
>
> /*
>  * Completion callback installed as bi_end_io on every bio built by
>  * blockio_make_request().
>  *
>  * Logs a message when bi_size is still non-zero or when an error code is
>  * passed in, wakes the submitter through the completion stored in
>  * bi_private (if any) and drops the bio reference.
>  *
>  * @bio:        the bio being completed
>  * @bytes_done: not used here
>  * @error:      non-zero error code, only logged
>  *
>  * NOTE(review): the callback is installed through a cast to bio_end_io_t
>  * and completes/releases the bio even when bi_size is non-zero; confirm
>  * against the target kernel's end_io contract whether a partial completion
>  * should instead return early without touching the bio.
>  */
> static void
> blockio_bio_endio (struct bio *bio, unsigned int bytes_done, int error)
> {
> 	struct completion *wait = bio->bi_private;
>
> 	/* check if all bytes have been written */
> 	if (bio->bi_size)
> 		eprintk ("I/O error %d Not all bytes written for bio\n",
> 			 bio->bi_size);
>
> 	if (error)
> 		eprintk ("I/O error: Error %d occured \n", error);
>
> 	/* wake up the thread waiting in blockio_make_request() */
> 	if (wait)
> 		complete (wait);
>
> 	/* release bio structure */
> 	bio_put (bio);
> }
>
> /**
>  * blockio_make_request(): translate an iSCSI request into one or more bios
>  * and carry them out, one after the other, on the backing block device.
>  *
>  * @lu:  volume whose ->private points to the struct blockio_data
>  * @tio: request to execute; pvec/pg_cnt give the pages, idx and offset the
>  *       start position, size the number of bytes to transfer
>  * @rw:  direction flag handed on to submit_bio(); BIO_RW_SYNC is added here
>  *
>  * Every bio is submitted and waited for before the next one is built.
>  *
>  * Returns 0 once all bios have been submitted and completed, -ENOMEM when
>  * a bio cannot be allocated and -EIO when a page cannot be added to a bio.
>  *
>  * NOTE(review): an error reported to the completion callback is only
>  * logged there; it is not reflected in this function's return value.
>  **/
> int
> blockio_make_request (struct iet_volume *lu, struct tio *tio, int rw)
> {
> 	struct blockio_data *p;
> 	struct block_device *target_device;
> 	struct request_queue *target_queue;
> 	struct bio *target_bio;
> 	struct page *page;
> 	mm_segment_t oldfs;
> 	int max_sectors;
> 	int pg_number;
> 	int page_count;
> 	int counter;
> 	int i;
> 	u32 offset, size;
> 	u32 len;
> 	u32 bio_bytes;
> 	loff_t ppos;
> 	ssize_t ret;
> 	DECLARE_COMPLETION (work);
>
> 	p = (struct blockio_data *) lu->private;
> 	assert (p);
>
> 	target_device = p->device;
> 	assert (target_device);
>
> 	size = tio->size;
> 	offset = tio->offset;
>
> 	/* byte position on the device where the transfer starts */
> 	ppos = (loff_t) tio->idx << PAGE_SHIFT;
> 	ppos += offset;
>
> 	/* All IO is to be synchronous */
> 	rw |= (1 << BIO_RW_SYNC);
>
> 	/*
> 	 * Number of pages put into a single bio: all of them, unless the
> 	 * queue advertises a max_sectors limit that is smaller.
> 	 */
> 	pg_number = tio->pg_cnt;
> 	target_queue = target_device->bd_disk->queue;
> 	max_sectors = target_queue ? target_queue->max_sectors : 0;
> 	if (max_sectors > 0) {
> 		int limit = (max_sectors << SECTOR_SIZE_BITS) >> PAGE_SHIFT;
>
> 		/*
> 		 * A limit below one page would otherwise yield 0 pages per
> 		 * bio and the loop below would never make progress.
> 		 */
> 		if (limit < 1)
> 			limit = 1;
> 		if (limit < pg_number)
> 			pg_number = limit;
> 	}
>
> 	page_count = 0;
> 	counter = tio->pg_cnt;
>
> 	while (counter > 0) {
> 		/* get new bio-structure */
> 		target_bio = bio_alloc (GFP_NOIO, pg_number);
> 		if (!target_bio) {
> 			eprintk ("I/O error: %d\n", page_count);
> 			return -ENOMEM;
> 		}
>
> 		/* Initialize bio */
> 		target_bio->bi_sector = ppos >> SECTOR_SIZE_BITS;
> 		target_bio->bi_bdev = target_device;
> 		target_bio->bi_rw = rw;
> 		target_bio->bi_end_io = (bio_end_io_t *) blockio_bio_endio;
> 		target_bio->bi_private = &work;
>
> 		bio_bytes = 0;
> 		for (i = 0; i < pg_number; i++) {
> 			page = tio->pvec[page_count];
> 			assert (page);
>
> 			/* calc access length for this page */
> 			len = PAGE_SIZE - offset;
> 			if (size < len)
> 				len = size;
>
> 			/* bio_add_page returns len if successful */
> 			ret = bio_add_page (target_bio, page, len, offset);
> 			if (!ret) {
> 				eprintk ("I/O error: %ld\n", (long) ret);
> 				/* never submitted, so drop our reference */
> 				bio_put (target_bio);
> 				return -EIO;
> 			}
>
> 			/* offset valid only once */
> 			offset = 0;
> 			size -= len;
> 			bio_bytes += len;
> 			page_count++;
> 		}
>
> 		counter -= pg_number;
> 		/*
> 		 * Advance by the bytes actually queued: the first page may
> 		 * start at a non-zero offset, so a bio can carry less than
> 		 * pg_number full pages.
> 		 */
> 		ppos += bio_bytes;
>
> 		/* the last bio only takes the pages that are left */
> 		if (pg_number > counter)
> 			pg_number = counter;
>
> 		oldfs = get_fs ();
> 		set_fs (get_ds ());
>
> 		/* send bio to generic_make_request */
> 		submit_bio (rw, target_bio);
>
> 		/* blockio_bio_endio() completes 'work' and releases the bio */
> 		wait_for_completion (&work);
>
> 		set_fs (oldfs);
> 	}
> 	assert (!size);
>
> 	return 0;
> }
>
> /*
>  * Remember @path in the volume's blockio_data and open it.
>  *
>  * The file is opened O_LARGEFILE | O_SYNC | O_DIRECT, read-only when
>  * LUReadonly() holds for the volume and read-write otherwise.  On success
>  * the handle is stored in ->filp; on failure ->filp is set to NULL.
>  *
>  * Returns 0 on success, -ENOMEM when the path copy cannot be allocated,
>  * or the error encoded in the pointer returned by filp_open().
>  */
> static int
> open_path (struct iet_volume *volume, const char *path)
> {
> 	struct blockio_data *info = (struct blockio_data *) volume->private;
> 	struct file *filp;
> 	mm_segment_t saved_fs;
> 	int flags = O_LARGEFILE | O_SYNC | O_DIRECT;
> 	int err;
>
> 	info->path = kmalloc (strlen (path) + 1, GFP_KERNEL);
> 	if (!info->path)
> 		return -ENOMEM;
> 	/* strcpy copies the terminating NUL along with the characters */
> 	strcpy (info->path, path);
>
> 	saved_fs = get_fs ();
> 	set_fs (get_ds ());
> 	flags |= LUReadonly (volume) ? O_RDONLY : O_RDWR;
> 	filp = filp_open (path, flags, 0);
> 	set_fs (saved_fs);
>
> 	if (!IS_ERR (filp)) {
> 		info->filp = filp;
> 		return 0;
> 	}
>
> 	err = PTR_ERR (filp);
> 	eprintk ("Can't open %s %d\n", path, err);
> 	info->filp = NULL;
>
> 	return err;
> }
>
> /*
>  * Copy a user supplied SCSI id into volume->scsi_id, behind the first
>  * VENDOR_ID_LEN bytes.
>  *
>  * Returns 0 on success, or -EINVAL when @id is longer than
>  * SCSI_ID_LEN - VENDOR_ID_LEN characters.
>  */
> static int
> set_scsiid (struct iet_volume *volume, const char *id)
> {
> 	size_t id_len = strlen (id);
>
> 	if (id_len > SCSI_ID_LEN - VENDOR_ID_LEN) {
> 		eprintk ("too long SCSI ID %lu\n", (unsigned long) id_len);
> 		return -EINVAL;
> 	}
>
> 	/* never write past the end of the scsi_id array */
> 	id_len = min (sizeof (volume->scsi_id) - VENDOR_ID_LEN, id_len);
> 	memcpy (volume->scsi_id + VENDOR_ID_LEN, id, id_len);
>
> 	return 0;
> }
>
> /*
>  * Write VENDOR_ID into the start of volume->scsi_id and, unless some byte
>  * behind the vendor part is already non-zero, fill that part with four
>  * 32-bit words: target type, target id, inode number and device number of
>  * @inode's superblock.
>  */
> static void
> gen_scsiid (struct iet_volume *volume, struct inode *inode)
> {
> 	u32 *words;
> 	int pos;
>
> 	strlcpy (volume->scsi_id, VENDOR_ID, VENDOR_ID_LEN);
>
> 	/* an id is already present: leave it alone */
> 	for (pos = VENDOR_ID_LEN; pos < SCSI_ID_LEN; pos++) {
> 		if (volume->scsi_id[pos] != 0)
> 			return;
> 	}
>
> 	words = (u32 *) (volume->scsi_id + VENDOR_ID_LEN);
> 	words[0] = volume->target->trgt_param.target_type;
> 	words[1] = volume->target->tid;
> 	words[2] = (unsigned int) inode->i_ino;
> 	words[3] = (unsigned int) inode->i_sb->s_dev;
> }
>
> /*
>  * Copy a user supplied serial number into volume->scsi_sn.
>  *
>  * Returns 0 on success, or -EINVAL when @sn is longer than SCSI_SN_LEN
>  * characters.
>  */
> static int
> set_scsisn(struct iet_volume *volume, const char *sn)
> {
> 	size_t sn_len = strlen(sn);
>
> 	if (sn_len > SCSI_SN_LEN) {
> 		eprintk("too long SCSI SN %lu\n", (unsigned long) sn_len);
> 		return -EINVAL;
> 	}
>
> 	memcpy(volume->scsi_sn, sn, sn_len);
>
> 	return 0;
> }
>
> /* Token ids returned by match_token() for the volume parameter string. */
> enum {
> 	Opt_scsiid,
> 	Opt_scsisn,
> 	Opt_path,
> 	Opt_ignore,
> 	Opt_err,
> };
>
> /*
>  * Patterns for the comma separated volume parameters.  Type= and IOMode=
>  * are accepted here but mapped to Opt_ignore; the NULL pattern ends the
>  * table.
>  */
> static match_table_t tokens = {
> 	{ Opt_scsiid, "ScsiId=%s" },
> 	{ Opt_scsisn, "ScsiSN=%s" },
> 	{ Opt_path,   "Path=%s" },
> 	{ Opt_ignore, "Type=%s" },
> 	{ Opt_ignore, "IOMode=%s" },
> 	{ Opt_err,    NULL },
> };
>
> /*
>  * Parse the comma separated parameter string of a volume.
>  *
>  * Empty fields are skipped.  ScsiId=, ScsiSN= and Path= are handed to
>  * set_scsiid(), set_scsisn() and open_path(); fields matching an
>  * Opt_ignore pattern are accepted without action.  @params is modified
>  * in place by strsep().
>  *
>  * Returns 0 on success, -ENOMEM when an argument cannot be duplicated,
>  * -EINVAL for an unrecognised field, or the negative error returned by
>  * one of the handlers.
>  */
> static int
> parse_blockio_params (struct iet_volume *volume, char *params)
> {
> 	int err = 0;
> 	char *p, *q;
>
> 	while ((p = strsep (&params, ",")) != NULL) {
> 		substring_t args[MAX_OPT_ARGS];
> 		int token;
>
> 		if (!*p)
> 			continue;
>
> 		token = match_token (p, tokens, args);
> 		switch (token) {
> 		case Opt_scsiid:
> 			q = match_strdup (&args[0]);
> 			if (!q) {
> 				err = -ENOMEM;
> 				goto out;
> 			}
> 			err = set_scsiid (volume, q);
> 			kfree (q);
> 			if (err < 0)
> 				goto out;
> 			break;
> 		case Opt_scsisn:
> 			q = match_strdup (&args[0]);
> 			if (!q) {
> 				err = -ENOMEM;
> 				goto out;
> 			}
> 			err = set_scsisn (volume, q);
> 			kfree (q);
> 			if (err < 0)
> 				goto out;
> 			break;
> 		case Opt_path:
> 			q = match_strdup (&args[0]);
> 			if (!q) {
> 				err = -ENOMEM;
> 				goto out;
> 			}
> 			/* open_path() keeps its own copy of the string */
> 			err = open_path (volume, q);
> 			kfree (q);
> 			if (err < 0)
> 				goto out;
> 			break;
> 		case Opt_ignore:
> 			break;
> 		default:
> 			eprintk ("Unknown %s\n", p);
> 			err = -EINVAL;
> 			goto out;
> 		}
> 	}
>
> out:
> 	return err;
> }
>
> /*
>  * Undo blockio_attach(): put back the saved backing_dev_info settings of
>  * the device, close the file, free the per-volume data and clear
>  * lu->private.
>  *
>  * blockio_attach() also calls this on its error paths, where the device
>  * may not have been looked up yet; the settings are therefore restored
>  * only when ->device is set (attach saves the old values right after
>  * setting ->device and cannot fail in between).
>  */
> static void
> blockio_detach (struct iet_volume *lu)
> {
> 	struct inode *inode;
> 	struct blockio_data *p = (struct blockio_data *) lu->private;
>
> 	/* nothing attached, nothing to undo */
> 	if (!p)
> 		return;
>
> 	if (p->device) {
> 		inode = p->device->bd_inode;
>
> 		inode->i_mapping->backing_dev_info->ra_pages = p->old_ra_pages;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 		inode->i_mapping->backing_dev_info->capabilities =
> 			p->old_capabilities;
> #else
> 		inode->i_mapping->backing_dev_info->memory_backed =
> 			p->old_memory_backed;
> #endif
> 	}
>
> 	kfree (p->path);
> 	if (p->filp)
> 		filp_close (p->filp, NULL);
> 	kfree (p);
> 	lu->private = NULL;
> }
>
> /*
>  * Attach a block device to the volume.
>  *
>  * Allocates the per-volume blockio_data, parses @args, checks that the
>  * opened path is a block device, saves the device's read-ahead and
>  * capabilities/memory_backed settings before overriding them, and sets
>  * the volume's block shift and block count.
>  *
>  * Returns 0 on success, -EBUSY when the volume already has private data,
>  * -ENOMEM on allocation failure, -EINVAL when no path was opened or the
>  * path is not a block device, or the error from parse_blockio_params().
>  */
> static int
> blockio_attach (struct iet_volume *lu, char *args)
> {
> 	int err = 0;
> 	struct blockio_data *p;
> 	struct inode *inode;
>
> 	if (lu->private) {
> 		printk ("already attached ? %d\n", lu->lun);
> 		return -EBUSY;
> 	}
>
> 	if (!(p = kmalloc (sizeof (*p), GFP_KERNEL)))
> 		return -ENOMEM;
> 	memset (p, 0, sizeof (*p));
> 	lu->private = p;
>
> 	if ((err = parse_blockio_params (lu, args)) < 0) {
> 		eprintk ("%d\n", err);
> 		goto out;
> 	}
>
> 	/* parsing succeeds without a Path= option; ->filp is NULL then */
> 	if (!p->filp) {
> 		eprintk ("missing Path parameter\n");
> 		err = -EINVAL;
> 		goto out;
> 	}
> 	inode = p->filp->f_dentry->d_inode;
>
> 	gen_scsiid (lu, inode);
>
> 	/* Only block devices are allowed here */
> 	if (S_ISBLK (inode->i_mode)) {
> 		inode = inode->i_bdev->bd_inode;
> 		p->device = inode->i_bdev;
> 		printk (KERN_INFO "Max queue length: %d \n",
> 			p->device->bd_disk->queue->max_sectors);
> 	}
> 	else {
> 		err = -EINVAL;
> 		goto out;
> 	}
>
> 	/* remember the current settings so detach can put them back */
> 	p->old_ra_pages = inode->i_mapping->backing_dev_info->ra_pages;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 	p->old_capabilities =
> 		inode->i_mapping->backing_dev_info->capabilities;
> #else
> 	p->old_memory_backed =
> 		inode->i_mapping->backing_dev_info->memory_backed;
> #endif
>
> 	inode->i_mapping->backing_dev_info->ra_pages = 0;
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
> 	inode->i_mapping->backing_dev_info->capabilities =
> 		BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK;
> #else
> 	inode->i_mapping->backing_dev_info->memory_backed = 1;
> #endif
>
> 	/* get sector size of the block device */
> 	lu->blk_shift = SECTOR_SIZE_BITS;
> 	lu->blk_cnt = inode->i_size >> lu->blk_shift;
>
> out:
> 	if (err < 0) {
> 		/*
> 		 * Every failure happens before any device setting has been
> 		 * changed, so only release what was acquired above.
> 		 */
> 		kfree (p->path);
> 		if (p->filp)
> 			filp_close (p->filp, NULL);
> 		kfree (p);
> 		lu->private = NULL;
> 	}
> 	return err;
> }
>
> /* Print the path of the attached device to @seq. */
> void
> blockio_show (struct iet_volume *lu, struct seq_file *seq)
> {
> 	struct blockio_data *info;
>
> 	info = (struct blockio_data *) lu->private;
> 	seq_printf (seq, " path:%s\n", info->path);
> }
>
> /* Operations table for the "blockio" I/O type. */
> struct iotype blockio = {
> 	.name		= "blockio",
> 	.attach		= blockio_attach,
> 	.detach		= blockio_detach,
> 	.make_request	= blockio_make_request,
> 	.show		= blockio_show,
> };
>
> ______________________________________________________________________
> This e-mail, and any attachments thereto, is intended only for use by
> the addressee(s) named herein and may contain legally privileged
> and/or confidential information. If you are not the intended recipient
> of this e-mail, you are hereby notified that any dissemination,
> distribution or copying of this e-mail, and any attachments thereto,
> is strictly prohibited. If you have received this e-mail in error,
> please immediately notify the sender and permanently delete the
> original and any copy or printout thereof.
>
> _______________________________________________
> drbd-user mailing list
> drbd-user at lists.linbit.com
> http://lists.linbit.com/mailman/listinfo/drbd-user
>
______________________________________________________________________
This e-mail, and any attachments thereto, is intended only for use by
the addressee(s) named herein and may contain legally privileged
and/or confidential information. If you are not the intended recipient
of this e-mail, you are hereby notified that any dissemination,
distribution or copying of this e-mail, and any attachments thereto,
is strictly prohibited. If you have received this e-mail in error,
please immediately notify the sender and permanently delete the
original and any copy or printout thereof.