Note: "permalinks" may not be as permanent as we would like;
direct links to old sources may well be a few messages off.
I am having some serious performance issues with a kernel module for
iSCSI Enterprise Target and drbd 0.7.21 on CentOS 4.4.
The kernel module (source code listed below) basically takes a data
request issued over iSCSI, translates it into a bio request, and then
carries it out synchronously against the device below.
When run against an MD/LVM/SD device I don't have any problems and I get
the performance I would expect, but when run against a drbd 0.7.21 device it
chokes down hard. For example, when doing sequential 64K block writes directly
to the device I can get 112 MB/s sustained; with drbd in the middle, that
throughput drops to 10 MB/s.
Can anybody help explain this, or point out a serious flaw in the code
below that would cause it? I would rather solve the problem (if it
can be solved) than try to run the version 8 beta in production (if that
would even solve my problem).
Thanks,
---------------- block-io.c
/*
* Target device block I/O.
*
* Based on file I/O driver from FUJITA Tomonori
* (C) 2004 - 2005 FUJITA Tomonori <tomof at acm.org>
* (C) 2006 Andre Brinkmann <brinkman at hni.upb.de>
* This code is licenced under the GPL.
*/
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/parser.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/genhd.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include "iscsi.h"
#include "iscsi_dbg.h"
#include "iotype.h"
/*
 * Per-volume state of the blockio I/O type, kept in iet_volume->private.
 */
struct blockio_data
{
/* private copy of the Path= argument, allocated in open_path () */
char *path;
/* block device the bios are submitted to; set in blockio_attach () */
struct block_device *device;
/* open file on the device node, NULL if the open failed */
struct file *filp;
/* ra_pages of the backing device as found at attach time */
unsigned long old_ra_pages;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
/* backing_dev_info capabilities as found at attach time */
unsigned int old_capabilities;
#else
/* backing_dev_info memory_backed flag as found at attach time */
int old_memory_backed;
#endif
};
static void
blockio_bio_endio (struct bio *bio, unsigned int bytes_done, int error)
{
struct completion *wait = bio->bi_private;
/* check if all bytes have been written */
if (bio->bi_size)
eprintk ("I/O error %d Not all bytes written for
bio\n",
bio->bi_size);
if (error)
eprintk ("I/O error: Error %d occured \n", error);
if (bio->bi_private) {
wait = (struct completion *) bio->bi_private;
complete (wait);
}
/* release bio structure */
bio_put (bio);
return;
}
/**
* blockio_make_request(): The function translates an iscsi-request into
* a number of requests to the corresponding block device.
**/
int
blockio_make_request (struct iet_volume *lu, struct tio *tio, int rw)
{
struct blockio_data *p;
struct block_device *target_device;
struct request_queue *target_queue;
struct bio *target_bio;
int max_sectors;
int pg_number;
int page_count;
int counter;
struct page *page;
mm_segment_t oldfs;
u32 offset, size;
u32 len;
loff_t ppos;
int i;
ssize_t ret;
DECLARE_COMPLETION (work);
p = (struct blockio_data *) lu->private;
assert (p);
target_device = p->device;
assert (target_device);
size = tio->size;
offset = tio->offset;
ppos = (loff_t) tio->idx << PAGE_SHIFT;
ppos += offset;
/* All IO is to be synchronous */
rw |= (1 << BIO_RW_SYNC);
/* Get maximum number of sectors / pages that could be sent to
target
* block device within a single bio-structure */
target_queue = target_device->bd_disk->queue;
if (target_queue) {
max_sectors = target_queue->max_sectors;
if (max_sectors > 0) {
pg_number = (max_sectors << SECTOR_SIZE_BITS) >>
PAGE_SHIFT;
if (pg_number > tio->pg_cnt)
pg_number = tio->pg_cnt;
}
else
pg_number = tio->pg_cnt;
}
else {
max_sectors = 0;
pg_number = tio->pg_cnt;
}
page_count = 0;
counter = tio->pg_cnt;
while (counter > 0) {
/* get new bio-structure */
target_bio = bio_alloc (GFP_NOIO, pg_number);
if (!target_bio) {
eprintk ("I/O error: %d\n", page_count);
return -ENOMEM;
}
/* Initialize bio */
target_bio->bi_sector = ppos >> SECTOR_SIZE_BITS;
target_bio->bi_bdev = target_device;
target_bio->bi_rw = rw;
target_bio->bi_end_io = (bio_end_io_t *)
blockio_bio_endio;
target_bio->bi_private = &work;
for (i = 0; i < pg_number; i++) {
page = tio->pvec[page_count];
assert (page);
/* calc access length for this page */
len = PAGE_SIZE;
if (offset)
len -= offset;
if (size < len)
len = size;
/* bio_add_page returns len if successful */
ret = bio_add_page (target_bio, page, len,
offset);
if (!ret) {
eprintk ("I/O error: %ld\n", (long)
ret);
return -EIO;
}
/* offset valid only once */
offset = 0;
size -= len;
page_count++;
}
counter -= pg_number;
ppos += (pg_number << PAGE_SHIFT);
if (pg_number > counter)
pg_number = counter;
oldfs = get_fs ();
set_fs (get_ds ());
/* send bio to generic_make_request */
submit_bio (rw, target_bio);
wait_for_completion (&work);
set_fs (oldfs);
}
assert (!size);
return 0;
}
static int
open_path (struct iet_volume *volume, const char *path)
{
int err = 0;
struct blockio_data *info = (struct blockio_data *)
volume->private;
struct file *filp;
mm_segment_t oldfs;
int flags;
info->path = kmalloc (strlen (path) + 1, GFP_KERNEL);
if (!info->path)
return -ENOMEM;
strcpy (info->path, path);
info->path[strlen (path)] = '\0';
oldfs = get_fs ();
set_fs (get_ds ());
flags = (LUReadonly (volume) ? O_RDONLY : O_RDWR) | O_LARGEFILE
| O_SYNC | O_DIRECT;
filp = filp_open (path, flags, 0);
set_fs (oldfs);
if (IS_ERR (filp)) {
err = PTR_ERR (filp);
eprintk ("Can't open %s %d\n", path, err);
info->filp = NULL;
}
else
info->filp = filp;
return err;
}
/*
 * set_scsiid(): store a user supplied SCSI ID behind the vendor prefix of
 * volume->scsi_id.
 *
 * Returns 0 on success or -EINVAL if @id is longer than
 * SCSI_ID_LEN - VENDOR_ID_LEN characters.
 */
static int
set_scsiid (struct iet_volume *volume, const char *id)
{
	size_t id_len = strlen (id);
	size_t room = sizeof (volume->scsi_id) - VENDOR_ID_LEN;

	if (id_len > SCSI_ID_LEN - VENDOR_ID_LEN) {
		eprintk ("too long SCSI ID %lu\n", (unsigned long) id_len);
		return -EINVAL;
	}

	/* never copy more than the buffer holds after the vendor part */
	memcpy (volume->scsi_id + VENDOR_ID_LEN, id, min (room, id_len));

	return 0;
}
static void
gen_scsiid (struct iet_volume *volume, struct inode *inode)
{
int i;
u32 *p;
strlcpy (volume->scsi_id, VENDOR_ID, VENDOR_ID_LEN);
for (i = VENDOR_ID_LEN; i < SCSI_ID_LEN; i++)
if (volume->scsi_id[i])
return;
p = (u32 *) (volume->scsi_id + VENDOR_ID_LEN);
*(p + 0) = volume->target->trgt_param.target_type;
*(p + 1) = volume->target->tid;
*(p + 2) = (unsigned int) inode->i_ino;
*(p + 3) = (unsigned int) inode->i_sb->s_dev;
}
/*
 * set_scsisn(): copy a user supplied serial number into volume->scsi_sn.
 *
 * Returns 0 on success or -EINVAL if @sn is longer than SCSI_SN_LEN.
 * Only the characters are copied, not the terminating NUL.
 */
static int
set_scsisn(struct iet_volume *volume, const char *sn)
{
	const size_t sn_len = strlen(sn);

	if (sn_len <= SCSI_SN_LEN) {
		memcpy(volume->scsi_sn, sn, sn_len);
		return 0;
	}

	eprintk("too long SCSI SN %lu\n", (unsigned long) sn_len);
	return -EINVAL;
}
/* Option identifiers produced by match_token () in parse_blockio_params () */
enum
{
Opt_scsiid, Opt_scsisn, Opt_path, Opt_ignore, Opt_err,
};
/*
 * Patterns for the comma separated volume parameters.  Type= and IOMode=
 * both map to Opt_ignore, i.e. they are accepted but not acted upon here.
 */
static match_table_t tokens = {
{Opt_scsiid, "ScsiId=%s"},
{Opt_scsisn, "ScsiSN=%s"},
{Opt_path, "Path=%s"},
{Opt_ignore, "Type=%s"},
{Opt_ignore, "IOMode=%s"},
{Opt_err, NULL},
};
/*
 * parse_blockio_params(): split @params at commas and apply each option
 * to @volume.
 *
 * ScsiId=, ScsiSN= and Path= are handed to set_scsiid (), set_scsisn () and
 * open_path () respectively; Type= and IOMode= are accepted and ignored.
 * @params is modified in place by strsep ().
 *
 * Returns 0 on success, -ENOMEM if an option value cannot be duplicated,
 * -EINVAL for an unknown option, or the error of the failing handler.
 */
static int
parse_blockio_params (struct iet_volume *volume, char *params)
{
	int err = 0;
	char *p, *q;

	while ((p = strsep (&params, ",")) != NULL) {
		substring_t args[MAX_OPT_ARGS];
		int token;

		/* skip empty fields such as those left by ",," */
		if (!*p)
			continue;

		token = match_token (p, tokens, args);
		switch (token) {
		case Opt_scsiid:
		case Opt_scsisn:
		case Opt_path:
			/* all three take one string value, owned by us */
			q = match_strdup (&args[0]);
			if (!q)
				return -ENOMEM;

			if (token == Opt_scsiid)
				err = set_scsiid (volume, q);
			else if (token == Opt_scsisn)
				err = set_scsisn (volume, q);
			else
				err = open_path (volume, q);

			kfree (q);
			if (err < 0)
				return err;
			break;
		case Opt_ignore:
			break;
		default:
			eprintk ("Unknown %s\n", p);
			return -EINVAL;
		}
	}

	return err;
}
static void
blockio_detach (struct iet_volume *lu)
{
struct inode *inode;
struct blockio_data *p = (struct blockio_data *) lu->private;
inode = p->device->bd_inode;
inode->i_mapping->backing_dev_info->ra_pages = p->old_ra_pages;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
inode->i_mapping->backing_dev_info->capabilities =
p->old_capabilities;
#else
inode->i_mapping->backing_dev_info->memory_backed =
p->old_memory_backed;
#endif
kfree (p->path);
if (p->filp)
filp_close (p->filp, NULL);
kfree (p);
lu->private = NULL;
}
static int
blockio_attach (struct iet_volume *lu, char *args)
{
int err = 0;
struct blockio_data *p;
struct inode *inode;
if (lu->private) {
printk ("already attached ? %d\n", lu->lun);
return -EBUSY;
}
if (!(p = kmalloc (sizeof (*p), GFP_KERNEL)))
return -ENOMEM;
memset (p, 0, sizeof (*p));
lu->private = p;
if ((err = parse_blockio_params (lu, args)) < 0) {
eprintk ("%d\n", err);
goto out;
}
inode = p->filp->f_dentry->d_inode;
gen_scsiid (lu, inode);
/* Only block devices are allowed here */
if (S_ISBLK (inode->i_mode)) {
inode = inode->i_bdev->bd_inode;
p->device = inode->i_bdev;
printk (KERN_INFO "Max queue length: %d \n",
p->device->bd_disk->queue->max_sectors);
}
else {
err = -EINVAL;
goto out;
}
p->old_ra_pages = inode->i_mapping->backing_dev_info->ra_pages;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
p->old_capabilities =
inode->i_mapping->backing_dev_info->capabilities;
#else
p->old_memory_backed =
inode->i_mapping->backing_dev_info->memory_backed;
#endif
inode->i_mapping->backing_dev_info->ra_pages = 0;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12))
inode->i_mapping->backing_dev_info->capabilities =
BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK;
#else
inode->i_mapping->backing_dev_info->memory_backed = 1;
#endif
/* get sector size of the block device */
lu->blk_shift = SECTOR_SIZE_BITS;
lu->blk_cnt = inode->i_size >> lu->blk_shift;
out:
if (err < 0)
blockio_detach (lu);
return err;
}
void
blockio_show (struct iet_volume *lu, struct seq_file *seq)
{
struct blockio_data *p = (struct blockio_data *) lu->private;
seq_printf (seq, " path:%s\n", p->path);
}
/* Operations table of the I/O type named "blockio" */
struct iotype blockio = {
.name = "blockio",
.attach = blockio_attach,
.make_request = blockio_make_request,
.detach = blockio_detach,
.show = blockio_show,
};
______________________________________________________________________
This e-mail, and any attachments thereto, is intended only for use by
the addressee(s) named herein and may contain legally privileged
and/or confidential information. If you are not the intended recipient
of this e-mail, you are hereby notified that any dissemination,
distribution or copying of this e-mail, and any attachments thereto,
is strictly prohibited. If you have received this e-mail in error,
please immediately notify the sender and permanently delete the
original and any copy or printout thereof.