Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.
Hello,
The thread "drbd8 and 80+ 1TB mirrors/cluster, can it be done?"
suggests using RAID5 or RAID6 on top of DRBD to improve redundancy...
A distributed RAID 15/16 :-)
I like this idea so I tried to build it in VMware but I can't get the
RAID5 array to assemble on the mirrored node :-/ I hope I'm missing
something basic but I could not find any documentation on the web
about this. My best guess is that drbd is not synchronizing the RAID
superblocks, but I don't know how to change this... I know I can flip
this setup and use DRBD to mirror a RAID5 array but I would rather
mirror first for improved reliability and better re-sync characteristics...
In my lab I've got 2 VMware images (vmpod-000-0 and vmpod-000-1), each
with 5 1GB SCSI drives (/dev/sdb - /dev/sdf). I'm using drbd v8.0.12
and mdadm v2.5.6 on 64-bit Debian "Etch."
Here's what I tried to do in copy/paste script format....
--------------------------------------------------------------------------------
fdisk /dev/sdb
# Create a new primary partition (1) using 99% of the cylinders
# (120375 on WD 1TB drives) of type Linux LVM (8e)
# Write changes to disk
sfdisk -d /dev/sdb | sfdisk /dev/sdc
sfdisk -d /dev/sdb | sfdisk /dev/sdd
sfdisk -d /dev/sdb | sfdisk /dev/sde
sfdisk -d /dev/sdb | sfdisk /dev/sdf
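# (Aside, an assumption on my part: the interactive fdisk step could also be
# scripted, e.g. "echo ',,8e' | sfdisk /dev/sdb" for a single whole-disk LVM
# partition instead of 99% of the cylinders. Either way, the copied partition
# tables can be checked with:)
for d in /dev/sd[c-f]; do sfdisk -l $d; done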
# Give the volumes stable device names using LVM
pvcreate /dev/sdb
pvcreate /dev/sdc
pvcreate /dev/sdd
pvcreate /dev/sde
pvcreate /dev/sdf
vgcreate vg00 /dev/sdb
vgcreate vg01 /dev/sdc
vgcreate vg02 /dev/sdd
vgcreate vg03 /dev/sde
vgcreate vg04 /dev/sdf
lvcreate -l`vgdisplay vg00 | grep "Total PE" | awk '{print $3;}'` vg00 -n lv00
lvcreate -l`vgdisplay vg01 | grep "Total PE" | awk '{print $3;}'` vg01 -n lv01
lvcreate -l`vgdisplay vg02 | grep "Total PE" | awk '{print $3;}'` vg02 -n lv02
lvcreate -l`vgdisplay vg03 | grep "Total PE" | awk '{print $3;}'` vg03 -n lv03
lvcreate -l`vgdisplay vg04 | grep "Total PE" | awk '{print $3;}'` vg04 -n lv04
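# (Aside: the pvcreate/vgcreate/lvcreate sequence above could also be written
# as a loop; this sketch assumes the same device and volume names and is left
# commented out because it duplicates the commands above.)
# i=0
# for d in b c d e f; do
#   pvcreate /dev/sd$d
#   vgcreate vg0$i /dev/sd$d
#   lvcreate -l"$(vgdisplay vg0$i | awk '/Total PE/ {print $3}')" vg0$i -n lv0$i
#   i=$((i+1))
# done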
# Configure drbd8....
cat >/etc/drbd.conf << EOF
#
# drbd.conf
#
global {
usage-count no;
minor-count 85;
}
common {
protocol C;
handlers {
pri-on-incon-degr "echo o > /proc/sysrq-trigger ; halt -f";
pri-lost-after-sb "echo o > /proc/sysrq-trigger ; halt -f";
local-io-error "echo o > /proc/sysrq-trigger ; halt -f";
# outdate-peer "/usr/lib/heartbeat/drbd-peer-outdater -t 5";
pri-lost "echo pri-lost. Have a look at the log files. | mail -s
'DRBD Alert' root";
split-brain "echo split-brain. drbdadm -- --discard-my-data
connect \$DRBD_RESOURCE ? | mail -s 'DRBD Alert' root";
}
startup {
wfc-timeout 0; ## Infinite!
degr-wfc-timeout 120; ## 2 minutes.
}
disk {
on-io-error detach;
}
net {
# sndbuf-size 512k;
# timeout 60; # 6 seconds (unit = 0.1 seconds)
# connect-int 10; # 10 seconds (unit = 1 second)
# ping-int 10; # 10 seconds (unit = 1 second)
# max-buffers 2048;
# unplug-watermark 128;
# max-epoch-size 2048;
# ko-count 4;
}
syncer {
rate 33M;
# group 1;
al-extents 257;
}
}
resource lv00 {
on vmpod-000-0 {
device /dev/drbd0;
disk /dev/vg00/lv00;
flexible-meta-disk internal;
address 172.16.0.101:7788;
}
on vmpod-000-1 {
device /dev/drbd0;
disk /dev/vg00/lv00;
flexible-meta-disk internal;
address 172.16.0.102:7788;
}
}
resource lv01 {
on vmpod-000-0 {
device /dev/drbd1;
disk /dev/vg01/lv01;
flexible-meta-disk internal;
address 172.16.0.101:7789;
}
on vmpod-000-1 {
device /dev/drbd1;
disk /dev/vg01/lv01;
flexible-meta-disk internal;
address 172.16.0.102:7789;
}
}
resource lv02 {
on vmpod-000-0 {
device /dev/drbd2;
disk /dev/vg02/lv02;
flexible-meta-disk internal;
address 172.16.0.101:7790;
}
on vmpod-000-1 {
device /dev/drbd2;
disk /dev/vg02/lv02;
flexible-meta-disk internal;
address 172.16.0.102:7790;
}
}
resource lv03 {
on vmpod-000-0 {
device /dev/drbd3;
disk /dev/vg03/lv03;
flexible-meta-disk internal;
address 172.16.0.101:7791;
}
on vmpod-000-1 {
device /dev/drbd3;
disk /dev/vg03/lv03;
flexible-meta-disk internal;
address 172.16.0.102:7791;
}
}
resource lv04 {
on vmpod-000-0 {
device /dev/drbd4;
disk /dev/vg04/lv04;
flexible-meta-disk internal;
address 172.16.0.101:7792;
}
on vmpod-000-1 {
device /dev/drbd4;
disk /dev/vg04/lv04;
flexible-meta-disk internal;
address 172.16.0.102:7792;
}
}
EOF
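# (Aside: "drbdadm dump all" just parses the config and prints it back, which
# is a quick way to confirm the file is syntactically sound before continuing.)
drbdadm dump all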
drbdadm create-md lv00
drbdadm create-md lv01
drbdadm create-md lv02
drbdadm create-md lv03
drbdadm create-md lv04
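# (Aside, an assumption on my part: the resources have to be attached and
# connected on both nodes before they can be promoted -- either via
# /etc/init.d/drbd start or explicitly with something like the line below.)
# drbdadm up all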
# Only on the primary system...
drbdadm -- --overwrite-data-of-peer primary lv00
drbdadm -- --overwrite-data-of-peer primary lv01
drbdadm -- --overwrite-data-of-peer primary lv02
drbdadm -- --overwrite-data-of-peer primary lv03
drbdadm -- --overwrite-data-of-peer primary lv04
## At this point DRBD is working great and the mirrors are syncing nicely :-) ##
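# (Aside: the sync can be followed until every device reports UpToDate/UpToDate.)
# watch -n5 cat /proc/drbd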
# Configure a RAID5 volume on top of DRBD
mdadm --create /dev/md2 --auto=yes -l 5 -n 5 /dev/drbd0 /dev/drbd1 /dev/drbd2 /dev/drbd3 /dev/drbd4
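# (Aside, an assumption on my part: for the later "mdadm --assemble --scan" to
# find this array on either node, /etc/mdadm/mdadm.conf probably needs a DEVICE
# line covering the drbd devices plus the ARRAY line, on both machines; roughly
# the following, ignoring the other stanzas Debian ships in that file.)
# echo "DEVICE /dev/drbd*" >> /etc/mdadm/mdadm.conf
# mdadm --detail --scan >> /etc/mdadm/mdadm.conf
# scp /etc/mdadm/mdadm.conf vmpod-000-1:/etc/mdadm/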
mkfs.ext3 -b 4096 /dev/md2
## Test...
mount -t ext3 /dev/md2 /mnt
echo "Hello Tim" > /mnt/hi
cat /mnt/hi
## All of this worked great, so try to fail-over to the other node...
umount /mnt
mdadm --stop --scan
drbdadm secondary all
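# (Aside: "drbdadm state all" should now report Secondary/Secondary here,
# confirming the role change before touching the other node.)
drbdadm state all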
# On the second node
drbdadm primary all
mdadm --assemble --scan
## This last call returns "mdadm: No arrays found in config file or automatically"
--------------------------------------------------------------------------------
I tried adding "DEVICE /dev/drbd*" to /etc/mdadm/mdadm.conf but that
did not help..
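My next guess, untested so treat it as an assumption, would be to check on the
second node (while it is primary) whether the md superblock is even visible on
the drbd devices, and to try assembling by naming the devices explicitly rather
than relying on --scan:
mdadm --examine /dev/drbd0
mdadm --assemble /dev/md2 /dev/drbd0 /dev/drbd1 /dev/drbd2 /dev/drbd3 /dev/drbd4
If the explicit assemble works, then --scan simply has nothing to go on (no
matching DEVICE/ARRAY lines in that node's mdadm.conf); if --examine finds no
RAID5 superblock, the data really is not reaching the peer.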
If I look at the drbd status everything looks right...
vmpod-000-1:/home/tnufire# cat /proc/drbd
version: 8.0.12 (api:86/proto:86)
GIT-hash: 5c9f89594553e32adb87d9638dce591782f947e3 build by root@vmpod-000-1, 2008-05-29 21:17:12
0: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079032 dw:1079032 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
1: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079168 dw:1079168 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
2: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079340 dw:1079340 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
3: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079204 dw:1079204 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
4: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1078772 dw:1078772 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
If I reverse the process and go back to the first node, "mdadm --assemble --scan"
returns "mdadm: /dev/md/2 has been started with 5 drives" as it should.
What am I missing? Should this work?
Thanks,
Tim