Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.
Hello,
The thread "drbd8 and 80+ 1TB mirrors/cluster, can it be done?"
suggests using RAID5 or RAID6 on top of DRBD to improve redundancy...
A distributed RAID 15/16 :-)
I like this idea so I tried to build it in VMware but I can't get the
RAID5 array to assemble on the mirrored node :-/ I hope I'm missing
something basic but I could not find any documentation on the web
about this. My best guess is that drbd is not synchronizing the RAID
superblocks, but I don't know how to change this... I know I can flip
this setup and use DRBD to mirror a RAID5 array but I would rather
mirror first for improved reliability and better re-sync characteristics...
In my lab I've got 2 VMware images (vmpod-000-0 and vmpod-000-1), each
with 5 1GB SCSI drives (/dev/sdb - /dev/sdf). I'm using drbd v8.0.12
and mdadm v2.5.6 on 64-bit Debian "Etch."
Here's what I tried to do in copy/paste script format....
--------------------------------------------------------------------------------
fdisk /dev/sdb
# Create a new primary partition (1) using 99% of the cylinders
# (120375 on WD 1TB drives) of type Linux LVM (8e)
# Write changes to disk
sfdisk -d /dev/sdb | sfdisk /dev/sdc
sfdisk -d /dev/sdb | sfdisk /dev/sdd
sfdisk -d /dev/sdb | sfdisk /dev/sde
sfdisk -d /dev/sdb | sfdisk /dev/sdf
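# (Aside, an assumption on my part: the interactive fdisk step could also be
# scripted, e.g. "echo ',,8e' | sfdisk /dev/sdb" for a single whole-disk LVM
# partition instead of 99% of the cylinders. Either way, the copied partition
# tables can be checked with:)
for d in /dev/sd[c-f]; do sfdisk -l $d; done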
# Give the volumes stable device names using LVM
pvcreate /dev/sdb
pvcreate /dev/sdc
pvcreate /dev/sdd
pvcreate /dev/sde
pvcreate /dev/sdf
vgcreate vg00 /dev/sdb
vgcreate vg01 /dev/sdc
vgcreate vg02 /dev/sdd
vgcreate vg03 /dev/sde
vgcreate vg04 /dev/sdf
lvcreate -l`vgdisplay vg00 | grep "Total PE" | awk '{print $3;}'` vg00 -n lv00
lvcreate -l`vgdisplay vg01 | grep "Total PE" | awk '{print $3;}'` vg01 -n lv01
lvcreate -l`vgdisplay vg02 | grep "Total PE" | awk '{print $3;}'` vg02 -n lv02
lvcreate -l`vgdisplay vg03 | grep "Total PE" | awk '{print $3;}'` vg03 -n lv03
lvcreate -l`vgdisplay vg04 | grep "Total PE" | awk '{print $3;}'` vg04 -n lv04
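# (Aside: the pvcreate/vgcreate/lvcreate sequence above could also be written
# as a loop; this sketch assumes the same device and volume names and is left
# commented out because it duplicates the commands above.)
# i=0
# for d in b c d e f; do
#   pvcreate /dev/sd$d
#   vgcreate vg0$i /dev/sd$d
#   lvcreate -l"$(vgdisplay vg0$i | awk '/Total PE/ {print $3}')" vg0$i -n lv0$i
#   i=$((i+1))
# done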
# Configure drbd8....
cat >/etc/drbd.conf << EOF
#
# drbd.conf
#
global {
usage-count no;
minor-count 85;
}
common {
protocol C;
handlers {
pri-on-incon-degr "echo o > /proc/sysrq-trigger ; halt -f";
pri-lost-after-sb "echo o > /proc/sysrq-trigger ; halt -f";
local-io-error "echo o > /proc/sysrq-trigger ; halt -f";
# outdate-peer "/usr/lib/heartbeat/drbd-peer-outdater -t 5";
pri-lost "echo pri-lost. Have a look at the log files. | mail -s
'DRBD Alert' root";
split-brain "echo split-brain. drbdadm -- --discard-my-data
connect \$DRBD_RESOURCE ? | mail -s 'DRBD Alert' root";
}
startup {
wfc-timeout 0; ## Infinite!
degr-wfc-timeout 120; ## 2 minutes.
}
disk {
on-io-error detach;
}
net {
# sndbuf-size 512k;
# timeout 60; # 6 seconds (unit = 0.1 seconds)
# connect-int 10; # 10 seconds (unit = 1 second)
# ping-int 10; # 10 seconds (unit = 1 second)
# max-buffers 2048;
# unplug-watermark 128;
# max-epoch-size 2048;
# ko-count 4;
}
syncer {
rate 33M;
# group 1;
al-extents 257;
}
}
resource lv00 {
on vmpod-000-0 {
device /dev/drbd0;
disk /dev/vg00/lv00;
flexible-meta-disk internal;
address 172.16.0.101:7788;
}
on vmpod-000-1 {
device /dev/drbd0;
disk /dev/vg00/lv00;
flexible-meta-disk internal;
address 172.16.0.102:7788;
}
}
resource lv01 {
on vmpod-000-0 {
device /dev/drbd1;
disk /dev/vg01/lv01;
flexible-meta-disk internal;
address 172.16.0.101:7789;
}
on vmpod-000-1 {
device /dev/drbd1;
disk /dev/vg01/lv01;
flexible-meta-disk internal;
address 172.16.0.102:7789;
}
}
resource lv02 {
on vmpod-000-0 {
device /dev/drbd2;
disk /dev/vg02/lv02;
flexible-meta-disk internal;
address 172.16.0.101:7790;
}
on vmpod-000-1 {
device /dev/drbd2;
disk /dev/vg02/lv02;
flexible-meta-disk internal;
address 172.16.0.102:7790;
}
}
resource lv03 {
on vmpod-000-0 {
device /dev/drbd3;
disk /dev/vg03/lv03;
flexible-meta-disk internal;
address 172.16.0.101:7791;
}
on vmpod-000-1 {
device /dev/drbd3;
disk /dev/vg03/lv03;
flexible-meta-disk internal;
address 172.16.0.102:7791;
}
}
resource lv04 {
on vmpod-000-0 {
device /dev/drbd4;
disk /dev/vg04/lv04;
flexible-meta-disk internal;
address 172.16.0.101:7792;
}
on vmpod-000-1 {
device /dev/drbd4;
disk /dev/vg04/lv04;
flexible-meta-disk internal;
address 172.16.0.102:7792;
}
}
EOF
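# (Aside: "drbdadm dump all" just parses the config and prints it back, which
# is a quick way to confirm the file is syntactically sound before continuing.)
drbdadm dump all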
drbdadm create-md lv00
drbdadm create-md lv01
drbdadm create-md lv02
drbdadm create-md lv03
drbdadm create-md lv04
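# (Aside, an assumption on my part: the resources have to be attached and
# connected on both nodes before they can be promoted -- either via
# /etc/init.d/drbd start or explicitly with something like the line below.)
# drbdadm up all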
# Only on the primary system...
drbdadm -- --overwrite-data-of-peer primary lv00
drbdadm -- --overwrite-data-of-peer primary lv01
drbdadm -- --overwrite-data-of-peer primary lv02
drbdadm -- --overwrite-data-of-peer primary lv03
drbdadm -- --overwrite-data-of-peer primary lv04
## At this point DRBD is working great and the mirrors are syncing nicely :-) ##
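# (Aside: the sync can be followed until every device reports UpToDate/UpToDate.)
# watch -n5 cat /proc/drbd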
# Configure a RAID5 volume on top of DRBD
mdadm --create /dev/md2 --auto=yes -l 5 -n 5 /dev/drbd0 /dev/drbd1 /dev/drbd2 /dev/drbd3 /dev/drbd4
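# (Aside, an assumption on my part: for the later "mdadm --assemble --scan" to
# find this array on either node, /etc/mdadm/mdadm.conf probably needs a DEVICE
# line covering the drbd devices plus the ARRAY line, on both machines; roughly
# the following, ignoring the other stanzas Debian ships in that file.)
# echo "DEVICE /dev/drbd*" >> /etc/mdadm/mdadm.conf
# mdadm --detail --scan >> /etc/mdadm/mdadm.conf
# scp /etc/mdadm/mdadm.conf vmpod-000-1:/etc/mdadm/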
mkfs.ext3 -b 4096 /dev/md2
## Test...
mount -t ext3 /dev/md2 /mnt
echo "Hello Tim" > /mnt/hi
cat /mnt/hi
## All of this worked great, so try to fail-over to the other node...
umount /mnt
mdadm --stop --scan
drbdadm secondary all
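# (Aside: "drbdadm state all" should now report Secondary/Secondary here,
# confirming the role change before touching the other node.)
drbdadm state all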
# On the second node
drbdadm primary all
mdadm --assemble --scan
## This last call returns "mdadm: No arrays found in config file or automatically"
--------------------------------------------------------------------------------
I tried adding "DEVICE /dev/drbd*" to /etc/mdadm/mdadm.conf but that
did not help..
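My next guess, untested so treat it as an assumption, would be to check on the
second node (while it is primary) whether the md superblock is even visible on
the drbd devices, and to try assembling by naming the devices explicitly rather
than relying on --scan:
mdadm --examine /dev/drbd0
mdadm --assemble /dev/md2 /dev/drbd0 /dev/drbd1 /dev/drbd2 /dev/drbd3 /dev/drbd4
If the explicit assemble works, then --scan simply has nothing to go on (no
matching DEVICE/ARRAY lines in that node's mdadm.conf); if --examine finds no
RAID5 superblock, the data really is not reaching the peer.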
If I look at the drbd status everything looks right...
vmpod-000-1:/home/tnufire# cat /proc/drbd
version: 8.0.12 (api:86/proto:86)
GIT-hash: 5c9f89594553e32adb87d9638dce591782f947e3 build by root@vmpod-000-1, 2008-05-29 21:17:12
0: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079032 dw:1079032 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
1: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079168 dw:1079168 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
2: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079340 dw:1079340 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
3: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1079204 dw:1079204 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
4: cs:Connected st:Primary/Secondary ds:UpToDate/UpToDate C r---
ns:0 nr:1078772 dw:1078772 dr:216 al:0 bm:64 lo:0 pe:0 ua:0 ap:0
resync: used:0/61 hits:65212 misses:64 starving:0 dirty:0 changed:64
act_log: used:0/257 hits:0 misses:0 starving:0 dirty:0 changed:0
If I reverse the process and go back to the first node, "mdadm --assemble --scan"
returns "mdadm: /dev/md/2 has been started with 5 drives" as it should.
What am I missing? Should this work?
Thanks,
Tim