[DRBD-user] Avoiding split brain

Wed May 26 20:05:55 CEST 2010

Hi all,

I have a two-node HA cluster, running DRBD on LVM, using pacemaker to
manage those resources, plus samba.  Things are almost
working how I want them, except for split-brain handling and fencing.

As things are set up, if I log into either of the nodes and issue
`crm node standby` or `/etc/rc.d/rc.corosync stop`, all of the resources
will be moved properly to the other node.

However, I started having problems when I began unplugging the network
cables.  For instance, say node1 is master; I unplug it, all moves to
node2.  Plugging node1 back in, split-brain is detected, and node1
stops.  Node2 remains master, which is fine, as I can resolve the
split-brain by hand.

But say node2 is master, same scenario, resources move to node1.
Plugging node2 back in, split-brain is detected, but for some reason node2 is
made master, and resources move back to it!  This is bad.  If changes were
written to disk on node1 in the interim, and then node2 becomes active and
changes are written to node2, I have a real mess of new data spread across
two nodes.

I want resources to STAY where they land when a node fails, as in
the case of node1 dying.  I've tested this multiple times, and it's
always the same.  Node1 is master, it dies, resources stay on node2.
Node2 is master, it dies, resources DO NOT stay on node1 when node2
comes back on-line.

I've tried to enable fencing and STONITH, but my knowledge is pretty
limited and I'm having trouble finding documentation.  Can anyone advise?

Here is the cluster configuration (cib.xml, shown in crm shell syntax):

node agate \
	attributes standby="off"
node quartz \
	attributes standby="off"
primitive MineralIP ocf:heartbeat:IPaddr2 \
	params ip="192.168.10.13" \
	op monitor interval="30s"
primitive drbd_r0 ocf:linbit:drbd \
	params drbd_resource="r0" \
	op monitor interval="60s"
primitive drbd_r0_fs ocf:heartbeat:Filesystem \
	params device="/dev/drbd0" directory="/home" fstype="ext3"
primitive samba ocf:heartbeat:samba \
	params smbd_enabled="1" nmbd_enabled="1" winbindd_enabled="0" smbd_bin="/usr/sbin/smbd" nmbd_bin="/usr/sbin/nmbd" smbd_pidfile="/var/run/smbd.pid" nmbd_pidfile="/var/run/nmbd.pid" testparm_bin="/usr/bin/testparm" samba_config="/etc/samba/smb.conf" \
	meta target-role="Started"
primitive st-agate stonith:external/ipmi \
	params hostname="agate" ipaddr="192.168.10.252" userid="admin" passwd="password"
primitive st-quartz stonith:external/ipmi \
	params hostname="quartz" ipaddr="192.168.10.29" userid="admin" passwd="password"
group mineral MineralIP drbd_r0_fs samba \
	meta target-role="Started"
ms ms_drbd_r0 drbd_r0 \
	meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
location l-st-agate st-agate -inf: agate
location l-st-quartz st-quartz -inf: quartz
colocation mineral-on-drbd_fs inf: mineral ms_drbd_r0:Master
order mineral-after-drbd inf: ms_drbd_r0:promote mineral:start
property $id="cib-bootstrap-options" \
	dc-version="1.1.1-b9b672590e79770afb63b9b455400d92fb6b5d9e" \
	cluster-infrastructure="openais" \
	expected-quorum-votes="2" \
	stonith-enabled="false" \
	no-quorum-policy="ignore" \
	stonith-action="poweroff"
rsc_defaults $id="rsc-options" \
	resource-stickiness="100"

/etc/drbd.conf:

global {
  usage-count yes;
}
common {
  protocol b;
  syncer { rate 50M; }
}
resource r0 {
  disk {
    fencing resource-only;
  }
  handlers {
    fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
    after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
  }
  on quartz {
    device    /dev/drbd0;
    disk      /dev/terra/home;
    address   192.168.10.29:7788;
    meta-disk internal;
  }
  on agate {
    device    /dev/drbd0;
    disk      /dev/terra/home;
    address   192.168.10.252:7788;
    meta-disk internal;
  }
}