Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.
Hi, I have a two-node cluster on SLES 11 SP1 with the latest patches, running Pacemaker, dual-primary DRBD and Xen. Here's the configuration:

- drbd.conf:

global {
    usage-count yes;
}
common {
    protocol C;
    disk {
        on-io-error detach;
        fencing resource-only;
    }
    syncer {
        rate 1G;
        al-extents 3389;
    }
    net {
        allow-two-primaries;   # Enable this *after* initial testing
        cram-hmac-alg sha1;
        shared-secret "a6a0680c40bca2439dbe48343ddddcf4";
        after-sb-0pri discard-zero-changes;
        after-sb-1pri discard-secondary;
        after-sb-2pri disconnect;
    }
    startup {
        become-primary-on both;
    }
    handlers {
        fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
        after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
    }
}
resource vmsvn {
    device    /dev/drbd0;
    disk      /dev/sdb;
    meta-disk internal;
    on xm01 { address 100.0.0.1:7788; }
    on xm02 { address 100.0.0.2:7788; }
}
resource srvsvn1 {
    protocol  C;
    device    /dev/drbd1;
    disk      /dev/sdc;
    meta-disk internal;
    on xm01 { address 100.0.0.1:7789; }
    on xm02 { address 100.0.0.2:7789; }
}
resource srvsvn2 {
    protocol  C;
    device    /dev/drbd2;
    disk      /dev/sdd;
    meta-disk internal;
    on xm01 { address 100.0.0.1:7790; }
    on xm02 { address 100.0.0.2:7790; }
}
resource vmconfig {
    protocol  C;
    device    /dev/drbd3;
    meta-disk internal;
    on xm01 {
        address 100.0.0.1:7791;
        disk    /dev/vg_xm01/lv_xm01_vmconfig;
    }
    on xm02 {
        address 100.0.0.2:7791;
        disk    /dev/vg_xm02/lv_xm02_vmconfig;
    }
}
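With fencing resource-only and these two handlers, DRBD is supposed to call crm-fence-peer.sh when the replication link breaks and crm-unfence-peer.sh once the resync to the returning peer has finished. A quick way to confirm that the handlers actually fire is to watch the syslog and the CIB while the link goes down and comes back; a minimal sketch, assuming the default syslog target /var/log/messages on SLES 11:

    # did DRBD invoke the fence/unfence helpers? (log path is an assumption)
    grep -E 'helper command|crm-fence-peer|crm-unfence-peer' /var/log/messages

    # which handler-generated fence constraints are currently in the CIB?
    crm configure show | grep drbd-fence-by-handler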
timeout="100" \ meta migration-threshold="10" failure-timeout="600" primitive vg_svn ocf:heartbeat:LVM \ params volgrpname="vg_svn" primitive vmconfig ocf:linbit:drbd \ operations $id="vmconfig-operations" \ op monitor interval="20" role="Master" timeout="120" \ op start interval="0" timeout="240" \ op promote interval="0" timeout="90" \ op demote interval="0" timeout="90" \ op stop interval="0" timeout="100" \ params drbd_resource="vmconfig" \ meta migration-threshold="10" failure-timeout="600" primitive vmconfig-pri ocf:heartbeat:Filesystem \ operations $id="vmconfig-pri-operations" \ op monitor interval="20" timeout="40" \ params device="/dev/drbd3" directory="/vmconfig" fstype="ocfs2" primitive vmsvn-drbd ocf:linbit:drbd \ operations $id="vmsvn-drbd-operations" \ op monitor interval="20" role="Master" timeout="20" \ op monitor interval="30" role="Slave" timeout="20" \ op start interval="0" timeout="240" \ op promote interval="0" timeout="90" \ op demote interval="0" timeout="90" \ op stop interval="0" timeout="100" \ params drbd_resource="vmsvn" \ meta is-managed="true" migration-threshold="10" failure-timeout="600" group init-group dlm o2cb clvm group operaciones-group vg_svn ms ms_drbd_srvsvn1 srvsvn1-drbd \ meta master-max="2" notify="true" target-role="Started" priority="6" ms ms_drbd_srvsvn2 srvsvn2-drbd \ meta master-max="2" notify="true" target-role="Started" priority="5" ms ms_drbd_vmconfig vmconfig \ meta master-max="2" clone-max="2" notify="true" priority="8" target-role="Started" ms ms_drbd_vmsvn vmsvn-drbd \ meta resource-stickiness="100" master-max="2" notify="true" target-role="Started" priority="7" clone init-clone init-group \ meta interleave="true" target-role="Started" is-managed="true" priority="4" clone operaciones-clone operaciones-group \ meta target-role="Started" is-managed="true" priority="2" interleave="true" clone vmconfig-clone vmconfig-pri \ meta target-role="Started" priority="3" is-managed="true" location drbd-fence-by-handler-ms_drbd_srvsvn1 ms_drbd_srvsvn1 \ rule $id="drbd-fence-by-handler-rule-ms_drbd_srvsvn1" $role="Master" -inf: #uname ne xm01 location drbd-fence-by-handler-ms_drbd_srvsvn2 ms_drbd_srvsvn2 \ rule $id="drbd-fence-by-handler-rule-ms_drbd_srvsvn2" $role="Master" -inf: #uname ne xm01 location drbd-fence-by-handler-ms_drbd_vmconfig ms_drbd_vmconfig \ rule $id="drbd-fence-by-handler-rule-ms_drbd_vmconfig" $role="Master" -inf: #uname ne xm01 location drbd-fence-by-handler-ms_drbd_vmsvn ms_drbd_vmsvn \ rule $id="drbd-fence-by-handler-rule-ms_drbd_vmsvn" $role="Master" -inf: #uname ne xm01 location location-stonith-xm01 ipmi-stonith-xm01 -inf: xm01 location location-stonith-xm02 ipmi-stonith-xm02 -inf: xm02 colocation colocacion : init-clone operaciones-clone vmconfig-clone VMSVN ms_drbd_vmconfig:Master ms_drbd_vmsvn:Master ms_drbd_srvsvn1:Master ms_drbd_srvsvn2:Master order ordenamiento : ms_drbd_vmconfig:promote ms_drbd_vmsvn:promote ms_drbd_srvsvn1:promote ms_drbd_srvsvn2:promote init-clone:start operaciones-clone:start vmconfig-clone:start VMSVN:start property $id="cib-bootstrap-options" \ dc-version="1.1.5-5bd2b9154d7d9f86d7f56fe0a74072a5a6590c60" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ no-quorum-policy="ignore" \ last-lrm-refresh="1330538418" ======================================= Now, according to this page: <http://www.drbd.org/users-guide-8.3/s-pacemaker-fencing.html>http://www.drbd.org/users-guide-8.3/s-pacemaker-fencing.html the last paragraph, says: Thus, if the DRBD replication link becomes disconnected, 
Now, according to this page:
http://www.drbd.org/users-guide-8.3/s-pacemaker-fencing.html
the last paragraph says:

"Thus, if the DRBD replication link becomes disconnected, the crm-fence-peer.sh script contacts the cluster manager, determines the Pacemaker Master/Slave resource associated with this DRBD resource, and ensures that the Master/Slave resource no longer gets promoted on any node other than the currently active one. Conversely, when the connection is re-established and DRBD completes its synchronization process, then that constraint is removed and the cluster manager is free to promote the resource on any node again."

Unfortunately, that is not happening in my configuration, and I don't understand why. Here's what I'm doing:

1) rcnetwork stop on XM01
2) XM02 STONITHs XM01 (so far, so good)
3) The VM migrates to XM02 (1 minute of downtime, which is more than fine)
4) XM01 comes back
5) The DRBD resources appear as Master/Slave (on dual-primary!)
6) I can see some constraints generated by the handler configured in drbd.conf
7) xm02:~ # rcdrbd status
   drbd driver loaded OK; device status:
   version: 8.3.11 (api:88/proto:86-96)
   GIT-hash: 0de839cee13a4160eed6037c4bddd066645e23c5 build by phil at fat-tyre, 2011-06-29 11:37:11
   m:res       cs         ro                 ds                 p  mounted  fstype
   0:vmsvn     Connected  Secondary/Primary  UpToDate/UpToDate  C
   1:srvsvn1   Connected  Secondary/Primary  UpToDate/UpToDate  C
   2:srvsvn2   Connected  Secondary/Primary  UpToDate/UpToDate  C
   3:vmconfig  Connected  Secondary/Primary  UpToDate/UpToDate  C
   They are all UpToDate!
8) The constraints generated by the handler are still there. I have waited a lifetime and they are still there... (a way to check whether the unfence handler ever ran is sketched after this message)
9) When I manually remove the constraints, the VM goes down for a little while and the DRBD resources come back as Master/Master.

Is there anything wrong in my configuration? How can both nodes become Master again on fail-back?

Thanks!
Daniel
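Regarding step 8: crm-unfence-peer.sh is hooked in as the after-resync-target handler, so whether it runs at all depends on a resync actually starting and finishing on the node that was fenced. A rough way to check that after the reconnect is sketched below; the log path and message patterns are assumptions for DRBD 8.3 on SLES 11:

    # did a resync run, and were the DRBD helpers called after the reconnect?
    # (log location and wording are assumptions for DRBD 8.3 / SLES 11)
    grep -iE 'resync|after-resync-target|crm-unfence-peer' /var/log/messages

    # connection and disk state per resource on the node that was fenced
    for r in vmsvn srvsvn1 srvsvn2 vmconfig; do
        echo "$r: $(drbdadm cstate $r) $(drbdadm dstate $r)"
    done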