Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.
Hi,

Here is a situation I would like to recover from, either automatically (by the cluster) or manually (by the admin):

1. A DRBD resource runs as Primary on node 1.
2. All nodes are shut down, in an order that does not cause a failover of the resources.
3. Node 2, which was Secondary prior to the shutdown, is started on its own.

As we know, DRBD won't let the cluster bring up the DRBD resource on node 2, because its data is marked Outdated. What would be the correct way to recover from this situation? The confs are at the bottom, followed by a rough sketch of the manual recovery I have in mind.

Regards,
Pavlos

node $id="b8ad13a6-8a6e-4304-a4a1-8f69fa735100" node-02
node $id="d5557037-cf8f-49b7-95f5-c264927a0c76" node-01
node $id="e5195d6b-ed14-4bb3-92d3-9105543f9251" node-03
primitive drbd_01 ocf:linbit:drbd \
        params drbd_resource="drbd_pbx_service_1" \
        op monitor interval="30s"
primitive drbd_02 ocf:linbit:drbd \
        params drbd_resource="drbd_pbx_service_2" \
        op monitor interval="30s"
primitive fs_01 ocf:heartbeat:Filesystem \
        params device="/dev/drbd1" directory="/pbx_service_01" fstype="ext3" \
        meta migration-threshold="3" failure-timeout="60" \
        op monitor interval="20s" timeout="40s" OCF_CHECK_LEVEL="20" \
        op start interval="0" timeout="60s" \
        op stop interval="0" timeout="60s"
primitive fs_02 ocf:heartbeat:Filesystem \
        params device="/dev/drbd2" directory="/pbx_service_02" fstype="ext3" \
        meta migration-threshold="3" failure-timeout="60" \
        op monitor interval="20s" timeout="40s" OCF_CHECK_LEVEL="20" \
        op start interval="0" timeout="60s" \
        op stop interval="0" timeout="60s"
primitive ip_01 ocf:heartbeat:IPaddr2 \
        params ip="10.10.10.10" cidr_netmask="25" broadcast="10.10.10.127" \
        meta failure-timeout="120" migration-threshold="3" \
        op monitor interval="5s"
primitive ip_02 ocf:heartbeat:IPaddr2 \
        params ip="10.10.10.11" cidr_netmask="25" broadcast="10.10.10.127" \
        op monitor interval="5s"
primitive pbx_01 ocf:heartbeat:Dummy \
        params state="/pbx_service_01/Dummy.state" \
        meta failure-timeout="60" migration-threshold="3" \
        op monitor interval="20s" timeout="40s"
primitive pbx_02 ocf:heartbeat:Dummy \
        params state="/pbx_service_02/Dummy.state" \
        meta failure-timeout="60" migration-threshold="3"
group pbx_service_01 ip_01 fs_01 pbx_01 \
        meta target-role="Started"
group pbx_service_02 ip_02 fs_02 pbx_02 \
        meta target-role="Started"
ms ms-drbd_01 drbd_01 \
        meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
ms ms-drbd_02 drbd_02 \
        meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" target-role="Started"
location PrimaryNode-drbd_01 ms-drbd_01 100: node-01
location PrimaryNode-drbd_02 ms-drbd_02 100: node-02
location PrimaryNode-pbx_service_01 pbx_service_01 200: node-01
location PrimaryNode-pbx_service_02 pbx_service_02 200: node-02
location SecondaryNode-drbd_01 ms-drbd_01 0: node-03
location SecondaryNode-drbd_02 ms-drbd_02 0: node-03
location SecondaryNode-pbx_service_01 pbx_service_01 10: node-03
location SecondaryNode-pbx_service_02 pbx_service_02 10: node-03
colocation fs_01-on-drbd_01 inf: fs_01 ms-drbd_01:Master
colocation fs_02-on-drbd_02 inf: fs_02 ms-drbd_02:Master
colocation pbx_01-with-fs_01 inf: pbx_01 fs_01
colocation pbx_01-with-ip_01 inf: pbx_01 ip_01
colocation pbx_02-with-fs_02 inf: pbx_02 fs_02
colocation pbx_02-with-ip_02 inf: pbx_02 ip_02
order fs_01-after-drbd_01 inf: ms-drbd_01:promote fs_01:start
order fs_02-after-drbd_02 inf: ms-drbd_02:promote fs_02:start
order pbx_01-after-fs_01 inf: fs_01 pbx_01
order pbx_01-after-ip_01 inf: ip_01 pbx_01
order pbx_02-after-fs_02 inf: fs_02 pbx_02
order pbx_02-after-ip_02 inf: ip_02 pbx_02
property $id="cib-bootstrap-options" \ dc-version="1.0.9-89bd754939df5150de7cd76835f98fe90851b677" \ cluster-infrastructure="Heartbeat" \ stonith-enabled="false" \ symmetric-cluster="false" \ last-lrm-refresh="1285323745" rsc_defaults $id="rsc-options" \ resource-stickiness="1000" [root at node-02 ~]# cat /etc/drbd.conf # # please have a a look at the example configuration file in # /usr/share/doc/drbd83/drbd.conf # global { usage-count yes; } common { protocol C; syncer { csums-alg sha1; verify-alg sha1; rate 10M; } net { data-integrity-alg sha1; max-buffers 20480; max-epoch-size 16384; } disk { on-io-error detach; ### Only when DRBD is under cluster ### fencing resource-only; ### --- ### } startup { wfc-timeout 60; degr-wfc-timeout 30; outdated-wfc-timeout 15; } ### Only when DRBD is under cluster ### handlers { split-brain "/usr/lib/drbd/notify-split-brain.sh root"; fence-peer "/usr/lib/drbd/crm-fence-peer.sh"; after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh"; } ### --- ### } resource drbd_pbx_service_1 { on node-01 { device /dev/drbd1; disk /dev/sdd1; address 10.10.10.129:7789; meta-disk internal; } on node-03 { device /dev/drbd1; disk /dev/sdd1; address 10.10.10.131:7789; meta-disk internal; } } resource drbd_pbx_service_2 { on node-02 { device /dev/drbd2; disk /dev/sdb1; address 10.10.10.130:7790; meta-disk internal; } on node-03 { device /dev/drbd2; disk /dev/sdc1; address 10.10.10.131:7790; meta-disk internal; } } -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.linbit.com/pipermail/drbd-user/attachments/20100924/c6ce98c8/attachment.htm>