[DRBD-user] Pulled the Power Plug

sidimustafa reggie at softhome.net
Thu Nov 2 05:12:13 CET 2006

Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.


Here's the entire Process that was tested:

Please Note:
The test described here concentrates on the DRBD resource zim0
(/dev/drbd3).
All I have done in this process is reproduce the problem.
There was no manual intervention where DRBD is concerned.

ConfigInformation: Server A (sto01.curvedict.info)
===========================

/etc/ha.d/ha.cf
---------------
debugfile /var/log/hadebug
logfile /var/log/halog
logfacility     local0

udpport     694
keepalive   2
deadtime   30
warntime   10
initdead     120

bcast   eth0
auto_failback   off

watchdog        /dev/watchdog

respawn hacluster /usr/lib/heartbeat/ipfail
ping 192.168.10.254 192.168.10.1 

node    sto01.curvedict.info
node    sto02.curvedict.info

serial  /dev/ttyS0
baud   115200

/etc/ha.d/haresources
---------------------
sto01.curvedict.info 192.168.10.121 drbddisk::lun0
Filesystem::/dev/drbd0::/san/lun0::jfs drbddisk::lun1
Filesystem::/dev/drbd1::/san/lun1::jfs
sto02.curvedict.info 192.168.10.122 drbddisk::zim0
Filesystem::/dev/drbd3::/opt/zimbra::jfs

/etc/drbd.conf
=============
#
#
global {
   minor-count 6;
   dialog-refresh 5; # 5 seconds
}

resource lun0 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 10M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd0;
   disk     /dev/stovg/lun0;
   address  192.168.15.160:7789;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd0;
   disk     /dev/stovg/lun0;
   address  192.168.15.162:7789;
   meta-disk  internal;
 }
}

resource lun1 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 10M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd1;
   disk     /dev/stovg/lun1;
   address  192.168.15.160:7790;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd1;
   disk     /dev/stovg/lun1;
   address  192.168.15.162:7790;
   meta-disk  internal;
 }
}

resource zim0 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 30M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd3;
   disk     /dev/stovg/zim0;
   address  192.168.15.160:7792;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd3;
   disk     /dev/stovg/zim0;
   address  192.168.15.162:7792;
   meta-disk  internal;
 }
}

resource zim1 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 30M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd4;
   disk     /dev/stovg/zim1;
   address  192.168.15.160:7793;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd4;
   disk     /dev/stovg/zim1;
   address  192.168.15.162:7793;
   meta-disk  internal;
 }
}


ConfigInformation: Server B (sto02.curvedict.info)
===========================

/etc/ha.d/ha.cf
---------------------
debugfile /var/log/hadebug
logfile /var/log/halog
logfacility     local0

udpport     694
keepalive   2
deadtime   30
warntime   10
initdead     120

bcast   eth0
auto_failback   off

watchdog        /dev/watchdog

respawn hacluster /usr/lib/heartbeat/ipfail
ping 192.168.10.254 192.168.10.1 

node    sto01.curvedict.info
node    sto02.curvedict.info

serial  /dev/ttyS0
baud   115200

/etc/ha.d/haresources
---------------------
sto01.curvedict.info 192.168.10.121 drbddisk::lun0
Filesystem::/dev/drbd0::/san/lun0::jfs drbddisk::lun1
Filesystem::/dev/drbd1::/san/lun1::jfs
sto02.curvedict.info 192.168.10.122 drbddisk::zim0
Filesystem::/dev/drbd3::/opt/zimbra::jfs

/etc/drbd.conf
=============
#
#
global {
   minor-count 6;
   dialog-refresh 5; # 5 seconds
}

resource lun0 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 10M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd0;
   disk     /dev/stovg/lun0;
   address  192.168.15.160:7789;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd0;
   disk     /dev/stovg/lun0;
   address  192.168.15.162:7789;
   meta-disk  internal;
 }
}

resource lun1 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 10M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd1;
   disk     /dev/stovg/lun1;
   address  192.168.15.160:7790;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd1;
   disk     /dev/stovg/lun1;
   address  192.168.15.162:7790;
   meta-disk  internal;
 }
}

resource zim0 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 30M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd3;
   disk     /dev/stovg/zim0;
   address  192.168.15.160:7792;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd3;
   disk     /dev/stovg/zim0;
   address  192.168.15.162:7792;
   meta-disk  internal;
 }
}

resource zim1 {
 protocol  C;

 incon-degr-cmd "echo '!DRBD! pri on incon-degr' | wall ; sleep 60 ; halt
-f";
  
 startup {
   # wfc-timeout  0;
   degr-wfc-timeout 120;    # 2 minutes.
 }

 disk {
   on-io-error   detach;
 }

net {
 }

 syncer {
   rate 30M;
   group 1;
   al-extents 257;
 }

 on sto01.curvedict.info {
   device   /dev/drbd4;
   disk     /dev/stovg/zim1;
   address  192.168.15.160:7793;
   meta-disk  internal;
 }

 on sto02.curvedict.info {
   device   /dev/drbd4;
   disk     /dev/stovg/zim1;
   address  192.168.15.162:7793;
   meta-disk  internal;
 }
}


Server A --> Before plug is pulled (sto01.curvedict.info)
--------------------------------
version: 0.7.21 (api:79/proto:74)
SVN Revision: 2326 build by buildsvn at build-i386, 2006-10-07 05:13:10
 0: cs:Connected st:Primary/Secondary ld:Consistent
    ns:52 nr:0 dw:12 dr:88 al:0 bm:2 lo:0 pe:0 ua:0 ap:0
 1: cs:Connected st:Primary/Secondary ld:Consistent
    ns:48 nr:0 dw:12 dr:84 al:0 bm:2 lo:0 pe:0 ua:0 ap:0
 2: cs:Unconfigured
 3: cs:Connected st:Secondary/Primary ld:Consistent
    ns:88 nr:68 dw:116 dr:92 al:0 bm:14 lo:0 pe:0 ua:0 ap:0
 4: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 5: cs:Unconfigured
 
 Server B --> Before plug is pulled (sto02.curvedict.info)
-------------------------------- 
 version: 0.7.21 (api:79/proto:74)
SVN Revision: 2326 build by buildsvn at build-i386, 2006-10-07 05:13:10
 0: cs:Connected st:Secondary/Primary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 1: cs:Connected st:Secondary/Primary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 2: cs:Unconfigured
 3: cs:Connected st:Primary/Secondary ld:Consistent
    ns:12 nr:36 dw:48 dr:48 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 4: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 5: cs:Unconfigured
 
 Status: Server A -- after plug is pulled on server B (sto01.curvedict.info)
 ----------------------------------------------------
cat /proc/drbd
--------------
version: 0.7.21 (api:79/proto:74)
SVN Revision: 2326 build by buildsvn at build-i386, 2006-10-07 05:13:10
 0: cs:WFConnection st:Primary/Unknown ld:Consistent
    ns:52 nr:0 dw:12 dr:88 al:0 bm:2 lo:0 pe:0 ua:0 ap:0
 1: cs:WFConnection st:Primary/Unknown ld:Consistent
    ns:48 nr:0 dw:12 dr:84 al:0 bm:2 lo:0 pe:0 ua:0 ap:0
 2: cs:Unconfigured
 3: cs:WFConnection st:Secondary/Unknown ld:Consistent
    ns:88 nr:68 dw:116 dr:92 al:0 bm:14 lo:0 pe:0 ua:0 ap:0
 4: cs:WFConnection st:Secondary/Unknown ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 5: cs:Unconfigured
 
cat /var/log/halog
------------------
heartbeat: 2006/11/02_03:43:02 WARN: TTY write timeout on [/dev/ttyS0] (no
connection or bad cable? [see documentation])
heartbeat: 2006/11/02_03:43:15 WARN: node sto02.curvedict.info: is dead
heartbeat: 2006/11/02_03:43:15 info: Dead node sto02.curvedict.info gave up
resources.
heartbeat: 2006/11/02_03:43:15 info: Link sto02.curvedict.info:eth0 dead.
heartbeat: 2006/11/02_03:43:15 info: Link sto02.curvedict.info:/dev/ttyS0
dead.

cat /var/log/hadebug
--------------------
Reports Nothing.

cat /var/log/messages
---------------------
Nov  2 03:42:30 sto01 watchdog[1757]: still alive after 125730 seconds =
12573 interval(s)
Nov  2 03:42:45 sto01 watchdog[1757]: still alive after 125740 seconds =
12574 interval(s)
Nov  2 03:42:47 sto01 kernel: e100: eth1: e100_watchdog: link down
Nov  2 03:42:49 sto01 kernel: drbd1: PingAck did not arrive in time.
Nov  2 03:42:49 sto01 kernel: drbd1: drbd1_asender [4045]: cstate Connected
--> NetworkFailure
Nov  2 03:42:49 sto01 kernel: drbd1: asender terminated
Nov  2 03:42:49 sto01 kernel: drbd1: drbd1_receiver [2421]: cstate
NetworkFailure --> BrokenPipe
Nov  2 03:42:49 sto01 kernel: drbd1: short read expecting header on sock:
r=-512
Nov  2 03:42:49 sto01 kernel: drbd1: worker terminated
Nov  2 03:42:49 sto01 kernel: drbd1: drbd1_receiver [2421]: cstate
BrokenPipe --> Unconnected
Nov  2 03:42:49 sto01 kernel: drbd1: Connection lost.
Nov  2 03:42:49 sto01 kernel: drbd1: drbd1_receiver [2421]: cstate
Unconnected --> WFConnection
Nov  2 03:42:50 sto01 kernel: drbd0: PingAck did not arrive in time.
Nov  2 03:42:50 sto01 kernel: drbd0: drbd0_asender [4048]: cstate Connected
--> NetworkFailure
Nov  2 03:42:50 sto01 kernel: drbd0: asender terminated
Nov  2 03:42:50 sto01 kernel: drbd4: PingAck did not arrive in time.
Nov  2 03:42:50 sto01 kernel: drbd4: drbd4_asender [4047]: cstate Connected
--> NetworkFailure
Nov  2 03:42:50 sto01 kernel: drbd4: asender terminated
Nov  2 03:42:50 sto01 kernel: drbd0: drbd0_receiver [2413]: cstate
NetworkFailure --> BrokenPipe
Nov  2 03:42:50 sto01 kernel: drbd0: short read expecting header on sock:
r=-512
Nov  2 03:42:50 sto01 kernel: drbd4: drbd4_receiver [2437]: cstate
NetworkFailure --> BrokenPipe
Nov  2 03:42:50 sto01 kernel: drbd4: short read expecting header on sock:
r=-512
Nov  2 03:42:50 sto01 kernel: drbd0: worker terminated
Nov  2 03:42:50 sto01 kernel: drbd4: worker terminated
Nov  2 03:42:50 sto01 kernel: drbd0: drbd0_receiver [2413]: cstate
BrokenPipe --> Unconnected
Nov  2 03:42:50 sto01 kernel: drbd4: drbd4_receiver [2437]: cstate
BrokenPipe --> Unconnected
Nov  2 03:42:50 sto01 kernel: drbd4: Connection lost.
Nov  2 03:42:50 sto01 kernel: drbd4: drbd4_receiver [2437]: cstate
Unconnected --> WFConnection
Nov  2 03:42:50 sto01 kernel: drbd0: Connection lost.
Nov  2 03:42:50 sto01 kernel: drbd0: drbd0_receiver [2413]: cstate
Unconnected --> WFConnection
Nov  2 03:42:56 sto01 kernel: drbd3: PingAck did not arrive in time.
Nov  2 03:42:56 sto01 kernel: drbd3: drbd3_asender [4046]: cstate Connected
--> NetworkFailure
Nov  2 03:42:56 sto01 kernel: drbd3: asender terminated
Nov  2 03:42:56 sto01 kernel: drbd3: drbd3_receiver [2429]: cstate
NetworkFailure --> BrokenPipe
Nov  2 03:42:56 sto01 kernel: drbd3: short read expecting header on sock:
r=-512
Nov  2 03:42:56 sto01 kernel: drbd3: worker terminated
Nov  2 03:42:56 sto01 kernel: drbd3: drbd3_receiver [2429]: cstate
BrokenPipe --> Unconnected
Nov  2 03:42:56 sto01 kernel: drbd3: Connection lost.
Nov  2 03:42:56 sto01 kernel: drbd3: drbd3_receiver [2429]: cstate
Unconnected --> WFConnection
Nov  2 03:43:00 sto01 watchdog[1757]: still alive after 125750 seconds =
12575 interval(s)
Nov  2 03:43:02 sto01 heartbeat[2565]: WARN: TTY write timeout on
[/dev/ttyS0] (no connection or bad cable? [see documentation])
Nov  2 03:43:15 sto01 heartbeat[2548]: WARN: node sto02.curvedict.info: is
dead
Nov  2 03:43:15 sto01 heartbeat[2548]: info: Dead node sto02.curvedict.info
gave up resources.
Nov  2 03:43:15 sto01 heartbeat[2548]: info: Link sto02.curvedict.info:eth0
dead.
Nov  2 03:43:15 sto01 heartbeat[2548]: info: Link
sto02.curvedict.info:/dev/ttyS0 dead.
Nov  2 03:43:15 sto01 ipfail[3128]: info: Status update: Node
sto02.curvedict.info now has status dead
Nov  2 03:43:15 sto01 ipfail[3128]: info: NS: We are still alive!
Nov  2 03:43:15 sto01 watchdog[1757]: still alive after 125760 seconds =
12576 interval(s)
Nov  2 03:43:16 sto01 ipfail[3128]: info: Link Status update: Link
sto02.curvedict.info/eth0 now has status dead
Nov  2 03:43:16 sto01 ipfail[3128]: info: Asking other side for ping node
count.
Nov  2 03:43:16 sto01 ipfail[3128]: info: Checking remote count of ping
nodes.
Nov  2 03:43:16 sto01 ipfail[3128]: info: Link Status update: Link
sto02.curvedict.info//dev/ttyS0 now has status dead
Nov  2 03:43:16 sto01 ipfail[3128]: info: Asking other side for ping node
count.
Nov  2 03:43:16 sto01 ipfail[3128]: info: Checking remote count of ping
nodes.
Nov  2 03:43:30 sto01 watchdog[1757]: still alive after 125770 seconds =
12577 interval(s)
Nov  2 03:43:45 sto01 watchdog[1757]: still alive after 125780 seconds =
12578 interval(s)
Nov  2 03:44:00 sto01 watchdog[1757]: still alive after 125790 seconds =
12579 interval(s)
Nov  2 03:44:15 sto01 watchdog[1757]: still alive after 125800 seconds =
12580 interval(s)
Nov  2 03:44:19 sto01 sshd(pam_unix)[27977]: session opened for user root by
(uid=0)
Nov  2 03:44:30 sto01 watchdog[1757]: still alive after 125810 seconds =
12581 interval(s)
Nov  2 03:44:45 sto01 watchdog[1757]: still alive after 125820 seconds =
12582 interval(s)
Nov  2 03:45:00 sto01 watchdog[1757]: still alive after 125830 seconds =
12583 interval(s)

 
 
Server B is restarted:
=====================
Because Heartbeat is not configured for auto-failback, no action is
taken during startup to re-take the resource when the server is started.
The resource was expected to be handled by Server A at this point, but
that takeover never took place when the plug was pulled.

In order to reproduce the test, I will go to Server B and manually tell
it to re-take the resource; this is done only so that I can reproduce the
entire process.


Once Server B was restarted, the DRBD service did not print any messages;
it simply started and connected to Server A.

These are the logs that were generated during the boot up phase of Server B
================================================
/var/log/halog on Server A: (sto01.curvedict.info)
-----------------------------
heartbeat: 2006/11/02_04:01:29 info: Heartbeat restart on node
sto02.curvedict.info
heartbeat: 2006/11/02_04:01:29 info: Link sto02.curvedict.info:eth0 up.
heartbeat: 2006/11/02_04:01:29 info: Status update for node
sto02.curvedict.info: status up
heartbeat: 2006/11/02_04:01:29 info: Running /etc/ha.d/rc.d/status status
heartbeat: 2006/11/02_04:01:29 info: Status update for node
sto02.curvedict.info: status active
heartbeat: 2006/11/02_04:01:29 info: Running /etc/ha.d/rc.d/status status
heartbeat: 2006/11/02_04:01:30 info: remote resource transition completed.
heartbeat: 2006/11/02_04:01:30 info: Link sto02.curvedict.info:/dev/ttyS0
up.


/var/log/hadebug on Server A: (sto01.curvedict.info)
-----------------------------
heartbeat: 2006/11/02_04:01:29 debug: notify_world: setting SIGCHLD Handler
to SIG_DFL
heartbeat: 2006/11/02_04:01:29 debug: notify_world: setting SIGCHLD Handler
to SIG_DFL


cat /proc/drbd on Server A (sto01.curvedict.info)
--------------------------
version: 0.7.21 (api:79/proto:74)
SVN Revision: 2326 build by buildsvn at build-i386, 2006-10-07 05:13:10
 0: cs:Connected st:Primary/Secondary ld:Consistent
    ns:52 nr:0 dw:12 dr:88 al:0 bm:2 lo:0 pe:0 ua:0 ap:0
 1: cs:Connected st:Primary/Secondary ld:Consistent
    ns:48 nr:0 dw:12 dr:84 al:0 bm:2 lo:0 pe:0 ua:0 ap:0
 2: cs:Unconfigured
 3: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:88 nr:8260 dw:8308 dr:92 al:0 bm:16 lo:0 pe:0 ua:0 ap:0
 4: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 5: cs:Unconfigured
 

/var/log/messages on Server A (sto01.curvedict.info)
-----------------------------
Nov  2 04:00:15 sto01 watchdog[1757]: still alive after 126440 seconds =
12644 interval(s)
Nov  2 04:00:29 sto01 kernel: e100: eth1: e100_watchdog: link up, 100Mbps,
full-duplex
Nov  2 04:00:30 sto01 watchdog[1757]: still alive after 126450 seconds =
12645 interval(s)
Nov  2 04:00:45 sto01 watchdog[1757]: still alive after 126460 seconds =
12646 interval(s)
Nov  2 04:01:00 sto01 watchdog[1757]: still alive after 126470 seconds =
12647 interval(s)
Nov  2 04:01:15 sto01 watchdog[1757]: still alive after 126480 seconds =
12648 interval(s)
Nov  2 04:01:22 sto01 kernel: drbd0: drbd0_receiver [2413]: cstate
WFConnection --> WFReportParams
Nov  2 04:01:22 sto01 kernel: drbd0: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:01:22 sto01 kernel: drbd0: Connection established.
Nov  2 04:01:22 sto01 kernel: drbd0: I am(P):
1:00000003:00000001:0000000c:00000001:10
Nov  2 04:01:22 sto01 kernel: drbd0: Peer(S):
1:00000003:00000001:0000000b:00000001:00
Nov  2 04:01:22 sto01 kernel: drbd0: drbd0_receiver [2413]: cstate
WFReportParams --> WFBitMapS
Nov  2 04:01:22 sto01 kernel: drbd0: Primary/Unknown --> Primary/Secondary
Nov  2 04:01:22 sto01 kernel: drbd0: drbd0_receiver [2413]: cstate WFBitMapS
--> SyncSource
Nov  2 04:01:22 sto01 kernel: drbd0: Resync started as SyncSource (need to
sync 0 KB [0 bits set]).
Nov  2 04:01:22 sto01 kernel: drbd0: Resync done (total 1 sec; paused 0 sec;
0 K/sec)
Nov  2 04:01:22 sto01 kernel: drbd0: drbd0_receiver [2413]: cstate
SyncSource --> Connected
Nov  2 04:01:23 sto01 kernel: drbd1: drbd1_receiver [2421]: cstate
WFConnection --> WFReportParams
Nov  2 04:01:23 sto01 kernel: drbd1: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:01:23 sto01 kernel: drbd1: Connection established.
Nov  2 04:01:23 sto01 kernel: drbd1: I am(P):
1:00000003:00000001:0000000c:00000001:10
Nov  2 04:01:23 sto01 kernel: drbd1: Peer(S):
1:00000003:00000001:0000000b:00000001:00
Nov  2 04:01:23 sto01 kernel: drbd1: drbd1_receiver [2421]: cstate
WFReportParams --> WFBitMapS
Nov  2 04:01:23 sto01 kernel: drbd1: Primary/Unknown --> Primary/Secondary
Nov  2 04:01:23 sto01 kernel: drbd1: drbd1_receiver [2421]: cstate WFBitMapS
--> SyncSource
Nov  2 04:01:23 sto01 kernel: drbd1: Resync started as SyncSource (need to
sync 0 KB [0 bits set]).
Nov  2 04:01:23 sto01 kernel: drbd1: Resync done (total 1 sec; paused 0 sec;
0 K/sec)
Nov  2 04:01:23 sto01 kernel: drbd1: drbd1_receiver [2421]: cstate
SyncSource --> Connected
Nov  2 04:01:24 sto01 kernel: drbd3: drbd3_receiver [2429]: cstate
WFConnection --> WFReportParams
Nov  2 04:01:24 sto01 kernel: drbd3: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:01:24 sto01 kernel: drbd3: Connection established.
Nov  2 04:01:24 sto01 kernel: drbd3: I am(S):
1:00000003:00000001:0000000b:00000002:01
Nov  2 04:01:24 sto01 kernel: drbd3: Peer(S):
1:00000003:00000001:0000000b:00000002:10
Nov  2 04:01:24 sto01 kernel: drbd3: drbd3_receiver [2429]: cstate
WFReportParams --> WFBitMapT
Nov  2 04:01:24 sto01 kernel: drbd3: Secondary/Unknown -->
Secondary/Secondary
Nov  2 04:01:24 sto01 kernel: drbd3: drbd3_receiver [2429]: cstate WFBitMapT
--> SyncTarget
Nov  2 04:01:24 sto01 kernel: drbd3: Resync started as SyncTarget (need to
sync 8192 KB [2048 bits set]).
Nov  2 04:01:24 sto01 kernel: drbd4: drbd4_receiver [2437]: cstate
WFConnection --> WFReportParams
Nov  2 04:01:24 sto01 kernel: drbd4: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:01:24 sto01 kernel: drbd4: Connection established.
Nov  2 04:01:24 sto01 kernel: drbd4: I am(S):
1:00000002:00000001:00000002:00000001:01
Nov  2 04:01:24 sto01 kernel: drbd4: Peer(S):
1:00000002:00000001:00000002:00000001:00
Nov  2 04:01:24 sto01 kernel: drbd4: drbd4_receiver [2437]: cstate
WFReportParams --> Connected
Nov  2 04:01:24 sto01 kernel: drbd4: Secondary/Unknown -->
Secondary/Secondary
Nov  2 04:01:25 sto01 kernel: drbd3: Resync done (total 1 sec; paused 0 sec;
8192 K/sec)
Nov  2 04:01:25 sto01 kernel: drbd3: drbd3_worker [27965]: cstate SyncTarget
--> Connected
Nov  2 04:01:29 sto01 heartbeat[2548]: info: Heartbeat restart on node
sto02.curvedict.info
Nov  2 04:01:29 sto01 heartbeat[2548]: info: Link sto02.curvedict.info:eth0
up.
Nov  2 04:01:29 sto01 heartbeat[2548]: info: Status update for node
sto02.curvedict.info: status up
Nov  2 04:01:29 sto01 ipfail[3128]: info: Link Status update: Link
sto02.curvedict.info/eth0 now has status up
Nov  2 04:01:29 sto01 ipfail[3128]: info: Status update: Node
sto02.curvedict.info now has status up
Nov  2 04:01:29 sto01 heartbeat: info: Running /etc/ha.d/rc.d/status status
Nov  2 04:01:29 sto01 heartbeat[2548]: info: Status update for node
sto02.curvedict.info: status active
Nov  2 04:01:29 sto01 ipfail[3128]: info: Status update: Node
sto02.curvedict.info now has status active
Nov  2 04:01:29 sto01 heartbeat: info: Running /etc/ha.d/rc.d/status status
Nov  2 04:01:30 sto01 heartbeat[2548]: info: remote resource transition
completed.
Nov  2 04:01:30 sto01 heartbeat[2548]: info: Link
sto02.curvedict.info:/dev/ttyS0 up.
Nov  2 04:01:30 sto01 ipfail[3128]: info: Link Status update: Link
sto02.curvedict.info//dev/ttyS0 now has status up
Nov  2 04:01:30 sto01 ipfail[3128]: info: Asking other side for ping node
count.
Nov  2 04:01:30 sto01 ipfail[3128]: info: No giveup timer to abort.
Nov  2 04:01:30 sto01 watchdog[1757]: still alive after 126490 seconds =
12649 interval(s)
Nov  2 04:01:45 sto01 watchdog[1757]: still alive after 126500 seconds =
12650 interval(s)


-----


/var/log/halog on Server B: (sto02.curvedict.info)
-----------------------------
heartbeat: 2006/11/02_04:03:35 info: **************************
heartbeat: 2006/11/02_04:03:35 info: Configuration validated. Starting
heartbeat 1.2.3.cvs.20050927
heartbeat: 2006/11/02_04:03:35 info: heartbeat: version 1.2.3.cvs.20050927
heartbeat: 2006/11/02_04:03:36 info: Heartbeat generation: 13
heartbeat: 2006/11/02_04:03:36 info: UDP Broadcast heartbeat started on port
694 (694) interface eth0
heartbeat: 2006/11/02_04:03:36 info: ping heartbeat started.
heartbeat: 2006/11/02_04:03:36 info: ping heartbeat started.
heartbeat: 2006/11/02_04:03:36 info: Starting serial heartbeat on tty
/dev/ttyS0 (19200 baud)
heartbeat: 2006/11/02_04:03:36 notice: Using watchdog device: /dev/watchdog
heartbeat: 2006/11/02_04:03:36 info: pid 2591 locked in memory.
heartbeat: 2006/11/02_04:03:36 info: Local status now set to: 'up'
heartbeat: 2006/11/02_04:03:37 info: pid 2601 locked in memory.
heartbeat: 2006/11/02_04:03:37 info: pid 2602 locked in memory.
heartbeat: 2006/11/02_04:03:37 info: pid 2603 locked in memory.
heartbeat: 2006/11/02_04:03:37 info: Link sto01.curvedict.info:eth0 up.
heartbeat: 2006/11/02_04:03:37 info: Status update for node
sto01.curvedict.info: status active
heartbeat: 2006/11/02_04:03:37 info: Link sto02.curvedict.info:eth0 up.
heartbeat: 2006/11/02_04:03:37 info: pid 2604 locked in memory.
heartbeat: 2006/11/02_04:03:37 info: pid 2606 locked in memory.
heartbeat: 2006/11/02_04:03:37 info: pid 2607 locked in memory.
heartbeat: 2006/11/02_04:03:37 info: Status update for node 192.168.10.254:
status ping
heartbeat: 2006/11/02_04:03:37 info: Link 192.168.10.1:192.168.10.1 up.
heartbeat: 2006/11/02_04:03:37 info: Status update for node 192.168.10.1:
status ping
heartbeat: 2006/11/02_04:03:37 info: Local status now set to: 'active'
heartbeat: 2006/11/02_04:03:37 info: pid 2609 locked in memory.
heartbeat: 2006/11/02_04:03:38 info: pid 2608 locked in memory.
heartbeat: 2006/11/02_04:03:38 info: Starting child client
"/usr/lib/heartbeat/ipfail" (1001,104)
heartbeat: 2006/11/02_04:03:38 info: Link sto01.curvedict.info:/dev/ttyS0
up.
heartbeat: 2006/11/02_04:03:38 info: remote resource transition completed.
heartbeat: 2006/11/02_04:03:38 info: remote resource transition completed.
heartbeat: 2006/11/02_04:03:38 info: Local Resource acquisition completed.
(none)
heartbeat: 2006/11/02_04:03:38 info: Initial resource acquisition complete
(T_RESOURCES(them))
heartbeat: 2006/11/02_04:03:38 info: Starting "/usr/lib/heartbeat/ipfail" as
uid 1001  gid 104 (pid 2993)
heartbeat: 2006/11/02_04:03:38 info: pid 2605 locked in memory.
heartbeat: 2006/11/02_04:03:38 info: Running /etc/ha.d/rc.d/status status
heartbeat: 2006/11/02_04:03:38 info: Link 192.168.10.254:192.168.10.254 up.


/var/log/hadebug on Server B: (sto02.curvedict.info)
-----------------------------
heartbeat: 2006/11/02_04:03:38 debug: notify_world: setting SIGCHLD Handler
to SIG_DFL


cat /proc/drbd on Server B (sto02.curvedict.info)
--------------------------
version: 0.7.21 (api:79/proto:74)
SVN Revision: 2326 build by buildsvn at build-i386, 2006-10-07 05:13:10
 0: cs:Connected st:Secondary/Primary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 1: cs:Connected st:Secondary/Primary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 2: cs:Unconfigured
 3: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:8192 nr:0 dw:0 dr:8192 al:0 bm:4 lo:0 pe:0 ua:0 ap:0
 4: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 5: cs:Unconfigured

/var/log/messages on Server B  (sto02.curvedict.info)
-----------------------------
Nov  2 04:03:29 sto02 drbd: [ 
Nov  2 04:03:29 sto02 drbd: d0 
Nov  2 04:03:29 sto02 kernel: drbd0: resync bitmap: bits=491520 words=15360
Nov  2 04:03:29 sto02 kernel: drbd0: size = 1920 MB (1966080 KB)
Nov  2 04:03:29 sto02 kernel: drbd0: 0 KB marked out-of-sync by on disk
bit-map.
Nov  2 04:03:29 sto02 kernel: drbd0: Found 2 transactions (2 active extents)
in activity log.
Nov  2 04:03:29 sto02 kernel: drbd0: drbdsetup [2285]: cstate Unconfigured
--> StandAlone
Nov  2 04:03:29 sto02 drbd: d1 
Nov  2 04:03:29 sto02 kernel: drbd1: resync bitmap: bits=229376 words=7168
Nov  2 04:03:29 sto02 kernel: drbd1: size = 896 MB (917504 KB)
Nov  2 04:03:29 sto02 kernel: drbd1: 0 KB marked out-of-sync by on disk
bit-map.
Nov  2 04:03:29 sto02 kernel: drbd1: Found 2 transactions (2 active extents)
in activity log.
Nov  2 04:03:29 sto02 kernel: drbd1: drbdsetup [2289]: cstate Unconfigured
--> StandAlone
Nov  2 04:03:29 sto02 drbd: d2 
Nov  2 04:03:29 sto02 kernel: drbd3: resync bitmap: bits=491520 words=15360
Nov  2 04:03:29 sto02 kernel: drbd3: size = 1920 MB (1966080 KB)
Nov  2 04:03:29 sto02 kernel: drbd3: 0 KB marked out-of-sync by on disk
bit-map.
Nov  2 04:03:29 sto02 kernel: drbd3: Found 2 transactions (2 active extents)
in activity log.
Nov  2 04:03:29 sto02 kernel: drbd3: Marked additional 8192 KB as
out-of-sync based on AL.
Nov  2 04:03:29 sto02 kernel: drbd3: drbdsetup [2293]: cstate Unconfigured
--> StandAlone
Nov  2 04:03:29 sto02 drbd: d3 
Nov  2 04:03:29 sto02 kernel: drbd4: resync bitmap: bits=229376 words=7168
Nov  2 04:03:29 sto02 kernel: drbd4: size = 896 MB (917504 KB)
Nov  2 04:03:29 sto02 kernel: drbd4: 0 KB marked out-of-sync by on disk
bit-map.
Nov  2 04:03:29 sto02 kernel: drbd4: No usable activity log found.
Nov  2 04:03:29 sto02 kernel: drbd4: drbdsetup [2297]: cstate Unconfigured
--> StandAlone
Nov  2 04:03:29 sto02 drbd: s0 
Nov  2 04:03:29 sto02 drbd: s1 
Nov  2 04:03:29 sto02 drbd: s2 
Nov  2 04:03:29 sto02 drbd: s3 
Nov  2 04:03:29 sto02 drbd: n0 
Nov  2 04:03:29 sto02 kernel: drbd0: drbdsetup [2325]: cstate StandAlone -->
Unconnected
Nov  2 04:03:29 sto02 kernel: drbd0: drbd0_receiver [2326]: cstate
Unconnected --> WFConnection
Nov  2 04:03:29 sto02 drbd: n1 
Nov  2 04:03:29 sto02 kernel: drbd1: drbdsetup [2333]: cstate StandAlone -->
Unconnected
Nov  2 04:03:29 sto02 kernel: drbd1: drbd1_receiver [2334]: cstate
Unconnected --> WFConnection
Nov  2 04:03:29 sto02 drbd: n2 
Nov  2 04:03:29 sto02 kernel: drbd3: drbdsetup [2341]: cstate StandAlone -->
Unconnected
Nov  2 04:03:29 sto02 kernel: drbd3: drbd3_receiver [2342]: cstate
Unconnected --> WFConnection
Nov  2 04:03:29 sto02 drbd: n3 
Nov  2 04:03:29 sto02 kernel: drbd4: drbdsetup [2349]: cstate StandAlone -->
Unconnected
Nov  2 04:03:29 sto02 kernel: drbd4: drbd4_receiver [2350]: cstate
Unconnected --> WFConnection
Nov  2 04:03:29 sto02 drbd: ]
Nov  2 04:03:29 sto02 drbd: .
Nov  2 04:03:30 sto02 drbd: WARN: stdin/stdout is not a TTY; using
/dev/console
Nov  2 04:03:30 sto02 kernel: drbd0: drbd0_receiver [2326]: cstate
WFConnection --> WFReportParams
Nov  2 04:03:30 sto02 kernel: drbd0: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:03:30 sto02 kernel: drbd0: Connection established.
Nov  2 04:03:30 sto02 kernel: drbd0: I am(S):
1:00000003:00000001:0000000b:00000001:00
Nov  2 04:03:30 sto02 kernel: drbd0: Peer(P):
1:00000003:00000001:0000000c:00000001:10
Nov  2 04:03:30 sto02 kernel: drbd0: drbd0_receiver [2326]: cstate
WFReportParams --> WFBitMapT
Nov  2 04:03:30 sto02 kernel: drbd0: Secondary/Unknown --> Secondary/Primary
Nov  2 04:03:30 sto02 kernel: drbd0: drbd0_receiver [2326]: cstate WFBitMapT
--> SyncTarget
Nov  2 04:03:30 sto02 kernel: drbd0: Resync started as SyncTarget (need to
sync 0 KB [0 bits set]).
Nov  2 04:03:30 sto02 kernel: drbd0: Resync done (total 1 sec; paused 0 sec;
0 K/sec)
Nov  2 04:03:30 sto02 kernel: drbd0: drbd0_receiver [2326]: cstate
SyncTarget --> Connected
Nov  2 04:03:31 sto02 kernel: drbd1: drbd1_receiver [2334]: cstate
WFConnection --> WFReportParams
Nov  2 04:03:31 sto02 kernel: drbd1: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:03:31 sto02 kernel: drbd1: Connection established.
Nov  2 04:03:31 sto02 kernel: drbd1: I am(S):
1:00000003:00000001:0000000b:00000001:00
Nov  2 04:03:31 sto02 kernel: drbd1: Peer(P):
1:00000003:00000001:0000000c:00000001:10
Nov  2 04:03:31 sto02 kernel: drbd1: drbd1_receiver [2334]: cstate
WFReportParams --> WFBitMapT
Nov  2 04:03:31 sto02 kernel: drbd1: Secondary/Unknown --> Secondary/Primary
Nov  2 04:03:31 sto02 kernel: drbd1: drbd1_receiver [2334]: cstate WFBitMapT
--> SyncTarget
Nov  2 04:03:31 sto02 kernel: drbd1: Resync started as SyncTarget (need to
sync 0 KB [0 bits set]).
Nov  2 04:03:31 sto02 kernel: drbd1: Resync done (total 1 sec; paused 0 sec;
0 K/sec)
Nov  2 04:03:31 sto02 kernel: drbd1: drbd1_receiver [2334]: cstate
SyncTarget --> Connected
Nov  2 04:03:32 sto02 kernel: drbd3: drbd3_receiver [2342]: cstate
WFConnection --> WFReportParams
Nov  2 04:03:32 sto02 kernel: drbd3: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:03:32 sto02 kernel: drbd3: Connection established.
Nov  2 04:03:32 sto02 kernel: drbd3: I am(S):
1:00000003:00000001:0000000b:00000002:10
Nov  2 04:03:32 sto02 kernel: drbd3: Peer(S):
1:00000003:00000001:0000000b:00000002:01
Nov  2 04:03:32 sto02 kernel: drbd3: drbd3_receiver [2342]: cstate
WFReportParams --> WFBitMapS
Nov  2 04:03:32 sto02 kernel: drbd3: Secondary/Unknown -->
Secondary/Secondary
Nov  2 04:03:32 sto02 kernel: drbd3: drbd3_receiver [2342]: cstate WFBitMapS
--> SyncSource
Nov  2 04:03:32 sto02 kernel: drbd3: Resync started as SyncSource (need to
sync 8192 KB [2048 bits set]).
Nov  2 04:03:32 sto02 kernel: drbd4: drbd4_receiver [2350]: cstate
WFConnection --> WFReportParams
Nov  2 04:03:32 sto02 kernel: drbd4: Handshake successful: DRBD Network
Protocol version 74
Nov  2 04:03:32 sto02 kernel: drbd4: Connection established.
Nov  2 04:03:32 sto02 kernel: drbd4: I am(S):
1:00000002:00000001:00000002:00000001:00
Nov  2 04:03:32 sto02 kernel: drbd4: Peer(S):
1:00000002:00000001:00000002:00000001:01
Nov  2 04:03:32 sto02 kernel: drbd4: drbd4_receiver [2350]: cstate
WFReportParams --> Connected
Nov  2 04:03:32 sto02 kernel: drbd4: Secondary/Unknown -->
Secondary/Secondary
Nov  2 04:03:32 sto02 drbd: WARN: stdin/stdout is not a TTY; using
/dev/console
Nov  2 04:03:32 sto02 rc: Starting drbd:  succeeded
Nov  2 04:03:33 sto02 rpcidmapd: rpc.idmapd startup succeeded
Nov  2 04:03:33 sto02 acpid: acpid startup succeeded
Nov  2 04:03:33 sto02 kernel: drbd3: Resync done (total 1 sec; paused 0 sec;
8192 K/sec)
Nov  2 04:03:33 sto02 kernel: drbd3: drbd3_worker [2294]: cstate SyncSource
--> Connected
Nov  2 04:03:33 sto02 kernel: parport0: PC-style at 0x378 (0x778)
[PCSPP,TRISTATE]
Nov  2 04:03:33 sto02 kernel: parport0: irq 7 detected
Nov  2 04:03:33 sto02 kernel: lp0: using parport0 (polling).
Nov  2 04:03:33 sto02 kernel: lp0: console ready
Nov  2 04:03:33 sto02 cups: cupsd startup succeeded
Nov  2 04:03:34 sto02 xinetd: xinetd startup succeeded
Nov  2 04:03:34 sto02 gpm[2484]: *** info [startup.c(95)]: 
Nov  2 04:03:34 sto02 gpm[2484]: Started gpm successfully. Entered daemon
mode.
Nov  2 04:03:34 sto02 xinetd[2475]: xinetd Version 2.3.13 started with
libwrap loadavg options compiled in.
Nov  2 04:03:34 sto02 xinetd[2475]: Started working: 0 available services
Nov  2 04:03:34 sto02 gpm: gpm startup succeeded
Nov  2 04:03:34 sto02 crond: crond startup succeeded
Nov  2 04:03:34 sto02 anacron: anacron startup succeeded
Nov  2 04:03:35 sto02 atd: atd startup succeeded
Nov  2 04:03:35 sto02 haldaemon: haldaemon startup succeeded
Nov  2 04:03:35 sto02 heartbeat[2590]: info: **************************
Nov  2 04:03:35 sto02 heartbeat[2590]: info: Configuration validated.
Starting heartbeat 1.2.3.cvs.20050927
Nov  2 04:03:35 sto02 heartbeat[2591]: info: heartbeat: version
1.2.3.cvs.20050927
Nov  2 04:03:36 sto02 heartbeat[2591]: info: Heartbeat generation: 13
Nov  2 04:03:36 sto02 heartbeat[2591]: info: UDP Broadcast heartbeat started
on port 694 (694) interface eth0
Nov  2 04:03:36 sto02 heartbeat[2591]: info: ping heartbeat started.
Nov  2 04:03:36 sto02 heartbeat[2591]: info: ping heartbeat started.
Nov  2 04:03:36 sto02 heartbeat[2591]: info: Starting serial heartbeat on
tty /dev/ttyS0 (19200 baud)
Nov  2 04:03:36 sto02 heartbeat[2591]: notice: Using watchdog device:
/dev/watchdog
Nov  2 04:03:36 sto02 heartbeat[2591]: info: pid 2591 locked in memory.
Nov  2 04:03:36 sto02 heartbeat[2591]: info: Local status now set to: 'up'
Nov  2 04:03:37 sto02 rc: Starting webmin:  succeeded
Nov  2 04:03:37 sto02 heartbeat[2601]: info: pid 2601 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2602]: info: pid 2602 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2603]: info: pid 2603 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Link sto01.curvedict.info:eth0
up.
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Status update for node
sto01.curvedict.info: status active
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Link sto02.curvedict.info:eth0
up.
Nov  2 04:03:37 sto02 heartbeat[2604]: info: pid 2604 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2606]: info: pid 2606 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2607]: info: pid 2607 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Status update for node
192.168.10.254: status ping
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Link 192.168.10.1:192.168.10.1
up.
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Status update for node
192.168.10.1: status ping
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Local status now set to:
'active'
Nov  2 04:03:37 sto02 heartbeat[2609]: info: pid 2609 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2608]: info: pid 2608 locked in memory.
Nov  2 04:03:37 sto02 heartbeat[2591]: info: Starting child client
"/usr/lib/heartbeat/ipfail" (1001,104)
Nov  2 04:03:38 sto02 heartbeat[2591]: info: Link
sto01.curvedict.info:/dev/ttyS0 up.
Nov  2 04:03:38 sto02 heartbeat[2591]: info: remote resource transition
completed.
Nov  2 04:03:38 sto02 heartbeat[2591]: info: remote resource transition
completed.
Nov  2 04:03:38 sto02 heartbeat[2591]: info: Local Resource acquisition
completed. (none)
Nov  2 04:03:38 sto02 heartbeat[2591]: info: Initial resource acquisition
complete (T_RESOURCES(them))
Nov  2 04:03:38 sto02 heartbeat[2993]: info: Starting
"/usr/lib/heartbeat/ipfail" as uid 1001  gid 104 (pid 2993)
Nov  2 04:03:38 sto02 heartbeat[2605]: info: pid 2605 locked in memory.
Nov  2 04:03:38 sto02 heartbeat: info: Running /etc/ha.d/rc.d/status status
Nov  2 04:03:38 sto02 heartbeat[2591]: info: Link
192.168.10.254:192.168.10.254 up.
Nov  2 04:03:38 sto02 ipfail[2993]: info: Ping node count is balanced.
Nov  2 04:03:40 sto02 watchdog[1898]: still alive after 20 seconds = 2
interval(s)
Nov  2 04:03:55 sto02 watchdog[1898]: still alive after 30 seconds = 3
interval(s)
Nov  2 04:04:05 sto02 login(pam_unix)[2614]: session opened for user root by
LOGIN(uid=0)
Nov  2 04:04:05 sto02  -- root[2614]: ROOT LOGIN ON tty1
Nov  2 04:04:10 sto02 watchdog[1898]: still alive after 40 seconds = 4
interval(s)
===================================

From this point I now go to Server B and kindly ask it to take over the
heartbeat resource (the one that was supposed to be running on Server A):

/var/log/halog on Server B
--------------
heartbeat: 2006/11/02_04:33:17 info: Running /etc/ha.d/rc.d/ip-request-resp
ip-request-resp
heartbeat: 2006/11/02_04:33:17 received ip-request-resp 192.168.10.122 OK no
heartbeat: 2006/11/02_04:33:17 info: Acquiring resource group:
sto02.curvedict.info 192.168.10.122 drbddisk::zim0
Filesystem::/dev/drbd3::/opt/zimbra::jfs
heartbeat: 2006/11/02_04:33:17 info: Running /etc/ha.d/resource.d/IPaddr
192.168.10.122 start
heartbeat: 2006/11/02_04:33:17 info: /sbin/ifconfig eth0:0 192.168.10.122
netmask 255.255.255.0	broadcast 192.168.10.255
heartbeat: 2006/11/02_04:33:17 info: Sending Gratuitous Arp for
192.168.10.122 on eth0:0 [eth0]
heartbeat: 2006/11/02_04:33:17 /usr/lib/heartbeat/send_arp -i 1010 -r 5 -p
/var/lib/heartbeat/rsctmp/send_arp/send_arp-192.168.10.122 eth0
192.168.10.122 auto 192.168.10.122 ffffffffffff
heartbeat: 2006/11/02_04:33:17 info: Running /etc/ha.d/resource.d/drbddisk
zim0 start
heartbeat: 2006/11/02_04:33:18 info: Running /etc/ha.d/resource.d/Filesystem
/dev/drbd3 /opt/zimbra jfs start
heartbeat: 2006/11/02_04:33:18 ERROR: Couldn't mount filesystem /dev/drbd3
on /opt/zimbra
heartbeat: 2006/11/02_04:33:18 ERROR: Return code 1 from
/etc/ha.d/resource.d/Filesystem
heartbeat: 2006/11/02_04:33:18 CRIT: Giving up resources due to failure of
Filesystem::/dev/drbd3::/opt/zimbra::jfs
heartbeat: 2006/11/02_04:33:18 info: Releasing resource group:
sto02.curvedict.info 192.168.10.122 drbddisk::zim0
Filesystem::/dev/drbd3::/opt/zimbra::jfs
heartbeat: 2006/11/02_04:33:18 info: Running /etc/ha.d/resource.d/Filesystem
/dev/drbd3 /opt/zimbra jfs stop
heartbeat: 2006/11/02_04:33:18 WARNING: Filesystem /opt/zimbra not mounted?
heartbeat: 2006/11/02_04:33:18 info: Running /etc/ha.d/resource.d/drbddisk
zim0 stop
heartbeat: 2006/11/02_04:33:18 info: Running /etc/ha.d/resource.d/IPaddr
192.168.10.122 stop
heartbeat: 2006/11/02_04:33:18 info: /sbin/route -n del -host 192.168.10.122
heartbeat: 2006/11/02_04:33:18 info: /sbin/ifconfig eth0:0 down
heartbeat: 2006/11/02_04:33:18 info: IP Address 192.168.10.122 released

/var/log/hadebug on Server B
----------------
heartbeat: 2006/11/02_04:33:17 debug: Starting /etc/ha.d/resource.d/IPaddr
192.168.10.122 start
ls: /var/lib/heartbeat/rsctmp/IPaddr/eth0:*: No such file or directory
heartbeat: 2006/11/02_04:33:17 debug: /etc/ha.d/resource.d/IPaddr
192.168.10.122 start done. RC=0
heartbeat: 2006/11/02_04:33:17 debug: Starting /etc/ha.d/resource.d/drbddisk
zim0 start
heartbeat: 2006/11/02_04:33:18 debug: /etc/ha.d/resource.d/drbddisk zim0
start done. RC=0
heartbeat: 2006/11/02_04:33:18 debug: Starting
/etc/ha.d/resource.d/Filesystem /dev/drbd3 /opt/zimbra jfs start
mount: wrong fs type, bad option, bad superblock on /dev/drbd3,
       or too many mounted file systems
heartbeat: 2006/11/02_04:33:18 debug: /etc/ha.d/resource.d/Filesystem
/dev/drbd3 /opt/zimbra jfs start done. RC=1
heartbeat: 2006/11/02_04:33:18 debug: Starting
/etc/ha.d/resource.d/Filesystem /dev/drbd3 /opt/zimbra jfs stop
heartbeat: 2006/11/02_04:33:18 debug: /etc/ha.d/resource.d/Filesystem
/dev/drbd3 /opt/zimbra jfs stop done. RC=0
heartbeat: 2006/11/02_04:33:18 debug: Starting /etc/ha.d/resource.d/drbddisk
zim0 stop
heartbeat: 2006/11/02_04:33:18 debug: /etc/ha.d/resource.d/drbddisk zim0
stop done. RC=0
heartbeat: 2006/11/02_04:33:18 debug: Starting /etc/ha.d/resource.d/IPaddr
192.168.10.122 stop
SIOCDELRT: No such process
heartbeat: 2006/11/02_04:33:18 debug: /etc/ha.d/resource.d/IPaddr
192.168.10.122 stop done. RC=0

/var/log/messages on Server B
----------------
Nov  2 04:33:10 sto02 watchdog[1898]: still alive after 1200 seconds = 120
interval(s)
Nov  2 04:33:17 sto02 heartbeat: info: Running
/etc/ha.d/rc.d/ip-request-resp ip-request-resp
Nov  2 04:33:17 sto02 heartbeat: received ip-request-resp 192.168.10.122 OK
no
Nov  2 04:33:17 sto02 heartbeat: info: Acquiring resource group:
sto02.curvedict.info 192.168.10.122 drbddisk::zim0
Filesystem::/dev/drbd3::/opt/zimbra::jfs
Nov  2 04:33:17 sto02 heartbeat: info: Running /etc/ha.d/resource.d/IPaddr
192.168.10.122 start
Nov  2 04:33:17 sto02 heartbeat: info: /sbin/ifconfig eth0:0 192.168.10.122
netmask 255.255.255.0	broadcast 192.168.10.255
Nov  2 04:33:17 sto02 heartbeat: info: Sending Gratuitous Arp for
192.168.10.122 on eth0:0 [eth0]
Nov  2 04:33:17 sto02 heartbeat: /usr/lib/heartbeat/send_arp -i 1010 -r 5 -p
/var/lib/heartbeat/rsctmp/send_arp/send_arp-192.168.10.122 eth0
192.168.10.122 auto 192.168.10.122 ffffffffffff
Nov  2 04:33:17 sto02 heartbeat: info: Running /etc/ha.d/resource.d/drbddisk
zim0 start
Nov  2 04:33:18 sto02 kernel: drbd3: Secondary/Secondary -->
Primary/Secondary
Nov  2 04:33:18 sto02 heartbeat: info: Running
/etc/ha.d/resource.d/Filesystem /dev/drbd3 /opt/zimbra jfs start
Nov  2 04:33:18 sto02 heartbeat: ERROR: Couldn't mount filesystem /dev/drbd3
on /opt/zimbra
Nov  2 04:33:18 sto02 heartbeat: ERROR: Return code 1 from
/etc/ha.d/resource.d/Filesystem
Nov  2 04:33:18 sto02 heartbeat: CRIT: Giving up resources due to failure of
Filesystem::/dev/drbd3::/opt/zimbra::jfs
Nov  2 04:33:18 sto02 heartbeat: info: Releasing resource group:
sto02.curvedict.info 192.168.10.122 drbddisk::zim0
Filesystem::/dev/drbd3::/opt/zimbra::jfs
Nov  2 04:33:18 sto02 heartbeat: info: Running
/etc/ha.d/resource.d/Filesystem /dev/drbd3 /opt/zimbra jfs stop
Nov  2 04:33:18 sto02 heartbeat: WARNING: Filesystem /opt/zimbra not
mounted?
Nov  2 04:33:18 sto02 heartbeat: info: Running /etc/ha.d/resource.d/drbddisk
zim0 stop
Nov  2 04:33:18 sto02 kernel: drbd3: Primary/Secondary -->
Secondary/Secondary
Nov  2 04:33:18 sto02 heartbeat: info: Running /etc/ha.d/resource.d/IPaddr
192.168.10.122 stop
Nov  2 04:33:18 sto02 heartbeat: info: /sbin/route -n del -host
192.168.10.122
Nov  2 04:33:18 sto02 heartbeat: info: /sbin/ifconfig eth0:0 down
Nov  2 04:33:18 sto02 heartbeat: info: IP Address 192.168.10.122 released
Nov  2 04:33:25 sto02 watchdog[1898]: still alive after 1210 seconds = 121
interval(s)

cat /proc/drbd on Server B
--------------------------
version: 0.7.21 (api:79/proto:74)
SVN Revision: 2326 build by buildsvn at build-i386, 2006-10-07 05:13:10
 0: cs:Connected st:Secondary/Primary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 1: cs:Connected st:Secondary/Primary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 2: cs:Unconfigured
 3: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:8192 nr:0 dw:0 dr:8196 al:0 bm:6 lo:0 pe:0 ua:0 ap:0
 4: cs:Connected st:Secondary/Secondary ld:Consistent
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0
 5: cs:Unconfigured
 
 



/ 2006-10-29 09:57:11 -0800
\ sidimustafa:

> Once server B is up and running the following was done on Server B:
> "drbdadm -- --do-what-I-say primary zim0" <-- the DRBD3 resource in
> question.

>> why do you think you have to use that ugly flag?
>> you really should not, unless you want to take
>> the blame for anything that goes wrong.
It was used because I wanted to mount the drive to see exactly what was
happening.
I'm only testing, so I would like to make the mistakes now, and know that I
should never do it again.


> "drbdadm -- connect zim0" <-- the DRBD3 resource in question.

>> why would you need to connect it here?
>> it should have been connected all along.
Better that I make all the foolish mistakes while I'm testing, and learn
the hard way,
than when I go live and have to suffer because I don't know what exactly is
happening.

>> I'd like to see facts, not funny descriptive stories.
Sorry if I offended you; it wasn't meant to be a funny descriptive story.
I just wanted to post the step-by-step process that took place.
The reason I didn't include any logs is that drbd clearly stated
that it couldn't load the resource, so I assumed it was corrupted by the plug
being pulled.
Once I re-formatted the resource, all was good again.

-- 
View this message in context: http://www.nabble.com/Pulled-the-Power-Plug-tf2535391.html#a7127520
Sent from the DRBD - User mailing list archive at Nabble.com.




More information about the drbd-user mailing list