Note: "permalinks" may not be as permanent as we would like;
direct links to old sources may well be a few messages off.
I have a DRBD installation on two machines running Centos 6.3 and DRBD 8.4.1
I have a single resource 'agalaxy' being synced across these two machines. This resource has two volumes:
Volume 0: /dev/drbd0 mounted on /a10data
And
Volume 1: /dev/drbd1 mounted on /a10.
Volume 0 is running Postgres. I did a lot of other activities with DRBD shut down. However, after a while, I found that the contents of the directory /a10data on one of the machines were different (some intermediate-level directories were missing), yet DRBD (cat /proc/drbd) reported that the file systems were in sync.
Ultimately, I had to re-initialize and resync the volume by invalidating it:
drbdadm invalidate agalaxy/0
Has anyone run into this kind of issue?
===================================
For example, on balar-lnx3, the contents of /a10data/db/data/system was:
[root@balar-lnx3 system]# ls
12531 12664 12670 12675 12681 12687 12779
12531_fsm 12666 12671 12677 12682 12688 pg_control
12531_vm 12667 12672 12678 12683 12773 pg_filenode.map
12533 12668 12672_fsm 12679 12683_fsm 12775 pg_internal.init
12534 12668_fsm 12672_vm 12679_fsm 12683_vm 12777 pgstat.stat
12662 12668_vm 12674 12679_vm 12685 12778
The contents of /a10data/db/data/system on balar-lnx were:
[root@balar-lnx system]# ls
base pg_ident.conf pg_stat_tmp PG_VERSION
global pg_multixact pg_subtrans pg_xlog
pg_clog pg_notify pg_tblspc postgresql.conf
pg_hba.conf pg_serial pg_twophase postmaster.opts
[root@balar-lnx system]#
The contents of /a10data/db/data/system on balar-lnx3 was actually the contents of /a10data/db/data/system/global on balar-lnx. Yet, DRBD was reporting the status:
[root@balar-lnx3 ~]# cat /proc/drbd
version: 8.4.1 (api:1/proto:86-100)
GIT-hash: 91b4c048c1a0e06777b5f65d312b38d47abaea80 build by phil@Build64R6, 2012-04-17 11:28:08
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r-----
ns:0 nr:173009724 dw:173009724 dr:0 al:0 bm:10560 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0
1: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r-----
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0
[root@balar-lnx3 ~]#
#=====================================================================
Here is my drbd conf:
1. Global:
global {
usage-count yes;
# minor-count dialog-refresh disable-ip-verification
}
common {
handlers {
pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
}
startup {
wfc-timeout 0;
degr-wfc-timeout 120;
}
options {
# cpu-mask on-no-data-accessible
}
disk {
on-io-error detach;
}
net {
protocol C;
}
}
2. agalaxy.res:
resource agalaxy {
disk {
resync-rate 100M;
fencing resource-only;
}
handlers {
fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
}
on balar-lnx {
address 10.0.1.1:7788;
volume 0 {
device /dev/drbd0;
disk /dev/vg_balarlnx3/lv_a10data;
meta-disk internal;
}
volume 1 {
device /dev/drbd1;
disk /dev/vg_balarlnx3/lv_a10;
meta-disk internal;
}
}
on balar-lnx3 {
address 10.0.1.2:7788;
volume 0 {
device /dev/drbd0;
disk /dev/vg_balarlnx2/lv_a10data;
meta-disk internal;
}
volume 1 {
device /dev/drbd1;
disk /dev/vg_balarlnx2/lv_a10;
# meta-disk /dev/vg_balarlnx2/lv_a10_drbdmeta;
meta-disk internal;
}
}
}