Note: "permalinks" may not be as permanent as we would like;
direct links to old sources may well be a few messages off.
Hi,
We have a strange problem with DRBD version 8.4.4:
Since the weekend, the load and I/O wait on the server nodes have been very high. We have 8 cores and a load like this:
top - 08:52:43 up 45 days, 1:46, 2 users, load average: 95.60, 103.74, 110.63
Tasks: 432 total, 1 running, 431 sleeping, 0 stopped, 0 zombie
Cpu(s): 1.6%us, 0.9%sy, 0.0%ni, 16.2%id, 81.3%wa, 0.0%hi, 0.0%si, 0.0%st
With drbd-overview I saw that two resources were syncing. I disconnected them on both nodes. Since then, the load and I/O wait have gone back to normal.
When I connect one of these resources the sync starts:
root@pmt-ucs02:/etc/drbd.d# drbd-overview
1:vm_pmt-dc1/0 SyncTarget Secondary/Primary Inconsistent/UpToDate A r-----
[>....................] sync'ed: 0.8% (550444/550444)K
2:vm_pmt-mail/0 StandAlone Secondary/Unknown Inconsistent/DUnknown r-----
3:vm_pmt-winsrv/0 Connected Secondary/Primary UpToDate/UpToDate A r-----
4:vm_pmt-erp/0 Connected Secondary/Primary UpToDate/UpToDate A r-----
5:vm_pmt-dc2/0 Connected Primary/Secondary UpToDate/UpToDate A r-----
However, it never finishes. Instead, the load and I/O wait rise again to the point where the server hardly responds at all.
Sometimes the sync goes up to 1.5% or 3% and then it falls back to 0.8% again.
Another strange behaviour is that running the drbd-overview command always takes at least 10 seconds on one of the nodes (no matter how low the load is).
The other node responds immediately.
I also get this output from time to time on the slow node:
root@pmt-ucs02:/etc/drbd.d# drbd-overview
1:vm_pmt-dc1/0 StandAlone Secondary/Unknown Inconsistent/DUnknown r-----
2:??not-found?? StandAlone Secondary/Unknown Inconsistent/DUnknown r-----
3:??not-found?? Connected Secondary/Primary UpToDate/UpToDate A r-----
4:??not-found?? Connected Secondary/Primary UpToDate/UpToDate A r-----
5:??not-found?? Connected Primary/Secondary UpToDate/UpToDate A r-----
Three of the five resources are working well.
Thanks for reading.
Any ideas?
Cheers,
Roland.
Some additional information:
root@pmt-ucs02:/etc/drbd.d# cat /proc/drbd
version: 8.4.4 (api:1/proto:86-101)
GIT-hash: 905561ebc321ce0f08ed66b783e05944e733206d build by root@, 2014-08-25 18:11:11
1: cs:StandAlone ro:Secondary/Unknown ds:Inconsistent/DUnknown r-----
ns:0 nr:60304 dw:470969676 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:d oos:500552
2: cs:StandAlone ro:Secondary/Unknown ds:Inconsistent/DUnknown r-----
ns:0 nr:1080881120 dw:1080881120 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:d oos:4422884
3: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate A r-----
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:d oos:0
4: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate A r-----
ns:0 nr:101852461 dw:101852461 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:d oos:0
5: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate A r-----
ns:134208492 nr:0 dw:129141712 dr:42898984 al:2310 bm:0 lo:1 pe:0 ua:0 ap:1 ep:1 wo:d oos:0
root@pmt-ucs02:~# top
top - 08:52:43 up 45 days, 1:46, 2 users, load average: 95.60, 103.74, 110.63
Tasks: 432 total, 1 running, 431 sleeping, 0 stopped, 0 zombie
Cpu(s): 1.6%us, 0.9%sy, 0.0%ni, 16.2%id, 81.3%wa, 0.0%hi, 0.0%si, 0.0%st
Mem: 28876948k total, 18365484k used, 10511464k free, 358848k buffers
Swap: 10485756k total, 0k used, 10485756k free, 3086740k cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
10258 libvirt- 20 0 8952m 8.1g 6032 S 21 29.5 6315:39 kvm
708 libvirt- 20 0 8707m 2.9g 6032 S 1 10.4 307:58.90 kvm
3737 root 20 0 11096 1600 912 S 1 0.0 0:01.51 top
28 root 20 0 0 0 0 S 0 0.0 94:50.43 ksoftirqd/4
4390 root 20 0 19388 1692 1012 R 0 0.0 0:00.07 top
1 root 20 0 10452 776 644 S 0 0.0 0:43.99 init
2 root 20 0 0 0 0 S 0 0.0 0:00.56 kthreadd
3 root 20 0 0 0 0 S 0 0.0 87:47.56 ksoftirqd/0
4 root 20 0 0 0 0 S 0 0.0 0:00.00 kworker/0:0
5 root 0 -20 0 0 0 S 0 0.0 0:00.00 kworker/0:0H
7 root RT 0 0 0 0 S 0 0.0 0:01.52 migration/0
8 root 20 0 0 0 0 S 0 0.0 0:00.00 rcu_bh
9 root 20 0 0 0 0 S 0 0.0 4:08.77 rcu_sched
10 root RT 0 0 0 0 S 0 0.0 0:08.32 watchdog/0
11 root RT 0 0 0 0 S 0 0.0 0:08.60 watchdog/1
12 root RT 0 0 0 0 S 0 0.0 0:01.38 migration/1
13 root 20 0 0 0 0 S 0 0.0 88:40.54 ksoftirqd/1
15 root 0 -20 0 0 0 S 0 0.0 0:00.00 kworker/1:0H
16 root RT 0 0 0 0 S 0 0.0 0:08.21 watchdog/2
17 root RT 0 0 0 0 S 0 0.0 0:01.41 migration/2
18 root 20 0 0 0 0 S 0 0.0 89:22.10 ksoftirqd/2
20 root 0 -20 0 0 0 S 0 0.0 0:00.00 kworker/2:0H
21 root RT 0 0 0 0 S 0 0.0 0:07.62 watchdog/3
22 root RT 0 0 0 0 S 0 0.0 0:01.46 migration/3
23 root 20 0 0 0 0 S 0 0.0 77:02.51 ksoftirqd/3
25 root 0 -20 0 0 0 S 0 0.0 0:00.00 kworker/3:0H
26 root RT 0 0 0 0 S 0 0.0 0:06.56 watchdog/4
27 root RT 0 0 0 0 S 0 0.0 0:05.56 migration/4
30 root 0 -20 0 0 0 S 0 0.0 0:00.00 kworker/4:0H
31 root RT 0 0 0 0 S 0 0.0 0:07.11 watchdog/5
32 root RT 0 0 0 0 S 0 0.0 0:05.49 migration/5
33 root 20 0 0 0 0 S 0 0.0 87:29.61 ksoftirqd/5
root@pmt-ucs01:/etc/drbd.d# cat global_common.conf
global {
usage-count yes;
}
common {
handlers {
# These are EXAMPLE handlers only.
# They may have severe implications,
# like hard resetting the node under certain circumstances.
# Be careful when choosing your poison.
# pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
# pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
# local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
# fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
# split-brain "/usr/lib/drbd/notify-split-brain.sh root";
# out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
# before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
# after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
}
startup {
# wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb
}
options {
# cpu-mask on-no-data-accessible
}
disk {
on-io-error detach;
fencing resource-only;
disk-flushes no;
md-flushes no;
al-extents 1237;
c-delay-target 20;
c-fill-target 0;
c-max-rate 150M;
c-min-rate 5M;
}
net {
max-epoch-size 16000;
max-buffers 16000;
ko-count 6;
cram-hmac-alg sha1;
shared-secret ba96f8297d8f16f0f58061f0fcc6e5d13dcaa6dd;
verify-alg crc32c;
## fall behind with secondary on net-congestion
on-congestion pull-ahead;
congestion-extents 800; # e.g. 2/3 of al-extents
congestion-fill 400M;
}
}
root@pmt-ucs01:/etc/drbd.d# cat vm_pmt-mail.res
resource vm_pmt-mail {
net {
protocol A;
cram-hmac-alg sha1;
shared-secret "FooFunFactory";
max-buffers 131072;
max-epoch-size 20000;
sndbuf-size 0;
rcvbuf-size 0;
verify-alg md5;
}
on pmt-ucs01 {
device drbd2;
disk /dev/vg_ucs/vm_pmt-mail;
meta-disk internal;
address 192.168.80.1:7792;
}
on pmt-ucs02 {
device drbd2;
disk /dev/vg_ucs/vm_pmt-mail;
meta-disk internal;
address 192.168.80.2:7792;
}
}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.linbit.com/pipermail/drbd-user/attachments/20150831/4cac19d9/attachment.htm>