[DRBD-user] DRBD stuck after a strong network failure

Cyril Bouthors cyril at bouthors.org
Tue Apr 25 21:48:12 CEST 2006

Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.


The same thing happened tonight. Here is more information; I hope it helps:

root at ns1:~# ps auxwf | grep drbd
root       879  0.1  0.0      0     0 ?        D    Apr18  17:40 [drbd0_receiver]
root     22526  0.0  0.0   1432   444 pts/5    S+   21:39   0:00  |               \_ grep drbd
root at ns1:~# w
 21:40:20 up 7 days,  8:11,  4 users,  load average: 194.27, 187.96, 158.50
(...)
root at ns1:~# dmesg
(...)
nfs: server 10.0.9.254 OK
nfs: server 10.0.9.254 not responding, still trying
nfs: server 10.0.9.254 OK
nfs: server 10.0.9.254 not responding, still trying
nfs: server 10.0.9.254 OK
drbd0: [kjournald/1180] sock_sendmsg time expired, ko = 4294967295
drbd0: [kupdated/6] sock_sendmsg time expired, ko = 4294967295
drbd0: [kupdated/6] sock_sendmsg time expired, ko = 4294967295
drbd0: [kupdated/6] sock_sendmsg time expired, ko = 4294967294
drbd0: [kupdated/6] sock_sendmsg time expired, ko = 4294967293
drbd0: [kupdated/6] sock_sendmsg time expired, ko = 4294967295
drbd0: [kjournald/1180] sock_sendmsg time expired, ko = 4294967295
drbd0: [kjournald/1180] sock_sendmsg time expired, ko = 4294967294
drbd0: [kjournald/1180] sock_sendmsg time expired, ko = 4294967293
drbd0: [kjournald/1180] sock_sendmsg time expired, ko = 4294967292
drbd0: [kjournald/1180] sock_sendmsg time expired, ko = 4294967295
drbd0: [kjournald/1180] sock_sendmsg time expired, ko = 4294967294
drbd0: PingAck did not arrive in time.
drbd0: drbd0_asender [889]: cstate Connected --> NetworkFailure
drbd0: asender terminated
drbd0: kjournald [1180]: cstate NetworkFailure --> Timeout
drbd0: drbd0_receiver [879]: cstate Timeout --> BrokenPipe
drbd0: short read expecting header on sock: r=-512
drbd0: short sent UnplugRemote size=8 sent=-1001
drbd0: worker terminated
root at ns1:~# ps auxwf | grep kupd
root         6  0.0  0.0      0     0 ?        D    Apr18   1:43 [kupdated]
root at ns1:~# cat /proc/drbd
version: 0.7.15 (api:77/proto:74)
SVN Revision: 2020 build by root at sqlb1, 2006-01-12 06:14:29
 0: cs:BrokenPipe st:Primary/Unknown ld:Consistent
    ns:521240 nr:0 dw:237755488 dr:49577617 al:937152 bm:204 lo:3 pe:3 ua:0 ap:3
root at ns1:~# drbdadm disconnect all
Child process does not terminate!
Exiting.
root at ns1:~# dmesg
(...)
drbd0: worker terminated
root at ns1:~# cat /proc/drbd
version: 0.7.15 (api:77/proto:74)
SVN Revision: 2020 build by root at sqlb1, 2006-01-12 06:14:29
 0: cs:BrokenPipe st:Primary/Unknown ld:Consistent
    ns:521240 nr:0 dw:237758456 dr:49577617 al:937177 bm:229 lo:3 pe:3 ua:0 ap:3
root at ns1:~# df -h /drbd
Filesystem            Size  Used Avail Use% Mounted on
/dev/drbd0            9.5G  7.6G  2.0G  80% /drbd
root at ns1:~# ls /drbd
etc  lost+found  root  usr  var
root at ns1:~# touch /drbd/foo
(this hangs....)
root at ns1:~# mount
/dev/hda4 on / type xfs (rw,noatime)
proc on /proc type proc (rw)
devpts on /dev/pts type devpts (rw,gid=5,mode=620)
tmpfs on /dev/shm type tmpfs (rw)
/dev/hda1 on /boot type xfs (rw)
10.0.9.254:/drbd/webalizer on /mnt type nfs (rw,addr=10.0.9.254)
/dev/drbd0 on /drbd type ext3 (rw,nosuid,nodev,noatime)
root at ns1:~# ps -axo pid,wchan=WIDE-WCHAN-COLUMN -o comm
  PID WIDE-WCHAN-COLUMN COMMAND
    1 select            init
    2 context_thread    keventd
    3 ksoftirqd         ksoftirqd_CPU0
    4 kswapd            kswapd
    5 bdflush           bdflush
    6 down              kupdated
   96 ?                 xfsbufd
   97 ?                 xfslogd/0
   98 ?                 xfsdatad/0
   99 ?                 xfssyncd
  227 ?                 kcopyd
  261 ?                 xfssyncd
  416 poll              portmap
  424 ?                 rpciod
  425 ?                 lockd
  555 select            syslogd
  558 syslog            klogd
  587 select            exim4
  820 select            inetd
  828 select            snmpd
  835 select            sshd
  848 select            rpc.statd
  853 select            ntpd
  879 ?                 drbd0_receiver
  923 poll              heartbeat
  927 nanosleep         cron
  932 pipe_wait         heartbeat
  933 poll              heartbeat
  934 read_chan         heartbeat
  935 poll              heartbeat
  936 wait_for_packet   heartbeat
  937 select            munin-node
  942 read_chan         getty
  943 read_chan         getty
  944 read_chan         getty
  945 read_chan         getty
  946 read_chan         getty
  947 read_chan         getty
 1180 wait_on_buffer    kjournald
 1352 wait4             mysqld_safe
 1383 select            mysqld
 1384 pipe_wait         logger
 1385 poll              mysqld
 1386 rt_sigsuspend     mysqld
 1387 rt_sigsuspend     mysqld
 1388 rt_sigsuspend     mysqld
 1389 rt_sigsuspend     mysqld
 1392 select            mysqld
 1393 select            mysqld
 1394 rt_sigsuspend     mysqld
 1395 rt_sigsuspend     mysqld
 2559 select            screen
 2600 wait4             bash
 3290 down              apache
 3591 down              ircd-hybrid
 4046 wait4             bash
 5021 select            dhcpd3
 5516 select            emacs-snapshot
 7479 ?                 nfsd
 7480 ?                 nfsd
 7481 ?                 nfsd
 7482 ?                 nfsd
 7483 ?                 nfsd
 7484 ?                 nfsd
 7485 ?                 nfsd
 7486 ?                 nfsd
 7487 ?                 nfsd
 7488 ?                 nfsd
 7489 ?                 nfsd
 7490 ?                 nfsd
 7491 ?                 nfsd
 7492 ?                 nfsd
 7493 ?                 nfsd
 7494 ?                 nfsd
 7495 ?                 nfsd
 7496 ?                 nfsd
 7497 ?                 nfsd
 7498 ?                 nfsd
 7499 ?                 nfsd
 7500 ?                 nfsd
 7501 ?                 nfsd
 7502 ?                 nfsd
 7503 ?                 nfsd
 7504 ?                 nfsd
 7505 ?                 nfsd
 7506 ?                 nfsd
 7507 ?                 nfsd
 7508 ?                 nfsd
 7509 ?                 nfsd
 7510 ?                 nfsd
 7798 select            rpc.mountd
14342 select            rsync
15724 wait4             bash
15734 read_chan         bash
26560 wait4             bash
26773 read_chan         bash
 2841 rt_sigsuspend     named
 2842 poll              named
 2843 -                 named
 2844 nanosleep         named
 2845 select            named
24728 read_chan         mysql
 9386 wait4             bash
28084 down              nagios
15053 ?                 sshd
15055 select            sshd
15056 wait4             bash
15067 read_chan         bash
  657 down              apache
  784 pipe_wait         cron
  816 wait4             sh
  821 wait4             munin-cron
 3044 down              apache
 5289 down              apache
 5366 down              apache
 5367 down              apache
14477 wait_on_buffer    apache
14510 wait_on_buffer    apache
14522 wait_on_buffer    munin-graph
14571 down              apache
14575 down              apache
14576 down              apache
14577 down              apache
14578 pipe_wait         cron
14582 wait4             sh
14584 wait4             download_nagios
14586 select            rsync
14613 down              rsync
14680 down              apache
14684 down              apache
14688 down              apache
14692 down              apache
14693 down              apache
14694 down              apache
14698 down              apache
14699 down              apache
14700 down              apache
14704 down              apache
14705 down              apache
14706 down              apache
14710 pipe_wait         cron
14715 pipe_wait         cron
14717 pipe_wait         cron
14719 wait4             mounts
14720 wait4             sh
14723 wait4             download_nagios
14724 wait4             sh
14726 select            rsync
14730 pipe_wait         mrtg
14778 pipe_wait         cron
14779 pipe_wait         cron
14800 wait4             exim4_dbm_hack
14801 wait4             sh
14804 wait4             mkdb_dom
14805 wait4             munin-cron
14808 down              munin-update
14849 down              rsync
14917 wait_on_buffer    mv
14934 down              mv
14959 down              registrar_accou
15003 down              apache
15010 down              apache
15011 down              apache
15012 down              apache
15013 down              apache
15014 down              apache
15018 down              apache
15019 down              apache
15020 down              apache
15021 down              apache
15022 down              apache
15026 down              apache
15027 down              apache
15028 down              apache
15029 down              apache
15030 down              apache
15031 down              apache
15032 down              apache
15033 down              apache
15034 down              apache
15035 down              apache
15037 down              apache
15038 down              apache
15039 down              apache
15040 down              apache
15041 down              apache
15042 down              apache
15043 down              apache
15044 down              apache
15045 down              apache
15046 pipe_wait         cron
15050 wait4             sh
15052 wait4             download_nagios
15058 select            rsync
15086 down              rsync
15145 down              apache
15146 down              apache
15147 down              apache
15148 down              apache
15149 down              apache
15150 down              apache
15151 down              apache
15152 down              apache
15153 down              apache
15154 down              apache
15155 down              apache
15156 down              apache
15157 down              apache
15158 down              apache
15159 down              apache
15160 down              apache
15161 down              apache
15162 down              apache
15163 down              apache
15164 down              apache
15165 down              apache
15166 down              apache
15167 down              apache
15168 down              apache
15169 pipe_wait         cron
15173 wait4             sh
15175 wait4             download_nagios
15178 select            rsync
15204 down              rsync
15268 down              apache
15269 down              apache
15270 down              apache
15271 down              apache
15272 down              apache
15273 down              apache
15274 down              apache
15275 down              apache
15276 down              apache
15277 down              apache
15278 down              apache
15279 down              apache
15280 down              apache
15281 down              apache
15282 down              apache
15283 down              apache
15285 down              apache
15286 down              apache
15287 pipe_wait         cron
15291 wait4             sh
15293 wait4             download_nagios
15296 select            rsync
15322 down              rsync
15397 down              apache
15398 down              apache
15402 down              apache
15403 down              apache
15404 down              apache
15405 down              apache
15406 down              apache
15407 down              apache
15408 down              apache
15409 down              apache
15410 down              apache
15411 down              apache
15412 down              apache
15413 down              apache
15414 down              apache
15415 down              apache
15417 pipe_wait         cron
15421 wait4             sh
15425 wait4             download_nagios
15428 select            rsync
15433 pipe_wait         cron
15445 wait4             update_sqldns
15469 down              rsync
15552 down              mv
15553 down              apache
15554 down              apache
15555 down              apache
15556 down              apache
15557 down              apache
15558 down              apache
15559 down              apache
15560 down              apache
15561 down              apache
15562 down              apache
15563 down              apache
15564 down              apache
15565 down              apache
15566 down              apache
15570 pipe_wait         cron
15575 pipe_wait         cron
15579 wait4             sh
15580 wait4             sh
15582 wait4             restart
15583 wait4             download_nagios
15586 select            rsync
15591 select            lynx
15592 pipe_wait         grep
15615 pipe_wait         cron
15616 pipe_wait         cron
15629 wait4             exim4_dbm_hack
15643 pipe_wait         cron
15649 down              chmod
15650 wait4             mkdb_dom
15653 wait4             sh
15654 down              apache
15655 wait4             munin-cron
15665 down              rsync
15698 down              mv
15711 down              apache
15712 down              apache
15795 pipe_wait         sendmail
15796 down              munin-limits
15831 exit              send_nsca <defunct>
15836 pipe_wait         nsca
21333 pipe_wait         cron
21334 pipe_wait         cron
21337 wait4             sh
21338 wait4             sql
21339 wait4             download_nagios
21342 select            rsync
21380 down              rsync
21383 ?                 mysqldump
21384 pipe_wait         gzip
21399 down              mysqld
27467 pipe_wait         cron
27471 wait4             sh
27475 wait4             download_nagios
27487 select            rsync
27517 down              rsync
27744 pipe_wait         cron
27748 wait4             sh
27750 wait4             download_nagios
27753 select            rsync
27780 down              rsync
27840 pipe_wait         cron
27844 wait4             sh
27846 wait4             download_nagios
27849 select            rsync
27875 down              rsync
27939 pipe_wait         cron
27949 wait4             sh
27952 wait4             download_nagios
27954 select            rsync
27985 pipe_wait         cron
27986 pipe_wait         cron
27997 wait4             exim4_dbm_hack
27998 wait4             sh
27999 wait4             mkdb_dom
28000 wait4             munin-cron
28013 down              rsync
28058 down              mv
28107 pipe_wait         sendmail
28108 down              munin-limits
28109 exit              send_nsca <defunct>
28112 pipe_wait         nsca
28113 pipe_wait         cron
28117 wait4             sh
28119 wait4             download_nagios
28122 select            rsync
28147 down              rsync
28210 pipe_wait         cron
28214 wait4             sh
28216 wait4             download_nagios
28219 select            rsync
28245 down              rsync
28307 pipe_wait         cron
28311 wait4             sh
28313 wait4             download_nagios
28315 select            rsync
28342 down              rsync
28406 pipe_wait         cron
28410 wait4             sh
28412 wait4             download_nagios
28415 select            rsync
28445 down              rsync
28517 pipe_wait         cron
28522 wait4             sh
28523 wait4             download_nagios
28524 select            rsync
28525 pipe_wait         cron
28529 wait4             sh
28531 wait4             restart
28537 select            lynx
28538 pipe_wait         grep
28540 pipe_wait         cron
28547 wait4             signup-check
28557 pipe_wait         cron
28571 wait4             exim4_dbm_hack
28572 wait4             mkdb_dom
28580 down              rsync
28584 tcp_data_wait     mysql
28586 rt_sigsuspend     mysqld
28590 pipe_wait         cron
28592 pipe_wait         cron
28600 down              chmod
28607 wait4             sh
28610 wait4             munin-cron
28617 pipe_wait         signup-check
28663 down              mv
28737 pipe_wait         sendmail
28738 down              munin-limits
28775 exit              send_nsca <defunct>
28780 pipe_wait         nsca
 1772 pipe_wait         cron
 1776 wait4             sh
 1778 wait4             download_nagios
 1781 select            rsync
 1808 down              rsync
 7941 pipe_wait         cron
 7942 pipe_wait         cron
 7945 wait4             sh
 7948 wait4             make
 7949 wait4             download_nagios
 7951 select            rsync
 7966 wait4             make
 7973 ?                 php4
 7986 down              rsync
 7995 rt_sigsuspend     mysqld
 8350 pipe_wait         cron
 8354 wait4             sh
 8356 wait4             download_nagios
 8359 select            rsync
 8386 down              rsync
 8449 pipe_wait         cron
 8453 wait4             sh
 8455 wait4             download_nagios
 8458 select            rsync
 8481 down              rsync
 8551 ?                 sshd
 8553 select            sshd
 8554 wait4             bash
 8565 read_chan         bash
 8579 pipe_wait         cron
 8589 wait4             sh
 8593 wait4             download_nagios
 8598 select            rsync
 8619 pipe_wait         cron
 8631 wait4             exim4_dbm_hack
 8632 wait4             mkdb_dom
 8633 pipe_wait         cron
 8635 wait4             sh
 8636 wait4             munin-cron
 8651 down              rsync
 8698 down              mv
 8750 pipe_wait         sendmail
 8751 down              munin-limits
 8752 exit              send_nsca <defunct>
 8755 pipe_wait         nsca
 8763 pipe_wait         cron
 8767 wait4             sh
 8769 wait4             download_nagios
 8772 select            rsync
 8797 down              rsync
 8866 pipe_wait         cron
 8870 wait4             sh
 8872 wait4             download_nagios
 8875 select            rsync
 8908 down              rsync
 8990 pipe_wait         cron
 8994 wait4             sh
 8996 wait4             download_nagios
 8998 select            rsync
 9024 down              rsync
 9090 pipe_wait         cron
 9094 wait4             sh
 9096 wait4             download_nagios
 9101 select            rsync
 9134 down              rsync
 9189 wait4             exim4
 9191 wait_on_buffer    exim4
 9195 pipe_wait         cron
 9200 pipe_wait         cron
 9204 wait4             sh
 9205 wait4             sh
 9207 wait4             restart
 9208 wait4             download_nagios
 9211 select            rsync
 9224 select            lynx
 9225 pipe_wait         grep
 9240 pipe_wait         cron
 9241 pipe_wait         cron
 9255 wait4             exim4_dbm_hack
 9256 down              chmod
 9258 wait4             mkdb_dom
 9273 pipe_wait         cron
 9284 wait4             sh
 9286 wait4             munin-cron
 9294 down              rsync
 9332 down              mv
 9409 pipe_wait         sendmail
 9410 down              munin-limits
 9443 exit              send_nsca <defunct>
 9448 pipe_wait         nsca
14939 pipe_wait         cron
14943 wait4             sh
14945 wait4             download_nagios
14948 select            rsync
14983 down              rsync
21200 pipe_wait         cron
21204 wait4             sh
21208 wait4             download_nagios
21211 select            rsync
21240 down              rsync
21733 pipe_wait         cron
21737 wait4             sh
21739 wait4             download_nagios
21741 select            rsync
21773 down              rsync
21835 pipe_wait         cron
21839 wait4             sh
21841 wait4             download_nagios
21844 select            rsync
21869 down              rsync
21934 pipe_wait         cron
21944 wait4             sh
21947 wait4             download_nagios
21951 select            rsync
21983 pipe_wait         cron
21984 pipe_wait         cron
21991 wait4             exim4_dbm_hack
21994 wait4             mkdb_dom
21995 wait4             sh
21997 wait4             munin-cron
22057 down              mv
22100 pipe_wait         sendmail
22101 down              munin-limits
22102 down              rsync
22103 exit              send_nsca <defunct>
22106 pipe_wait         nsca
22107 pipe_wait         cron
22111 wait4             sh
22113 wait4             download_nagios
22116 select            rsync
22186 down              rsync
22203 pipe_wait         cron
22207 wait4             sh
22208 wait4             download_nagios
22211 select            rsync
22235 down              rsync
22302 pipe_wait         cron
22306 wait4             sh
22308 wait4             download_nagios
22311 select            rsync
22341 down              rsync
22400 ?                 sshd
22402 select            sshd
22403 pause             screen
22407 pipe_wait         cron
22411 wait4             sh
22417 wait4             download_nagios
22419 select            rsync
22461 down              rsync
22530 pipe_wait         cron
22535 pipe_wait         cron
22538 pipe_wait         cron
22539 wait4             sh
22540 wait4             sh
22542 wait4             restart
22543 wait4             download_nagios
22547 select            rsync
22548 wait4             sh
22552 pipe_wait         cron
22558 pipe_wait         mrtg
22559 select            lynx
22560 pipe_wait         grep
22563 wait4             signup-check
22577 pipe_wait         cron
22589 wait4             exim4_dbm_hack
22590 wait4             mkdb_dom
22603 down              rsync
22605 pipe_wait         cron
22607 pipe_wait         cron
22618 down              chmod
22621 wait4             sh
22630 tcp_data_wait     mysql
22634 rt_sigsuspend     mysqld
22645 wait4             munin-cron
22648 pipe_wait         signup-check
22666 down              mv
22757 pipe_wait         sendmail
22758 down              munin-limits
22797 pipe_wait         send_nsca
22798 pipe_wait         munin-limits
22799 pipe_wait         munin-limits
22802 pipe_wait         nsca
22808 rwsem_down_write_ registrar_accou
28318 pipe_wait         cron
28322 wait4             sh
28324 wait4             download_nagios
28327 select            rsync
28357 down              rsync
 1963 pipe_wait         cron
 1967 wait4             sh
 1971 wait4             download_nagios
 1984 select            rsync
 2008 down              rsync
 2527 down              drbdsetup
 2531 pipe_wait         cron
 2535 wait4             sh
 2537 wait4             download_nagios
 2540 select            rsync
 2564 down              rsync
 2638 down              touch
 2639 pipe_wait         cron
 2643 wait4             sh
 2645 wait4             download_nagios
 2647 select            rsync
 2675 down              rsync
 2739 pipe_wait         cron
 2749 wait4             sh
 2752 wait4             download_nagios
 2756 select            rsync
 2782 pipe_wait         cron
 2783 pipe_wait         cron
 2784 pipe_wait         cron
 2793 wait4             exim4_dbm_hack
 2794 wait4             mkdb_dom
 2795 wait4             update_webs
 2796 wait4             sh
 2797 wait4             munin-cron
 2814 down              rsync
 2877 down              mv
 2974 pipe_wait         sendmail
 2975 down              munin-limits
 3022 pipe_wait         send_nsca
 3023 pipe_wait         munin-limits
 3024 pipe_wait         munin-limits
 3025 pipe_wait         nsca
 3028 down              cp
 3034 wait4             mail
 3035 wait_on_buffer    sendmail
 3036 pipe_wait         cron
 3040 wait4             sh
 3042 wait4             download_nagios
 3046 select            rsync
 3071 down              rsync
 3136 -                 ps

-- 
Cyril Bouthors
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 188 bytes
Desc: not available
URL: <http://lists.linbit.com/pipermail/drbd-user/attachments/20060425/9f537dea/attachment.pgp>


More information about the drbd-user mailing list