[DRBD-user] mkfs on a drbd partition hangs in drbd_al_begin_io

Håkan Engblom zyber_cynic at hotmail.com
Thu May 3 13:23:19 CEST 2007

Note: "permalinks" may not be as permanent as we would like,
direct links of old sources may well be a few messages off.


Hi,

Some background: the drbd version is 0.7.22, running on a MontaVista Linux 
distribution, 2.6.10_mvl4.

I've seen that sometimes when doing mkfs on a drbd partition, the system 
seems to hang in a drbd function in kernel space.
The problem has been reported once before to this mailing list, in February 
2006, in a thread called "mkfs hangs with lastest drbd branch build and FC4 
kernel" (I think it is the same problem), and it has also been observed by 
others (seen when searching for "drbd_al_begin_io hangs" on Google).

However, I've not seen any solution to the problem.

So far I've been able to establish that the process seems to hang in the 
drbd function mentioned above, and I also know that it hangs 640 bytes into 
the function. When looking at the source code of this function, my guess is 
that it hangs on "spin_lock_irq(&mdev->al_lock);".

Is this a known problem, and does anyone know of a solution?

Config :

=== root at fsb1 FSB ~ # more /etc/drbd.conf

resource Core {
        protocol                C;
        on fsb1 {
                device          /dev/drbd0;
                disk            /dev/sda1;
                address         10.90.17.14:7780;
                meta-disk       internal;
        }
        on fsb2 {
                device          /dev/drbd0;
                disk            /dev/sda1;
                address         10.90.17.15:7780;
                meta-disk       internal;
        }
        syncer {
rate            3M;
group           3;
}
net {
timeout         29;
connect-int     3;
ping-int        3;
max-buffers     2048;
on-disconnect   reconnect;
}
startup {
wfc-timeout             180;
degr-wfc-timeout        1;
}
disk {
on-io-error     detach;
}
}

resource charging {
protocol                C;
on fsb1 {
device          /dev/drbd1;
disk            /dev/sda2;
address         10.90.17.14:7781;
meta-disk       internal;
}
on fsb2 {
device          /dev/drbd1;
disk            /dev/sda2;
address         10.90.17.15:7781;
meta-disk       internal;
}
syncer {
rate            3M;
group           2;
}
net {
timeout         29;
connect-int     3;
ping-int        3;
max-buffers     2048;
on-disconnect   reconnect;
}
startup {
wfc-timeout             180;
degr-wfc-timeout        1;
}
disk {
on-io-error     detach;
}
}

resource logs {
protocol                C;
on fsb1 {
device          /dev/drbd2;
disk            /dev/sda3;
address         10.90.17.14:7782;
meta-disk       internal;
}
on fsb2 {
device          /dev/drbd2;
disk            /dev/sda3;
address         10.90.17.15:7782;
meta-disk       internal;
}
syncer {
rate            3M;
group           1;
}
        net {
                timeout         29;
                connect-int     3;
                ping-int        3;
                max-buffers     2048;
                on-disconnect   reconnect;
        }
        startup {
                wfc-timeout             180;
                degr-wfc-timeout        1;
        }
        disk {
                on-io-error     detach;
        }
}




Console log from the primary host when the problem occurs, showing where it 
is stuck :

=== root at fsb1 FSB ~ # ps -ef | grep mk
root      1360  1351  0 12:56 ?        00:00:00 mkfs.ext3 -v /dev/drbd0
root      3538  3288  0 13:28 pts/0    00:00:00 grep mk
=== root at fsb1 FSB ~ # cd /proc/1360
=== root at fsb1 FSB /proc/1360 # ll
total 0
dr-xr-xr-x   3 root root 0 Apr 17 12:56 .
dr-xr-xr-x  71 root root 0 Jan  1  1970 ..
-r--------   1 root root 0 Apr 17 13:28 auxv
-r--r--r--   1 root root 0 Apr 17 13:27 cmdline
lrwxrwxrwx   1 root root 0 Apr 17 13:28 cwd -> /
-r--------   1 root root 0 Apr 17 13:28 environ
lrwxrwxrwx   1 root root 0 Apr 17 13:28 exe -> /sbin/mkfs.ext3
dr-x------   2 root root 0 Apr 17 13:28 fd
-r--r--r--   1 root root 0 Apr 17 13:28 maps
-rw-------   1 root root 0 Apr 17 13:28 mem
-r--r--r--   1 root root 0 Apr 17 12:56 mounts
lrwxrwxrwx   1 root root 0 Apr 17 13:28 root -> /
-r--r--r--   1 root root 0 Apr 17 13:27 stat
-r--r--r--   1 root root 0 Apr 17 13:28 statm
-r--r--r--   1 root root 0 Apr 17 13:27 status
dr-xr-xr-x   3 root root 0 Apr 17 13:28 task
-r--r--r--   1 root root 0 Apr 17 13:28 wchan
=== root at fsb1 FSB /proc/1360 # more wchan
drbd_al_begin_io
=== root at fsb1 FSB /proc/1360 # cat stat
1360 (mkfs.ext3) D 1351 967 967 0 -1 256 478 0 0 0 1 67 0 0 18 0 1 0 147730 
3170304 455 4294967295 268435456 268534220 2147483136 2147480720 266952396 0 
0 0 0 3641589808 0 0 17 0 0 0
=== root at fsb1 FSB /proc/1360 # cat status
Name:   mkfs.ext3
State:  D (disk sleep)
SleepAVG:       78%
Tgid:   1360
Pid:    1360
PPid:   1351
TracerPid:      0
Uid:    0       0       0       0
Gid:    0       0       0       0
FDSize: 256
Groups:
VmSize:     3096 kB
VmLck:         0 kB
VmRSS:      1820 kB
VmData:     1088 kB
VmStk:        84 kB
VmExe:       100 kB
VmLib:      1380 kB
VmPTE:        16 kB
Threads:        1
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 0000000000000000
SigIgn: 0000000000000000
SigCgt: 0000000000000000
CapInh: 0000000000000000
CapPrm: 00000000fffffeff
CapEff: 00000000fffffeff
=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # cat wchan
drbd_al_begin_io=== root at fsb1 FSB /proc/1360 # ll
total 0
dr-xr-xr-x   3 root root 0 Apr 17 12:56 .
dr-xr-xr-x  69 root root 0 Jan  1  1970 ..
-r--------   1 root root 0 Apr 17 13:28 auxv
-r--r--r--   1 root root 0 Apr 17 13:27 cmdline
lrwxrwxrwx   1 root root 0 Apr 17 13:28 cwd -> /
-r--------   1 root root 0 Apr 17 13:28 environ
lrwxrwxrwx   1 root root 0 Apr 17 13:28 exe -> /sbin/mkfs.ext3
dr-x------   2 root root 0 Apr 17 13:28 fd
-r--r--r--   1 root root 0 Apr 17 13:28 maps
-rw-------   1 root root 0 Apr 17 13:28 mem
-r--r--r--   1 root root 0 Apr 17 12:56 mounts
lrwxrwxrwx   1 root root 0 Apr 17 13:28 root -> /
-r--r--r--   1 root root 0 Apr 17 13:27 stat
-r--r--r--   1 root root 0 Apr 17 13:28 statm
-r--r--r--   1 root root 0 Apr 17 13:27 status
dr-xr-xr-x   3 root root 0 Apr 17 13:28 task
-r--r--r--   1 root root 0 Apr 17 13:28 wchan
=== root at fsb1 FSB /proc/1360 # date
Tue Apr 17 13:29:40 UTC 2007
=== root at fsb1 FSB /proc/1360 # more cmdline
mkfs.ext3-v/dev/drbd0
=== root at fsb1 FSB /proc/1360 # cd /mnt/local/etc/
=== root at fsb1 FSB local/etc # ll
total 32
drwxr-xr-x  3 root root 4096 Apr 17 13:32 .
drwxr-xr-x  6 root root 4096 Apr 17 12:55 ..
-rw-r--r--  1 root root 2596 Apr 17 12:55 Core.def
-rw-r--r--  1 root root  561 Apr 17 13:32 group
-rw-r--r--  1 root root 1046 Apr 17 13:32 passwd
-rw-r--r--  1 root root   46 Apr 17 12:55 reformat_date
-rw-------  1 root root  122 Apr 17 13:32 shadow
drwxr-xr-x  2 root root 4096 Apr 17 09:07 ssh
=== root at fsb1 FSB local/etc # date
Tue Apr 17 13:32:32 UTC 2007
=== root at fsb1 FSB local/etc # touch hejdu
=== root at fsb1 FSB local/etc # ll
total 32
drwxr-xr-x  3 root root 4096 Apr 17 13:32 .
drwxr-xr-x  6 root root 4096 Apr 17 12:55 ..
-rw-r--r--  1 root root 2596 Apr 17 12:55 Core.def
-rw-r--r--  1 root root  561 Apr 17 13:32 group
-rw-r--r--  1 root root    0 Apr 17 13:32 hejdu
-rw-r--r--  1 root root 1046 Apr 17 13:32 passwd
-rw-r--r--  1 root root   46 Apr 17 12:55 reformat_date
-rw-------  1 root root  122 Apr 17 13:32 shadow
drwxr-xr-x  2 root root 4096 Apr 17 09:07 ssh
=== root at fsb1 FSB local/etc # rm hejdu
=== root at fsb1 FSB local/etc # cd /proc/1360
=== root at fsb1 FSB /proc/1360 # more wchan
drbd_al_begin_io
=== root at fsb1 FSB /proc/1360 # ll
total 0
dr-xr-xr-x   3 root root 0 Apr 17 12:56 .
dr-xr-xr-x  70 root root 0 Jan  1  1970 ..
-r--------   1 root root 0 Apr 17 13:28 auxv
-r--r--r--   1 root root 0 Apr 17 13:27 cmdline
lrwxrwxrwx   1 root root 0 Apr 17 13:28 cwd -> /
-r--------   1 root root 0 Apr 17 13:28 environ
lrwxrwxrwx   1 root root 0 Apr 17 13:28 exe -> /sbin/mkfs.ext3
dr-x------   2 root root 0 Apr 17 13:28 fd
-r--r--r--   1 root root 0 Apr 17 13:28 maps
-rw-------   1 root root 0 Apr 17 13:28 mem
-r--r--r--   1 root root 0 Apr 17 12:56 mounts
lrwxrwxrwx   1 root root 0 Apr 17 13:28 root -> /
-r--r--r--   1 root root 0 Apr 17 13:27 stat
-r--r--r--   1 root root 0 Apr 17 13:28 statm
-r--r--r--   1 root root 0 Apr 17 13:27 status
dr-xr-xr-x   3 root root 0 Apr 17 13:28 task
-r--r--r--   1 root root 0 Apr 17 13:28 wchan
=== root at fsb1 FSB /proc/1360 # more wchan
drbd_al_begin_io
=== root at fsb1 FSB /proc/1360 # cd ..
=== root at fsb1 FSB /proc # ll
total 786445
dr-xr-xr-x  70 root root         0 Jan  1  1970 .
drwxr-xr-x  23 root root      1024 Apr 17 12:55 ..
dr-xr-xr-x   3 root root         0 Jan  1  1970 1
dr-xr-xr-x   3 root root         0 Jan  1  1970 10
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1003
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1012
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1076
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1084
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1092
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1122
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1127
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1132
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1138
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1139
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1140
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1159
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1161
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1162
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1164
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1179
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1235
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1243
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1343
dr-xr-xr-x   3 root root         0 Apr 17 13:27 1347
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1351
dr-xr-xr-x   3 root root         0 Apr 17 12:56 1360
dr-xr-xr-x   3 root root         0 Jan  1  1970 180
dr-xr-xr-x   3 root root         0 Jan  1  1970 2
dr-xr-xr-x   3 root root         0 Jan  1  1970 210
dr-xr-xr-x   3 root root         0 Jan  1  1970 257
dr-xr-xr-x   3 root root         0 Jan  1  1970 271
dr-xr-xr-x   3 root root         0 Jan  1  1970 3
dr-xr-xr-x   3 root root         0 Jan  1  1970 30
dr-xr-xr-x   3 root root         0 Apr 17 13:27 3284
dr-xr-xr-x   3 root root         0 Apr 17 13:27 3288
dr-xr-xr-x   3 root root         0 Apr 17 13:37 3822
dr-xr-xr-x   3 root root         0 Apr 17 13:37 3823
dr-xr-xr-x   3 root root         0 Apr 17 13:37 3824
dr-xr-xr-x   3 root root         0 Jan  1  1970 4
dr-xr-xr-x   3 bin  bin          0 Apr 18  2007 441
dr-xr-xr-x   3 root root         0 Apr 18  2007 446
dr-xr-xr-x   3 root root         0 Apr 18  2007 449
dr-xr-xr-x   3 root root         0 Jan  1  1970 5
dr-xr-xr-x   3 root root         0 Jan  1  1970 75
dr-xr-xr-x   3 root root         0 Jan  1  1970 76
dr-xr-xr-x   3 root root         0 Jan  1  1970 77
dr-xr-xr-x   3 root root         0 Jan  1  1970 78
dr-xr-xr-x   3 root root         0 Apr 17 12:55 852
dr-xr-xr-x   3 root root         0 Apr 17 12:55 920
dr-xr-xr-x   3 root root         0 Apr 17 12:55 925
dr-xr-xr-x   3 root root         0 Apr 17 12:55 958
dr-xr-xr-x   3 root root         0 Apr 17 12:55 964
dr-xr-xr-x   3 root root         0 Apr 17 12:55 966
dr-xr-xr-x   3 root root         0 Apr 17 12:55 967
dr-xr-xr-x   3 root root         0 Apr 17 12:55 978
dr-xr-xr-x   3 root root         0 Apr 17 12:56 986
-r--r--r--   1 root root         0 Apr 17 13:37 buddyinfo
dr-xr-xr-x   4 root root         0 Apr 17 13:37 bus
-r--r--r--   1 root root         0 Apr 17 13:37 cmdline
-r--r--r--   1 root root      5616 Apr 17 13:37 config.gz
-r--r--r--   1 root root         0 Apr 17 13:37 cpuinfo
-r--r--r--   1 root root         0 Apr 17 13:37 devices
-r--r--r--   1 root root         0 Apr 17 13:37 diskstats
-r--r--r--   1 root root         0 Apr 17 13:37 dma
-r--r--r--   1 root root         0 Apr 17 13:37 drbd
dr-xr-xr-x   2 root root         0 Apr 17 13:37 driver
-r--r--r--   1 root root         0 Apr 17 13:37 execdomains
-r--r--r--   1 root root         0 Apr 17 13:37 filesystems
dr-xr-xr-x   4 root root         0 Apr 17 13:37 fs
-r--r--r--   1 root root         0 Apr 17 13:37 interrupts
-r--r--r--   1 root root         0 Apr 17 13:37 iomem
-r--r--r--   1 root root         0 Apr 17 13:37 ioports
dr-xr-xr-x  98 root root         0 Apr 17 13:37 irq
-r--r--r--   1 root root         0 Apr 17 13:37 kallsyms
-r--------   1 root root 805310464 Apr 17 13:37 kcore
-r--------   1 root root         0 Apr 17 12:56 kmsg
-r--r--r--   1 root root         0 Apr 17 13:37 loadavg
-r--r--r--   1 root root         0 Apr 17 13:37 locks
-r--r--r--   1 root root         0 Apr 17 13:37 meminfo
-r--r--r--   1 root root         0 Apr 17 13:37 misc
-r--r--r--   1 root root         0 Apr 17 13:37 modules
lrwxrwxrwx   1 root root        11 Apr 17 13:37 mounts -> self/mounts
dr-xr-xr-x   3 root root         0 Apr 17 13:37 mpt
-r--r--r--   1 root root         0 Apr 17 13:37 mtd
dr-xr-xr-x   5 root root         0 Apr 17 13:37 net
-r--r--r--   1 root root         0 Apr 17 13:37 partitions
-rw-r--r--   1 root root         0 Apr 17 13:37 ppc_htab
lrwxrwxrwx   1 root root        64 Apr 17 13:02 self -> 3824
-rw-r--r--   1 root root         0 Apr 17 13:37 slabinfo
-r--r--r--   1 root root         0 Apr 17 13:07 stat
-r--r--r--   1 root root         0 Apr 17 13:37 swaps
dr-xr-xr-x  10 root root         0 Apr 17 13:37 sys
dr-xr-xr-x   2 root root         0 Apr 17 13:37 sysvipc
-r--r--r--   1 root root         0 Apr 17 13:37 therm
dr-xr-xr-x   4 root root         0 Apr 17 13:37 tty
-r--r--r--   1 root root         0 Apr 17 13:37 uptime
-r--r--r--   1 root root         0 Apr 17 13:37 version
-r--r--r--   1 root root         0 Apr 17 13:37 vmstat
=== root at fsb1 FSB /proc # cat kallsyms | grep drbd_al_begin_io
d90e3db0 t drbd_al_begin_io     [drbd]
=== root at fsb1 FSB /proc # cd 1360
=== root at fsb1 FSB /proc/1360 # more stat
1360 (mkfs.ext3) D 1351 967 967 0 -1 256 478 0 0 0 1 67 0 0 18 0 1 0 147730 
3170304 455 4294967295 268435456 268534220 2147483136 2147480720 266952396 0 
0 0 0 3641589808 0 0 17 0 0 0
=== root at fsb1 FSB /proc/1360 #

3641589808 - 0xd90e3db0 = 640 (640 bytes into the function)

_________________________________________________________________
Upptäck kärleken på MSN 
http://match.se.msn.com/channel/index.aspx?trackingid=1002962




More information about the drbd-user mailing list