[DRBD-user] Resource may stuck after creating snapshots

kvaps kvapss at gmail.com
Fri May 29 15:14:35 CEST 2020


Hello,

I'm not sure whether this bug has already been fixed in the latest DRBD
version, but it's better to report it.
I'm using 9.0.19-1 (6f5fa5d348a99e5eeb09d83c49853d72e614fd07) and kernel
4.15.18-18-pve

We run weekly backups for all our resources; the backup of each resource
is made as described here:

https://github.com/LINBIT/linstor-server/issues/150#issuecomment-635942823

Thus, for each resource we create a new snapshot and deploy a new resource
from this snapshot; then the snapshot is removed, the backup is performed
on the newly created resource, and finally the resource that was created
from the snapshot is removed.

From time to time a VM may get stuck forever, even though the resource is
unsuspended after the snapshot.
Such VMs can only be killed with signal 9 (SIGKILL). But the resource stays
in the Primary role and can't be shut down:

# drbdsetup status one-vm-7944-disk-0 --verbose --statistics
one-vm-7944-disk-0 node-id:3 role:Primary suspended:no
    write-ordering:none
  volume:0 minor:1509 disk:Diskless client:yes quorum:yes
      size:272632908 read:0 written:0 al-writes:0 bm-writes:0
upper-pending:4 lower-pending:0 al-suspended:no blocked:no
  m13c28 node-id:0 connection:Connected role:Secondary congested:no
ap-in-flight:0 rs-in-flight:0
    volume:0 replication:Established peer-disk:UpToDate resync-suspended:no
        received:2027182412 sent:1080354540 out-of-sync:0 pending:0
unacked:0
  m14c10 node-id:1 connection:Connected role:Secondary congested:no
ap-in-flight:0 rs-in-flight:0
    volume:0 replication:Established peer-disk:UpToDate resync-suspended:no
        received:2027184872 sent:1080354540 out-of-sync:0 pending:0
unacked:0

# dmesg | grep one-vm-7944-disk-0
[20157532.568950] drbd one-vm-7944-disk-0: susp-io( no -> user)
[20157534.979777] drbd one-vm-7944-disk-0: susp-io( user -> no)

# drbdsetup secondary one-vm-7944-disk-0
<stuck forever>

strace log:

execve("/usr/sbin/drbdsetup", ["drbdsetup", "secondary",
"one-vm-7944-disk-0"], 0x7ffc6b833b10 /* 16 vars */) = 0
brk(NULL)                               = 0x56134ae0f000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or
directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or
directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=37110, ...}) = 0
mmap(NULL, 37110, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f5155c12000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or
directory)
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3,
"\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\34\2\0\0\0\0\0"...,
832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=2030544, ...}) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0x7f5155c10000
mmap(NULL, 4131552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) =
0x7f5155604000
mprotect(0x7f51557eb000, 2097152, PROT_NONE) = 0
mmap(0x7f51559eb000, 24576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7f51559eb000
mmap(0x7f51559f1000, 15072, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f51559f1000
close(3)                                = 0
arch_prctl(ARCH_SET_FS, 0x7f5155c11580) = 0
mprotect(0x7f51559eb000, 16384, PROT_READ) = 0
mprotect(0x561349fd2000, 4096, PROT_READ) = 0
mprotect(0x7f5155c1c000, 4096, PROT_READ) = 0
munmap(0x7f5155c12000, 37110)           = 0
chdir("/")                              = 0
stat("/proc/drbd", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
openat(AT_FDCWD, "/proc/drbd", O_RDONLY) = 3
brk(NULL)                               = 0x56134ae0f000
brk(0x56134ae30000)                     = 0x56134ae30000
read(3, "version: 9.0.19-1 (api:2/proto:8"..., 4095) = 170
close(3)                                = 0
socket(AF_NETLINK, SOCK_DGRAM, NETLINK_GENERIC) = 3
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1048576], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1048576], 4) = 0
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=8512, nl_groups=00000000},
[12]) = 0
write(3, " \0\0\0\20\0\1\0\340\t\321^@!\0\0\3\2\0\0\t\0\2\0drbd\0\0\0\0",
32) = 32
poll([{fd=3, events=POLLIN}], 1, 3000)  = 1 ([{fd=3, revents=POLLIN}])
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000},
msg_namelen=12, msg_iov=[{iov_base={{len=816, type=nlctrl, flags=0,
seq=1590757856, pid=8512},
"\x01\x02\x00\x00\x09\x00\x02\x00\x64\x72\x62\x64\x00\x00\x00\x00\x06\x00\x01\x00\x1f\x00\x00\x00\x08\x00\x03\x00\x02\x00\x00\x00"...},
iov_len=8192}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK) =
816
poll([{fd=3, events=POLLIN}], 1, 3000)  = 1 ([{fd=3, revents=POLLIN}])
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000},
msg_namelen=12, msg_iov=[{iov_base={{len=816, type=nlctrl, flags=0,
seq=1590757856, pid=8512},
"\x01\x02\x00\x00\x09\x00\x02\x00\x64\x72\x62\x64\x00\x00\x00\x00\x06\x00\x01\x00\x1f\x00\x00\x00\x08\x00\x03\x00\x02\x00\x00\x00"...},
iov_len=8192}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 816
write(3,
"8\0\0\0\37\0\1\0\341\t\321^@!\0\0\17\2\0\0\377\377\377\377\0\0\0\0\34\0\2\0"...,
56

Best Regards,
Andrei Kvapil
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.linbit.com/pipermail/drbd-user/attachments/20200529/a750b197/attachment.htm>


More information about the drbd-user mailing list