[DRBD-user] Resource may stuck after creating snapshots

kvaps kvapss at gmail.com
Fri May 29 18:11:52 CEST 2020


Another VM has the same problem, and new errors appear:

# dmesg | grep one-vm-8099-disk-0
[20158467.625882] drbd one-vm-8099-disk-0: susp-io( no -> user)
[20158469.308525] drbd one-vm-8099-disk-0: susp-io( user -> no)
[20158469.309400] drbd one-vm-8099-disk-0/0 drbd1545: drbd_req_complete:
Logic BUG rq_state: 8000, completion_ref = -1
[20158469.309504] drbd one-vm-8099-disk-0/0 drbd1545: drbd_req_complete:
Logic BUG rq_state: 8000, completion_ref = -1
[20158469.309553] drbd one-vm-8099-disk-0/0 drbd1545: drbd_req_complete:
Logic BUG rq_state: 8000, completion_ref = -1
[20158469.309647] drbd one-vm-8099-disk-0/0 drbd1545: drbd_req_complete:
Logic BUG rq_state: 8000, completion_ref = -1
[20158469.309989] drbd one-vm-8099-disk-0/0 drbd1545: drbd_req_complete:
Logic BUG rq_state: 8000, completion_ref = -1
[20158469.310208] drbd one-vm-8099-disk-0/0 drbd1545: drbd_req_complete:
Logic BUG rq_state: 8000, completion_ref = -1
[20158469.310762] drbd one-vm-8099-disk-0/0 drbd1545: drbd_req_destroy:
Logic BUG rq_state: 8000, completion_ref = -1

Best Regards,
Andrei Kvapil


On Fri, May 29, 2020 at 3:14 PM kvaps <kvapss at gmail.com> wrote:

> Hello,
>
> I'm not sure whether this bug has already been fixed in the latest DRBD
> version, but it is better to report it.
> I'm using 9.0.19-1 (6f5fa5d348a99e5eeb09d83c49853d72e614fd07) and kernel
> 4.15.18-18-pve
>
> We run weekly backups for all our resources; the backup for each resource
> is made as described here:
>
> https://github.com/LINBIT/linstor-server/issues/150#issuecomment-635942823
>
> Thus, for each resource we create a new snapshot and deploy a new resource
> from this snapshot; then the snapshot is removed, the backup is performed
> for the created resource, and finally the resource created from the
> snapshot is removed.
>
> From time to time the VMs may get stuck forever, even though the resource
> is unsuspended after the snapshot.
> Such VMs can be killed only with signal 9 (SIGKILL). But the resource stays
> in the Primary role and can't be shut down:
>
> # drbdsetup status one-vm-7944-disk-0 --verbose --statistics
> one-vm-7944-disk-0 node-id:3 role:Primary suspended:no
>     write-ordering:none
>   volume:0 minor:1509 disk:Diskless client:yes quorum:yes
>       size:272632908 read:0 written:0 al-writes:0 bm-writes:0
> upper-pending:4 lower-pending:0 al-suspended:no blocked:no
>   m13c28 node-id:0 connection:Connected role:Secondary congested:no
> ap-in-flight:0 rs-in-flight:0
>     volume:0 replication:Established peer-disk:UpToDate resync-suspended:no
>         received:2027182412 sent:1080354540 out-of-sync:0 pending:0
> unacked:0
>   m14c10 node-id:1 connection:Connected role:Secondary congested:no
> ap-in-flight:0 rs-in-flight:0
>     volume:0 replication:Established peer-disk:UpToDate resync-suspended:no
>         received:2027184872 sent:1080354540 out-of-sync:0 pending:0
> unacked:0
>
> # dmesg | grep one-vm-7944-disk-0
> [20157532.568950] drbd one-vm-7944-disk-0: susp-io( no -> user)
> [20157534.979777] drbd one-vm-7944-disk-0: susp-io( user -> no)
>
> # drbdsetup secondary one-vm-7944-disk-0
> <stuck forever>
>
> strace log:
>
> execve("/usr/sbin/drbdsetup", ["drbdsetup", "secondary",
> "one-vm-7944-disk-0"], 0x7ffc6b833b10 /* 16 vars */) = 0
> brk(NULL)                               = 0x56134ae0f000
> access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or
> directory)
> access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or
> directory)
> openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0644, st_size=37110, ...}) = 0
> mmap(NULL, 37110, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f5155c12000
> close(3)                                = 0
> access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or
> directory)
> openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> read(3,
> "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\260\34\2\0\0\0\0\0"...,
> 832) = 832
> fstat(3, {st_mode=S_IFREG|0755, st_size=2030544, ...}) = 0
> mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
> 0x7f5155c10000
> mmap(NULL, 4131552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0)
> = 0x7f5155604000
> mprotect(0x7f51557eb000, 2097152, PROT_NONE) = 0
> mmap(0x7f51559eb000, 24576, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7f51559eb000
> mmap(0x7f51559f1000, 15072, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f51559f1000
> close(3)                                = 0
> arch_prctl(ARCH_SET_FS, 0x7f5155c11580) = 0
> mprotect(0x7f51559eb000, 16384, PROT_READ) = 0
> mprotect(0x561349fd2000, 4096, PROT_READ) = 0
> mprotect(0x7f5155c1c000, 4096, PROT_READ) = 0
> munmap(0x7f5155c12000, 37110)           = 0
> chdir("/")                              = 0
> stat("/proc/drbd", {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> openat(AT_FDCWD, "/proc/drbd", O_RDONLY) = 3
> brk(NULL)                               = 0x56134ae0f000
> brk(0x56134ae30000)                     = 0x56134ae30000
> read(3, "version: 9.0.19-1 (api:2/proto:8"..., 4095) = 170
> close(3)                                = 0
> socket(AF_NETLINK, SOCK_DGRAM, NETLINK_GENERIC) = 3
> setsockopt(3, SOL_SOCKET, SO_SNDBUF, [1048576], 4) = 0
> setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1048576], 4) = 0
> bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
> getsockname(3, {sa_family=AF_NETLINK, nl_pid=8512, nl_groups=00000000},
> [12]) = 0
> write(3, " \0\0\0\20\0\1\0\340\t\321^@!\0\0\3\2\0\0\t\0\2\0drbd\0\0\0\0",
> 32) = 32
> poll([{fd=3, events=POLLIN}], 1, 3000)  = 1 ([{fd=3, revents=POLLIN}])
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000},
> msg_namelen=12, msg_iov=[{iov_base={{len=816, type=nlctrl, flags=0,
> seq=1590757856, pid=8512},
> "\x01\x02\x00\x00\x09\x00\x02\x00\x64\x72\x62\x64\x00\x00\x00\x00\x06\x00\x01\x00\x1f\x00\x00\x00\x08\x00\x03\x00\x02\x00\x00\x00"...},
> iov_len=8192}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK) =
> 816
> poll([{fd=3, events=POLLIN}], 1, 3000)  = 1 ([{fd=3, revents=POLLIN}])
> recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000},
> msg_namelen=12, msg_iov=[{iov_base={{len=816, type=nlctrl, flags=0,
> seq=1590757856, pid=8512},
> "\x01\x02\x00\x00\x09\x00\x02\x00\x64\x72\x62\x64\x00\x00\x00\x00\x06\x00\x01\x00\x1f\x00\x00\x00\x08\x00\x03\x00\x02\x00\x00\x00"...},
> iov_len=8192}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 816
> write(3,
> "8\0\0\0\37\0\1\0\341\t\321^@!\0\0\17\2\0\0\377\377\377\377\0\0\0\0\34\0\2\0"...,
> 56
>
> Best Regards,
> Andrei Kvapil
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.linbit.com/pipermail/drbd-user/attachments/20200529/bc175b92/attachment-0001.htm>


More information about the drbd-user mailing list