[DRBD-user] Machine crashed repeatedly: drbd16: Epoch set size wrong!!found=1061 reported=1060

Fri Oct 29 11:41:12 CEST 2004

Hello all,

I'm wondering about this message, which occurred with drbd 0.6.13 running
with original kernel 2.4.27 on an XSeries 235 machine with serveraid 5,
broadcom gigabit ethernet (bcm5700) while copying data to /dev/nb16.
The machine has 512 MB RAM and 1024 MB cache.

What does this mean:
Oct 29 05:30:20 FAGINTSC kernel: drbd16: Epoch set size wrong!!found=1061
reported=1060

It is interesting that, directly after that message, the whole machine
crashed. In /var/log/messages, I can find the following:

~> ksymoops -m /usr/src/linux-2.4.27/System.map oops
ksymoops 2.4.5 on i686 2.4.27.  Options used
     -V (default)
     -k /proc/ksyms (default)
     -l /proc/modules (default)
     -o /lib/modules/2.4.27/ (default)
     -m /usr/src/linux-2.4.27/System.map (specified)

Oct 29 05:30:29 FAGINTSC kernel: CPU:    0
Oct 29 05:30:29 FAGINTSC kernel: EIP:    0010:[<c0135400>]    Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
Oct 29 05:30:29 FAGINTSC kernel: EFLAGS: 00010002
Oct 29 05:30:29 FAGINTSC kernel: eax: ffffffff   ebx: dbcbd160   ecx:
00000001   edx: dffed600
Oct 29 05:30:29 FAGINTSC kernel: esi: c158a37c   edi: 00001db7   ebp:
dffed6a4   esp: c15b9f30
Oct 29 05:30:29 FAGINTSC kernel: ds: 0018   es: 0018   ss: 0018
Oct 29 05:30:29 FAGINTSC kernel: Process kswapd (pid: 5, stackpage=c15b9000)
Oct 29 05:30:29 FAGINTSC kernel: Stack: 00000000 c1243108 c158a38c
c158a384 c15b8000 dffed600 00000000 00000008
Oct 29 05:30:29 FAGINTSC kernel:        00000000 00000000 00000000
00000020 000001d0 c028f69c c028f69c c01368ac
Oct 29 05:30:29 FAGINTSC kernel:        c15b9f90 000001d0 0000003c
00000020 c0136952 c15b9f90 00000246 00000000
Oct 29 05:30:29 FAGINTSC kernel: Call Trace:    [<c01368ac>] [<c0136952>]
[<c0136b0c>] [<c0136b78>] [<c0136cbd>]
Oct 29 05:30:29 FAGINTSC kernel:   [<c0105000>] [<c010745e>] [<c0136c20>]
Oct 29 05:30:29 FAGINTSC kernel: Code: 8b 00 47 3b 44 24 08 75 f7 8b 5e 2c
89 fa 8b 46 4c 88 d9 d3

>>EIP; c0135400 <kmem_cache_reap+230/340>   <=====

>>eax; ffffffff <END_OF_CODE+1f701d74/????>
>>ebx; dbcbd160 <_end+1b96ee68/204b3d68>
>>edx; dffed600 <_end+1fc9f308/204b3d68>
>>esi; c158a37c <_end+123c084/204b3d68>
>>edi; 00001db7 Before first symbol
>>ebp; dffed6a4 <_end+1fc9f3ac/204b3d68>
>>esp; c15b9f30 <_end+126bc38/204b3d68>

Trace; c01368ac <shrink_caches+1c/60>
Trace; c0136952 <try_to_free_pages_zone+62/f0>
Trace; c0136b0c <kswapd_balance_pgdat+6c/b0>
Trace; c0136b78 <kswapd_balance+28/40>
Trace; c0136cbd <kswapd+9d/b7>
Trace; c0105000 <_stext+0/0>
Trace; c010745e <arch_kernel_thread+2e/40>
Trace; c0136c20 <kswapd+0/b7>

Code;  c0135400 <kmem_cache_reap+230/340>
00000000 <_EIP>:
Code;  c0135400 <kmem_cache_reap+230/340>   <=====
   0:   8b 00                     mov    (%eax),%eax   <=====
Code;  c0135402 <kmem_cache_reap+232/340>
   2:   47                        inc    %edi
Code;  c0135403 <kmem_cache_reap+233/340>
   3:   3b 44 24 08               cmp    0x8(%esp,1),%eax
Code;  c0135407 <kmem_cache_reap+237/340>
   7:   75 f7                     jne    0 <_EIP>
Code;  c0135409 <kmem_cache_reap+239/340>
   9:   8b 5e 2c                  mov    0x2c(%esi),%ebx
Code;  c013540c <kmem_cache_reap+23c/340>
   c:   89 fa                     mov    %edi,%edx
Code;  c013540e <kmem_cache_reap+23e/340>
   e:   8b 46 4c                  mov    0x4c(%esi),%eax
Code;  c0135411 <kmem_cache_reap+241/340>
  11:   88 d9                     mov    %bl,%cl
Code;  c0135413 <kmem_cache_reap+243/340>
  13:   d3 00                     roll   %cl,(%eax)

drbd.conf:

 global {
    # use this if you want to define more resources later
    # without reloading the module.
    # by default we load the module with exactly as many devices
    # as are mentioned in this file.
    minor_count=40

    # this is for people who set up a drbd device via the
    # loopback network interface or between two VMs on the same
    # box, for testing/simulating/presentation
    # otherwise it could trigger a run_tasq_queue deadlock.
    # I'm not sure whether this deadlock can happen with two
    # nodes, but it seems at least extremely unlikely; and since
    # the io_hints boost performance, keep them enabled.
    # disable_io_hints
 }

[...]

resource drbd16 {

  protocol=C
  fsckcmd=/bin/true

  disk {
    # do-panic
    disk-size = 10485760
  }

  net {
    sync-nice  = 18
    sync-min    = 4M
    sync-max    = 8M    # maximal average syncer bandwidth
    tl-size     = 5000  # transfer log size, ensures strict write ordering
    timeout     = 60    # 0.1 seconds
    connect-int = 10    # seconds
    ping-int    = 10    # seconds
    sync-group  = 16
    ko-count    = 5
  }

  on FAGINTSB {
    device  = /dev/nb16
    disk    = /dev/system/test_LogVol
    address = 10.2.18.151
    port    = 7804
  }

  on FAGINTSC {
    device  = /dev/nb16
    disk    = /dev/system/test_LogVol
    address = 10.2.18.150
    port    = 7804
  }
}

Could you please help me? drbd seems to crash my machine quite often
(about once a day).

Kind regards,
Andreas Hartmann