Index: drbd/drbd_receiver.c =================================================================== --- drbd/drbd_receiver.c (.../trunk) (revision 4106) +++ drbd/drbd_receiver.c (.../branches/drbd-panic) (revision 4106) @@ -1668,7 +1668,7 @@ STATIC void drbd_uuid_dump(drbd_dev *mdev,char* text,u64* uuid) { - WARN("%s %016llX:%016llX:%016llX:%016llX\n", + INFO("%s %016llX:%016llX:%016llX:%016llX\n", text, uuid[Current], uuid[Bitmap], @@ -1748,13 +1748,13 @@ int hg; drbd_conns_t rv = conn_mask; - + //INFO("drbd_sync_handshake:\n"); //drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid); //drbd_uuid_dump(mdev,"peer",mdev->p_uuid); hg = drbd_uuid_compare(mdev); - //WARN("uuid_compare()=%d\n",hg); + //INFO("uuid_compare()=%d\n",hg); if (hg == 100) { int pcount = (mdev->state.role==Primary) + (peer_role==Primary); Index: drbd/drbd_nl.c =================================================================== --- drbd/drbd_nl.c (.../trunk) (revision 4106) +++ drbd/drbd_nl.c (.../branches/drbd-panic) (revision 4106) @@ -883,6 +883,8 @@ return 0; release_bdev3_fail: + drbd_bm_unlock(mdev); + /* The following will be freed by state change below */ nbc = NULL; resync_lru = NULL; Index: drbd/drbd_bitmap.c =================================================================== --- drbd/drbd_bitmap.c (.../trunk) (revision 4106) +++ drbd/drbd_bitmap.c (.../branches/drbd-panic) (revision 4106) @@ -806,11 +806,15 @@ int drbd_bm_read(struct Drbd_Conf *mdev) { struct drbd_bitmap *b = mdev->bitmap; + int err=0; - int err = drbd_bm_rw(mdev, READ); + if (b->bm) { + // bitmap size > 0 + err = drbd_bm_rw(mdev, READ); - if (err == 0) - b->bm[b->bm_words] = DRBD_MAGIC; + if (err == 0) + b->bm[b->bm_words] = DRBD_MAGIC; + } return err; } Index: drbd/drbd_main.c =================================================================== --- drbd/drbd_main.c (.../trunk) (revision 4106) +++ drbd/drbd_main.c (.../branches/drbd-panic) (revision 4106) @@ -378,18 +378,12 @@ if (ok) WARN("Notified peer that my disk is 
broken.\n"); else ERR("Sending state in drbd_io_error() failed\n"); -#if 0 -// warning SPG -// This code seems wrong -- we only get here if we are set to -// detach in which case we have no local disk, so there's no -// point asserting that a full sync is needed. -// Flushing the meta data is probably also wrong -- we want -// this node to appear out of date so we should deliberately -// NOT update the meta data with the latest epoch info! - D_ASSERT(drbd_md_test_flag(mdev->bc,MDF_FullSync)); - D_ASSERT(!drbd_md_test_flag(mdev->bc,MDF_Consistent)); + // Make sure we try to flush meta-data to disk - we come + // in here because of a local disk error so it might fail + // but we still need to try -- both because the error might + // be in the data portion of the disk and because we need + // to ensure the md-sync-timer is stopped if running. drbd_md_sync(mdev); -#endif /* Releasing the backing device is done in after_state_ch() */ @@ -2903,10 +2905,10 @@ case ReportUUIDs: INFOP("%s Curr:%016llX, Bitmap:%016llX, HisSt:%016llX, HisEnd:%016llX\n", cmdname(cmd), - p->GenCnt.uuid[Current], - p->GenCnt.uuid[Bitmap], - p->GenCnt.uuid[History_start], - p->GenCnt.uuid[History_end]); + be64_to_cpu(p->GenCnt.uuid[Current]), + be64_to_cpu(p->GenCnt.uuid[Bitmap]), + be64_to_cpu(p->GenCnt.uuid[History_start]), + be64_to_cpu(p->GenCnt.uuid[History_end])); break; case ReportSizes: Index: testing/testsuite/testfaults.conf =================================================================== --- testing/testsuite/testfaults.conf (.../trunk) (revision 0) +++ testing/testsuite/testfaults.conf (.../branches/drbd-panic) (revision 4106) @@ -0,0 +1,299 @@ +# Configuration file of the testsuite.pl +# +# Use to configure both nodes and to change default commands of the testsuite. +# Important: after each argument put ; to finish the line otherwise it will not be set and can cause errors! 
+ +# Configuration of one Node +node node1 { + # Syntax: addr + # IP Address of the node that is visible to the internet. If this is left undefined + # the testsuite will not start. + # + # Example: addr 192.168.1.10; + # + addr 192.168.1.10; + + # Syntax: port

+ # The port the testsuite will try to connect to. If this is left undefined + # the testsuite will not start. + # + # Example: port 4000; + # + port 4000; +} + +# Configuration of second Node +node node2 { + # Syntax: addr + # IP Address of the node that is visible to the internet. If this is left undefined + # the testsuite will not start. + # + # Example: addr 192.168.1.10; + # + addr 192.168.1.10; + + # Syntax: port

+ # The port the testsuite will try to connect to. If this is left undefined + # the testsuite will not start. + # + # Example: port 4000; + # + port 4000; +} + +# Set of default parameters of the testsuite +defaults { + # Syntax: timeout + # General timeout limit of seconds for the response of the executed command. + # + # If this is left undefined default is to wait 0 seconds = wait forever. + # + # Example: timeout 20; + # + timeout 10; + + # Syntax: latency + # + # The maximum time/clock difference between the two agents. If it's above + # the given value, FIXME => sync + # + # If this is left undefined default latency is 0.5 seconds + # + # Example: latency 0.05; + # + latency 0.1; + + # Syntax: connect_timeout + # Timeout after trying to establish a connection after seconds + # + # If this is left undefined default is to wait 3 seconds. + # + # Example: connect_timeout 1; + # + connect_timeout 1; + + # Syntax: timeserver + # If nodes are out of sync, ntpdate will be sent to each node. + # + # If no timeserver is given, the testsuite will abort instead of trying + # to synchronize the agents. + # + timeserver 10.25.91.12; +} + +# Set of sequence commands to be executed on the nodes. See README for detailed +# information about available commands. +seq-commands { + + # check if we are in a stable state + if ((get 'drbdsetup /dev/drbd0 state', on node1) =~ /Unknown/) { + VERBOSE ("trouble with drbd on agents"); + die(); + } + + cmd '/sbin/drbdadm detach {resource}', on node1; + + # simulate meta data read failures during attach + INFO "1. Simulate local meta data read failures during attach"; + cmd set_fr, on node1; + cmd set_md_rd, on node1; + + # attach/detach a few times (want at least one failure! + for (my $i = 0; $i < 5; ++$i) { + cmd '/sbin/drbdadm attach {resource}', on node1; + + sleep 2; + + if ((get state_ds, on node1) =~ /Diskless/) { + VERBOSE ("goodness: meta data read fault fired"); + } + else { + # might need to wait for resync here... 
disable + # faults whilst we wait + cmd clr_fr, on node1; + + expected 'cs', state 'Connected', timeout 500; + expected 'ds', state 'UpToDate', timeout 500; + + cmd '/sbin/drbdadm detach {resource}', on node1; + + cmd set_fr, on node1; + cmd set_md_rd, on node1; + } + + # everyone should be in connected state + expected 'cs', state 'Connected', timeout 15; + } + + cmd clr_fr, on node1; + + # simulate meta data write failures during attach + INFO "2. Simulate local meta data write failures during attach"; + cmd set_fr, on node1; + cmd set_md_wr, on node1; + + # attach/detach a few times (want at least one failure! + for (my $i = 0; $i < 5; ++$i) { + cmd '/sbin/drbdadm attach {resource}', on node1; + + sleep 2; + + if ((get state_ds, on node1) =~ /Diskless/) { + VERBOSE ("goodness: meta data write fault fired"); + } + else { + # might need to wait for resync here... disable + # faults whilst we wait + cmd clr_fr, on node1; + + expected 'cs', state 'Connected', timeout 500; + expected 'ds', state 'UpToDate', timeout 500; + + cmd '/sbin/drbdadm detach {resource}', on node1; + + cmd set_fr, on node1; + cmd set_md_wr, on node1; + } + } + + cmd clr_fr, on node1; + + cmd '/sbin/drbdadm attach {resource}', on node1; + + # everyone should be in connected cstate, uptodate dstate + expected 'cs', state 'Connected', timeout 15; + expected 'ds', state 'UpToDate', timeout 15; + + # switch to primary + cmd '/sbin/drbdadm primary {resource}', on node1; + + # Check node1 went primary... + expected 'st', state 'Primary', timeout 15, on node1; + + # simulate meta data write failures on partner node + INFO "3. Simulate remote meta data write failures during attach"; + cmd set_fr, on node2; + cmd set_md_rd, on node2; + + cmd '/sbin/drbdadm detach {resource}', on node1; + + # attach/detach a few times (want to see at least one failure! 
+ for (my $i = 0; $i < 5; ++$i) { + cmd '/sbin/drbdadm attach {resource}', on node1; + + sleep 2; + + if ((get state_ds, on node1) =~ /Diskless/) { + VERBOSE ("goodness: meta data write fault fired"); + } + else { + # might need to wait for resync here... disable + # faults whilst we wait + cmd clr_fr, on node2; + + expected 'cs', state 'Connected', timeout 500; + expected 'ds', state 'UpToDate', timeout 500; + + cmd '/sbin/drbdadm detach {resource}', on node1; + + cmd set_fr, on node2; + cmd set_md_rd, on node2; + } + } + + cmd clr_fr, on node2; + + # make sure partner is attached... + cmd '/sbin/drbdadm attach {resource}', on node2; + + # everyone should be in connected cstate, uptodate dstate + expected 'cs', state 'Connected', timeout 500; + expected 'ds', state 'UpToDate', timeout 500; + + # simulate read errors locally + INFO "4. Simulate local user data read failures"; + cmd set_fr, on node1; + cmd set_dt_rd, on node1; + + # mount file system a few times. + for (my $i = 0; $i < 5; ++$i) { + cmd 'mount /dev/{device} {mountpoint}', on node1; + cmd 'umount /dev/{device}', on node1; + + sleep 2; + + # state should not change - i.e. no resync (YET! Should get resync of failed block + # eventually) + expected 'cs', state 'Connected', timeout 15; + } + + cmd clr_fr, on node1; + + # simulate write errors locally + INFO "5. Simulate local user data write failures"; + cmd set_fr, on node1; + cmd set_dt_wr, on node1; + + # mount file system and modify - check for errors! + for (my $i = 0; $i < 5; ++$i) { + cmd 'mount /dev/{device} {mountpoint}', on node1; + + # create some files - should get some errors... + cmd 'cp -f /boot/* {mountpoint}', on node1; + + cmd 'umount /dev/{device}', on node1; + + sleep 2; + + # state should not change - i.e. no resync (YET!) + expected 'cs', state 'Connected', timeout 15; + } + + cmd clr_fr, on node1; + + # simulate read errors remotely + INFO "6. 
Simulate remote user data read failures"; + cmd 'drbdadm detach {resource}', on node1; + + cmd set_fr, on node2; + cmd set_dt_rd, on node2; + + # mount file system - check for errors! + for (my $i = 0; $i < 5; ++$i) { + cmd 'mount /dev/{device} {mountpoint}', on node1; + cmd 'umount /dev/{device}', on node1; + + sleep 2; + + # state should not change - i.e. no resync + expected 'cs', state 'Connected', timeout 15; + } + + cmd clr_fr, on node2; + + cmd 'drbdadm attach {resource}', on node1; + + # wait for resync to complete + expected 'cs', state 'Connected', timeout 15; + expected 'ds', state 'UpToDate', timeout 15; + + # simulate write errors remotely + INFO "6. Simulate remote user data write failures"; + cmd set_fr, on node2; + cmd set_dt_wr, on node2; + + for (my $i = 0; $i < 5; ++$i) { + # mount file system - check for errors! + cmd 'mount /dev/{device} {mountpoint}', on node1; + + # create some files - should get some errors... + cmd 'cp -f /boot/* {mountpoint}', on node1; + + cmd 'umount /dev/{device}', on node1; + + sleep 2; + } + + cmd clr_fr, on node2; +} + Index: testing/testsuite/testsuite.pl =================================================================== --- testing/testsuite/testsuite.pl (.../trunk) (revision 4106) +++ testing/testsuite/testsuite.pl (.../branches/drbd-panic) (revision 4106) @@ -83,6 +83,16 @@ #FILESYSTEM: $commands{'fs_make'} = 'mkfs.{filesystem} /dev/{device}'; #FIXME FileSystem - agent.conf!!' 
+#FAULTS +$commands{'set_fr'} = 'echo 10 >/sys/module/drbd/parameters/fault_rate'; +$commands{'clr_fr'} = 'echo 0 >/sys/module/drbd/parameters/fault_rate; echo 0 >/sys/module/drbd/parameters/enable_faults'; +$commands{'set_md_wr'} = 'echo 1 >/sys/module/drbd/parameters/enable_faults'; +$commands{'set_md_rd'} = 'echo 2 >/sys/module/drbd/parameters/enable_faults'; +$commands{'set_rs_wr'} = 'echo 4 >/sys/module/drbd/parameters/enable_faults'; +$commands{'set_rs_rd'} = 'echo 8 >/sys/module/drbd/parameters/enable_faults'; +$commands{'set_dt_wr'} = 'echo 16 >/sys/module/drbd/parameters/enable_faults'; +$commands{'set_dt_rd'} = 'echo 32 >/sys/module/drbd/parameters/enable_faults'; + ############################################################################### require 'getopts.pl'; @@ -154,10 +164,10 @@ elsif ($section == 4) { push @seqcommands, $_; if (/{/) { - $seqsection = 1; + $seqsection += 1; } if (/}/) { - $seqsection = 0; + $seqsection -= 1; } } elsif ($section == 1 or $section == 2) { @@ -210,7 +220,7 @@ $section = 4; } else { - ERROR ("unknown configuration"); + ERROR ("unknown configuration: ".$_); } } } @@ -335,7 +345,6 @@ return $reply; } - ############################################################################### ###### functions ############################################################################### @@ -603,7 +612,7 @@ my $yday; print LOGFILE "--------- TestSuite --------\n"; - foreach(sort(@logList)) { + foreach(@logList) { ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = (localtime($$_[0])); printf LOGFILE "%s %02d %02d:%02d:%02d ", $mnames[$mon], $mday, $hour, $min, $sec; print LOGFILE $$_[1]."\n"; @@ -674,6 +683,7 @@ while(($key, $value) = each(%commands)) { $seqcommands_eval =~ s/cmd $key/cmd '$value'/g; + $seqcommands_eval =~ s/get $key/get '$value'/g; } set_default_vars(); @@ -757,6 +767,19 @@ } +# print info messages +sub INFO { + my ($msg) = @_; + + if (defined($opt_l)) { + LOG($msg); + } + + print $msg. 
"\n"; + + return; +} + # print warn messages sub WARN { my ($msg) = @_;