[DRBD-cvs] svn commit by simon - r2427 - in trunk/drbd: . linux - Simon's anti-panic patch [phase 1] 1. In the case of me

Fri Sep 15 15:14:26 CEST 2006

Author: simon
Date: 2006-09-15 15:14:25 +0200 (Fri, 15 Sep 2006)
New Revision: 2427

Modified:
   trunk/drbd/drbd_actlog.c
   trunk/drbd/drbd_bitmap.c
   trunk/drbd/drbd_compat_wrappers.h
   trunk/drbd/drbd_int.h
   trunk/drbd/drbd_main.c
   trunk/drbd/drbd_nl.c
   trunk/drbd/drbd_receiver.c
   trunk/drbd/drbd_req.c
   trunk/drbd/drbd_worker.c
   trunk/drbd/linux/drbd_config.h
Log:
Simon's anti-panic patch [phase 1]

1. In the case of meta-data failures, I took the approach of forcibly
   detaching the disk even if the on-error setting is PassOn AND I
   made sure that this is done on ALL meta-data errors. 

2. To do this, I added a new Boolean parameter to drbd_chk_io_error
   and drbd_io_error that indicates if a detach should be forced - all
   meta-data cases pass TRUE and all user data cases pass FALSE. 

3. Apart from making sure that chk_io_error and io_error are called
   for all meta data cases, I also removed the panic()s from these
   failure cases. 

4. In order to test this, I introduced some fault insertion code -
   controlled by a new config macro, DRBD_ENABLE_FAULTS, off by
   default. This adds a couple of module parameters; a. fault_rate -
   integer is the % of times the specified fault should be inserted -
   the idea is that if you run enough tests with each fault enabled,
   eventually all failure code paths will be tested... b.
   enable_faults - bitmap of enabled faults - I broke it down into 6
   classes so far - meta-data, resync and data reads and writes. Every
   time an I/O is sent to the block layer, the code tests for the
   fault being active and if so it completes the bio with an error
   instead of sending it down.
   
There are also a few other minor fixes:

1. when reading the bitmap, I clear the BM_MD_IO_ERROR flag before
   starting - otherwise if this fails once, it will fail every
   subsequent time.  
   
2. some changes in tracing to help me debug - including fixing the
   packet dump trace code - this fix got lost somehow and received
   frames were printed incorrectly. 

3. At the end of drbd_nl_disk_conf, if a failure occurs AFTER the
   point of no return, I think it's necessary to set the local nbc
   value to NULL and NOT free it - since it has been put into the
   mdev->bc by this point, the error handling in drbd_force_state()
   will free the bc object and we'd end up freeing it twice (I THINK!)

4. drbd_al_to_on_disk_bm() - if inc_local_if_state() returns 0 pay
   attention!
   


Modified: trunk/drbd/drbd_actlog.c
===================================================================

--- trunk/drbd/drbd_actlog.c	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_actlog.c	2006-09-15 13:14:25 UTC (rev 2427)
@@ -49,12 +49,18 @@
 	bio->bi_private = &event;
 	bio->bi_end_io = drbd_md_io_complete;
 
+	if (FAULT_ACTIVE((rw & WRITE)? DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio,bio->bi_size,-EIO);
+	}
+	else {
 #ifdef BIO_RW_SYNC
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 #else
-	submit_bio(rw, bio);
-	drbd_blk_run_queue(bdev_get_queue(bdev->md_bdev));
+		submit_bio(rw, bio);
+		drbd_blk_run_queue(bdev_get_queue(bdev->md_bdev));
 #endif
+	}
 	wait_for_completion(&event);
 
 	ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -108,7 +114,11 @@
 			ok = _drbd_md_sync_page_io(bdev,iop,
 						   sector,READ,hardsect);
 
-			if (unlikely(!ok)) return 0;
+			if (unlikely(!ok)) {
+				ERR("drbd_md_sync_page_io(,%llu,READ [hardsect!=512]) failed!\n",
+				    (unsigned long long)sector);
+				return 0;
+			}
 
 			memcpy(hp + offset*MD_HARDSECT , p, MD_HARDSECT);
 		}
@@ -130,6 +140,7 @@
 	if (unlikely(!ok)) {
 		ERR("drbd_md_sync_page_io(,%llu,%s) failed!\n",
 		    (unsigned long long)sector,rw ? "WRITE" : "READ");
+		return 0;
 	}
 
 	if( hardsect != MD_HARDSECT && rw == READ ) {
@@ -326,8 +337,8 @@
 	sector = mdev->bc->md.md_offset + mdev->bc->md.al_offset + mdev->al_tr_pos;
 
 	if(!drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) {
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 	}
 
 	if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) {
@@ -361,6 +372,8 @@
 	sector = bdev->md.md_offset + bdev->md.al_offset + index;
 
 	if(!drbd_md_sync_page_io(mdev,bdev,sector,READ)) {
+		// Dont process error normally as this is done before
+		// disk is atached!
 		return -1;
 	}
 
@@ -500,22 +513,20 @@
 
 	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 
-	i=inc_local_if_state(mdev,Attaching);
-	D_ASSERT( i ); // Assertions should not have side effects.
-	// I do not want to have D_ASSERT( inc_md_only(mdev,Attaching) );
+	if (inc_local_if_state(mdev,Attaching)) {
+		for(i=0;i<mdev->act_log->nr_elements;i++) {
+			enr = lc_entry(mdev->act_log,i)->lc_number;
+			if(enr == LC_FREE) continue;
+			/* TODO encapsulate and optimize within drbd_bitmap
+			 * currently, if we have al-extents 16..19 active,
+			 * sector 4 will be written four times! */
+			drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT );
+		}
 
-	for(i=0;i<mdev->act_log->nr_elements;i++) {
-		enr = lc_entry(mdev->act_log,i)->lc_number;
-		if(enr == LC_FREE) continue;
-		/* TODO encapsulate and optimize within drbd_bitmap
-		 * currently, if we have al-extents 16..19 active,
-		 * sector 4 will be written four times! */
-		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT );
-	}
-
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-	dec_local(mdev);
+		lc_unlock(mdev->act_log);
+		wake_up(&mdev->al_wait);
+		dec_local(mdev);
+	} else D_ASSERT(0);
 }
 
 /**

Modified: trunk/drbd/drbd_bitmap.c
===================================================================
--- trunk/drbd/drbd_bitmap.c	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_bitmap.c	2006-09-15 13:14:25 UTC (rev 2427)
@@ -653,7 +653,13 @@
 	bio_add_page(bio, page, len, 0);
 	bio->bi_private = b;
 	bio->bi_end_io = drbd_bm_async_io_complete;
-	submit_bio(rw, bio);
+
+	if (FAULT_ACTIVE((rw&WRITE)?DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio,bio->bi_size,-EIO);
+	}
+	else
+		submit_bio(rw, bio);
 }
 /* read one sector of the on disk bitmap into memory.
  * on disk bitmap is little endian.
@@ -684,8 +690,8 @@
 		ERR( "IO ERROR reading bitmap sector %lu "
 		     "(meta-disk sector %llu)\n",
 		     enr, (unsigned long long)on_disk_sector );
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
 			drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i);
 	}
@@ -755,6 +761,8 @@
 
 	now = jiffies;
 	atomic_set(&b->bm_async_io, num_pages);
+	__clear_bit(BM_MD_IO_ERROR,&b->bm_flags);
+
 	for (i = 0; i < num_pages; i++) {
 		/* let the layers below us try to merge these bios... */
 		drbd_bm_page_io_async(mdev,b,i,rw);
@@ -770,8 +778,8 @@
 		 * detach?
 		 */
 		ALERT("we had at least one MD IO ERROR during bitmap IO\n");
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 	}
 
 	now = jiffies;
@@ -836,8 +844,8 @@
 		ERR( "IO ERROR writing bitmap sector %lu "
 		     "(meta-disk sector %lu)\n",
 		     enr, (unsigned long)on_disk_sector );
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
 			drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i);
 	}

Modified: trunk/drbd/drbd_compat_wrappers.h
===================================================================
--- trunk/drbd/drbd_compat_wrappers.h	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_compat_wrappers.h	2006-09-15 13:14:25 UTC (rev 2427)
@@ -129,7 +129,7 @@
 /*
  * used to submit our private bio
  */
-static inline void drbd_generic_make_request(int rw, struct bio *bio)
+static inline void drbd_generic_make_request(int rw, int fault_type, struct bio *bio)
 {
 	bio->bi_rw = rw; // on the receiver side, e->..rw was not yet defined.
 
@@ -140,7 +140,10 @@
 		return;
 	}
 
-	generic_make_request(bio);
+	if (FAULT_ACTIVE(fault_type))
+		bio_endio(bio,bio->bi_size,-EIO);
+	else
+		generic_make_request(bio);
 }
 
 static inline void drbd_plug_device(drbd_dev *mdev)

Modified: trunk/drbd/drbd_int.h
===================================================================
--- trunk/drbd/drbd_int.h	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_int.h	2006-09-15 13:14:25 UTC (rev 2427)
@@ -43,6 +43,11 @@
 extern int major_nr;
 extern int use_nbd_major;
 
+#ifdef DRBD_ENABLE_FAULTS
+extern int enable_faults;
+extern int fault_rate;
+#endif
+
 #include <linux/major.h>
 #ifdef DRBD_MAJOR
 # warning "FIXME. DRBD_MAJOR is now officially defined in major.h"
@@ -187,6 +192,27 @@
 	 _b; \
 	}))
 
+// Defines to control fault insertion
+enum {
+    DRBD_FAULT_MD_WR = 0,
+    DRBD_FAULT_MD_RD,
+    DRBD_FAULT_RS_WR,
+    DRBD_FAULT_RS_RD,
+    DRBD_FAULT_DT_WR,
+    DRBD_FAULT_DT_RD,
+
+    DRBD_FAULT_MAX,
+};
+
+#ifdef DRBD_ENABLE_FAULTS
+#define FAULT_ACTIVE(_t) \
+    (fault_rate && (enable_faults & (1<<(_t))) && _drbd_insert_fault(_t))
+
+extern unsigned int _drbd_insert_fault(unsigned int type);
+#else
+#define FAULT_ACTIVE(_t) (0)
+#endif
+
 #include <linux/stringify.h>
 // integer division, round _UP_ to the next integer
 #define div_ceil(A,B) ( (A)/(B) + ((A)%(B) ? 1 : 0) )
@@ -970,7 +996,7 @@
 extern int drbd_send_discard(drbd_dev *mdev, drbd_request_t *req);
 extern int drbd_send_sr_reply(drbd_dev *mdev, int retcode);
 extern void drbd_free_bc(struct drbd_backing_dev* bc);
-extern int drbd_io_error(drbd_dev* mdev);
+extern int drbd_io_error(drbd_dev* mdev, int forcedetach);
 extern void drbd_mdev_cleanup(drbd_dev *mdev);
 
 // drbd_meta-data.c (still in drbd_main.c)
@@ -1318,13 +1344,22 @@
  * drbd_chk_io_error: Handles the on_io_error setting, should be called from
  * all io completion handlers. See also drbd_io_error().
  */
-static inline void __drbd_chk_io_error(drbd_dev* mdev)
+static inline void __drbd_chk_io_error(drbd_dev* mdev, int forcedetach)
 {
 	/* FIXME cleanup the messages here */
 	switch(mdev->bc->dc.on_io_error) {
-	case PassOn: /* FIXME should the better be named "Ignore"? */
-		ERR("Ignoring local IO error!\n");
+	case PassOn: /* FIXME would this be better named "Ignore"? */
+	    if (!forcedetach) {
+		ERR("Local IO failed. Passing error on...\n");
 		break;
+	    }
+	    /* NOTE fall through to detach case if forcedetach set */
+	case Detach:
+		if (_drbd_set_state(mdev,_NS(disk,Failed),ChgStateHard) 
+		    == SS_Success) {
+			ERR("Local IO failed. Detaching...\n");
+		}
+		break;
 	case Panic:
 		_drbd_set_state(mdev,_NS(disk,Failed),ChgStateHard);
 		/* FIXME this is very ugly anyways.
@@ -1332,21 +1367,15 @@
 		 * while holding the req_lock hand with irq disabled. */
 		drbd_panic("IO error on backing device!\n");
 		break;
-	case Detach:
-		if (_drbd_set_state(mdev,_NS(disk,Failed),ChgStateHard) 
-		    == SS_Success) {
-			ERR("Local IO failed. Detaching...\n");
-		}
-		break;
 	}
 }
 
-static inline void drbd_chk_io_error(drbd_dev* mdev, int error)
+static inline void drbd_chk_io_error(drbd_dev* mdev, int error, int forcedetach)
 {
 	if (error) {
 		unsigned long flags;
 		spin_lock_irqsave(&mdev->req_lock,flags);
-		__drbd_chk_io_error(mdev);
+		__drbd_chk_io_error(mdev,forcedetach);
 		spin_unlock_irqrestore(&mdev->req_lock,flags);
 	}
 }

Modified: trunk/drbd/drbd_main.c
===================================================================
--- trunk/drbd/drbd_main.c	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_main.c	2006-09-15 13:14:25 UTC (rev 2427)
@@ -50,6 +50,7 @@
 #include <linux/random.h>
 #include <linux/reboot.h>
 #include <linux/notifier.h>
+#include <linux/byteorder/swabb.h>
 
 #define __KERNEL_SYSCALLS__
 #include <linux/unistd.h>
@@ -106,6 +107,13 @@
 module_param(minor_count,      int,0);
 module_param(disable_bd_claim,bool,0);
 
+#ifdef DRBD_ENABLE_FAULTS
+int enable_faults = 0;
+int fault_rate;
+module_param(enable_faults,int,0664);	// bitmap of enabled faults
+module_param(fault_rate,int,0664);	// fault rate % value - applies to all enabled faults
+#endif
+
 // module parameter, defined
 int major_nr = LANANA_DRBD_MAJOR;
 int minor_count = 32;
@@ -340,17 +348,22 @@
  * unlikely(!drbd_bio_uptodate(e->bio)) case from kernel thread context.
  * See also drbd_chk_io_error
  *
- * NOTE: we set ourselves DISKLESS here.
- * But we try to write the "need full sync bit" here anyways.  This is to make sure
- * that you get a resynchronisation of the full device the next time you
- * connect.
+ * NOTE: we set ourselves FAILED here if on_io_error is Detach or Panic OR
+ *	 if the forcedetach flag is set. This flag is set when failures
+ *	 occur writing the meta data portion of the disk as they are
+ *	 not recoverable. We also try to write the "need full sync bit" here 
+ *	 anyways.  This is to make sure that you get a resynchronisation of 
+ *	 the full device the next time you connect.
  */
-int drbd_io_error(drbd_dev* mdev)
+int drbd_io_error(drbd_dev* mdev, int forcedetach)
 {
 	unsigned long flags;
 	int send,ok=1;
 
-	if(mdev->bc->dc.on_io_error != Panic && mdev->bc->dc.on_io_error != Detach) return 1;
+	if(!forcedetach &&
+	   mdev->bc->dc.on_io_error != Panic && 
+	   mdev->bc->dc.on_io_error != Detach) 
+		return 1;
 
 	spin_lock_irqsave(&mdev->req_lock,flags);
 	if( (send = (mdev->state.disk == Failed)) ) {
@@ -365,9 +378,18 @@
 	if (ok) WARN("Notified peer that my disk is broken.\n");
 	else ERR("Sending state in drbd_io_error() failed\n");
 
+#if 0
+// warning SPG
+// This code seems wrong -- we only get here if we are set to
+// detach in which case we have no local disk, so there's no
+// point asserting that a full sync is needed.
+// Flushing the meta data is probably also wrong -- we want
+// this node to appear out of date so we should deliberately
+// NOT update the meta data with the latest epoch info!
 	D_ASSERT(drbd_md_test_flag(mdev->bc,MDF_FullSync));
 	D_ASSERT(!drbd_md_test_flag(mdev->bc,MDF_Consistent));
 	drbd_md_sync(mdev);
+#endif
 
 	/* Releasing the backing device is done in after_state_ch() */
 
@@ -756,13 +778,6 @@
 		D_ASSERT(i);
 	}
 
-	if ( ns.role == Primary && ns.conn < Connected &&
-	     ns.disk < Consistent ) {
-// warning LGE "ugly and wrong"
-// warning LGE "FIXME code missing"
-		drbd_panic("No access to good data anymore.\n");
-	}
-
 	if( flags & ScheduleAfter ) {
 		struct after_state_chg_work* ascw;
 
@@ -861,23 +876,26 @@
 			kfree(mdev->p_uuid);
 			mdev->p_uuid = NULL;
 		}
-		if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 ) {
-			/* Only do it if we have not yet done it... */
-			INFO("Creating new current UUID\n");
-			drbd_uuid_new_current(mdev);
+		if (inc_local(mdev)) {
+			if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 ) {
+				/* Only do it if we have not yet done it... */
+				INFO("Creating new current UUID\n");
+				drbd_uuid_new_current(mdev);
+			}
+			if (ns.peer == Primary ) { 
+				/* Note: The condition ns.peer == Primary implies
+				   that we are connected. Otherwise it would
+				   be ns.peer == Unknown. */
+				/* Our peer lost its disk.
+				   Not rotation into BitMap-UUID! A FullSync is 
+				   required after a primary detached from it disk! */
+				u64 uuid;
+				INFO("Creating new current UUID [no BitMap]\n");
+				get_random_bytes(&uuid, sizeof(u64));
+				drbd_uuid_set(mdev, Current, uuid);
+			}
+			dec_local(mdev);
 		}
-		if (ns.peer == Primary ) { 
- 			/* Note: The condition ns.peer == Primary implies
-			         that we are connected. Otherwise it would
-				 be ns.peer == Unknown. */
- 			/* Our peer lost its disk.
-			   Not rotation into BitMap-UUID! A FullSync is 
-			   required after a primary detached from it disk! */
-			u64 uuid;
-			INFO("Creating new current UUID [no BitMap]\n");
-			get_random_bytes(&uuid, sizeof(u64));
-			drbd_uuid_set(mdev, Current, uuid);
-		}
 	}
 
 	if( ns.pdsk < Inconsistent ) {
@@ -1040,7 +1058,7 @@
 
 	spin_lock(&thi->t_lock);
 
-	/* INFO("%s [%d]: %s %d -> Running\n",
+	/* INFO("drbd_thread_start: %s [%d]: %s %d -> Running\n",
 	     current->comm, current->pid,
 	     thi == &mdev->receiver ? "receiver" :
              thi == &mdev->asender  ? "asender"  :
@@ -1073,7 +1091,7 @@
 
 	spin_lock(&thi->t_lock);
 
-	/* INFO("%s [%d]: %s %d -> %d; %d\n",
+	/* INFO("drbd_thread_stop: %s [%d]: %s %d -> %d; %d\n",
 	     current->comm, current->pid,
 	     thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */
 
@@ -1100,7 +1118,6 @@
 			force_sig(DRBD_SIGKILL,thi->task);
 		else
 			D_ASSERT(!wait);
-
 	}
 	spin_unlock(&thi->t_lock);
 
@@ -1340,16 +1357,15 @@
 		drbd_bm_set_all(mdev);
 		drbd_bm_write(mdev);
 		if (unlikely(mdev->state.disk <= Failed )) {
-			/* write_bm did fail! panic.
-			 * FIXME can we do something better than panic?
-			 */
-// warning LGE "ugly and wrong"
-			drbd_panic("Failed to write bitmap to disk\n!");
-			ok = FALSE;
-			goto out;
+			/* write_bm did fail! Leave full sync flag set in Meta Data
+			 * but otherwise process as per normal - need to tell other
+			 * side that a full resync is required! */
+			ERR("Failed to write bitmap to disk!\n");
 		}
-		drbd_md_clear_flag(mdev,MDF_FullSync);
-		drbd_md_sync(mdev);
+		else {
+			drbd_md_clear_flag(mdev,MDF_FullSync);
+			drbd_md_sync(mdev);
+		}
 	}
 
 	/*
@@ -1367,7 +1383,6 @@
 		bm_i += num_words;
 	} while (ok && want);
 
-  out:
 	vfree(p);
 	return ok;
 }
@@ -2530,17 +2545,11 @@
 	if (drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) {
 		clear_bit(MD_DIRTY,&mdev->flags);
 	} else {
-		if (mdev->state.disk <= Failed) {
-			/* this was a try anyways ... */
-			ERR("meta data update failed!\n");
-		} else {
-			/* If we cannot write our meta data,
-			 * but we are supposed to be able to,
-			 * tough!
-			 */
-// warning LGE "ugly and wrong"
-			drbd_panic("meta data update failed!\n");
-		}
+		/* this was a try anyways ... */
+		ERR("meta data update failed!\n");
+
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 	}
 
 	// Update mdev->bc->md.la_size_sect, since we updated it on metadata.
@@ -2568,6 +2577,8 @@
 	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
 
 	if ( ! drbd_md_sync_page_io(mdev,bdev,bdev->md.md_offset,READ) ) {
+		/* NOTE: cant do normal error processing here as this is
+		   called BEFORE disk is attached */
 		ERR("Error while reading metadata.\n");
 		rv = MDIOError;
 		goto err;
@@ -2732,6 +2743,65 @@
 	return 1;
 }
 
+#ifdef DRBD_ENABLE_FAULTS
+// Fault insertion support including random number generator shamelessly
+// stolen from kernel/rcutorture.c
+struct fault_random_state {
+	unsigned long state;
+	unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD	479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+	long refresh;
+
+	if (--rsp->count < 0) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rsp->state += refresh;
+		rsp->count = FAULT_RANDOM_REFRESH;
+	}
+	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+	return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+    static char *_faults[] = {
+	"Meta-data write",
+	"Meta-data read",
+	"Resync write",
+	"Resync read",
+	"Data write",
+	"Data read",
+    };
+
+    return (type < DRBD_FAULT_MAX)? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(unsigned int type)
+{
+    static struct fault_random_state rrs = {0,0};
+
+    unsigned int rnd = ((_drbd_fault_random(&rrs) % 100) + 1);
+    unsigned int ret = (rnd <= fault_rate);
+
+    if (ret)
+	printk(KERN_ALERT "Simulating %s failure\n", _drbd_fault_str(type));
+
+    return ret;
+}
+#endif
+
 #ifdef DUMP_EACH_PACKET
 #define PSM(A) \
 do { \

Modified: trunk/drbd/drbd_nl.c
===================================================================
--- trunk/drbd/drbd_nl.c	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_nl.c	2006-09-15 13:14:25 UTC (rev 2427)
@@ -738,7 +738,7 @@
 		goto release_bdev3_fail;
 	}
 
-	// Since ware are diskless, fix the AL first...
+	// Since we are diskless, fix the AL first...
 	if (drbd_check_al_size(mdev)) {
 		retcode = KMallocFailed;
 		goto release_bdev3_fail;
@@ -878,8 +878,10 @@
 	return 0;
 
  release_bdev3_fail:
+	nbc = NULL; /* will be freed by state change below */
 	drbd_force_state(mdev,NS(disk,Diskless));
 	drbd_md_sync(mdev);
+	goto fail;
  release_bdev2_fail:
 	bd_release(nbc->md_bdev);
  release_bdev_fail:

Modified: trunk/drbd/drbd_receiver.c
===================================================================
--- trunk/drbd/drbd_receiver.c	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_receiver.c	2006-09-15 13:14:25 UTC (rev 2427)
@@ -1001,7 +1001,7 @@
 		set_bit(SYNC_STARTED,&mdev->flags);
 	} else {
 		ok = drbd_send_ack(mdev,NegAck,e);
-		ok&= drbd_io_error(mdev);
+		ok&= drbd_io_error(mdev, FALSE);
 	}
 	dec_unacked(mdev);
 
@@ -1028,7 +1028,7 @@
 	list_add(&e->w.list,&mdev->sync_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
-	drbd_generic_make_request(WRITE,e->private_bio);
+	drbd_generic_make_request(WRITE,DRBD_FAULT_RS_WR,e->private_bio);
 	/* accounting done in endio */
 
 	maybe_kick_lo(mdev);
@@ -1155,14 +1155,14 @@
 			 * may not list_del() the ee before this callback did run...
 			 * maybe even move the list_del(e) in here... */
 			ok = drbd_send_ack(mdev,NegAck,e);
-			ok&= drbd_io_error(mdev);
+			ok&= drbd_io_error(mdev, FALSE);
 			/* we expect it to be marked out of sync anyways...
 			 * maybe assert this?  */
 		}
 		dec_unacked(mdev);
 		return ok;
 	} else if(unlikely(!drbd_bio_uptodate(e->private_bio))) {
-		ok = drbd_io_error(mdev);
+		ok = drbd_io_error(mdev, FALSE);
 	}
 
 // warning LGE "FIXME code missing"
@@ -1436,7 +1436,7 @@
 	}
 
 	/* FIXME drbd_al_begin_io in case we have two primaries... */
-	drbd_generic_make_request(WRITE,e->private_bio);
+	drbd_generic_make_request(WRITE,DRBD_FAULT_DT_WR,e->private_bio);
 	/* accounting done in endio */
 
 	maybe_kick_lo(mdev);
@@ -1457,6 +1457,7 @@
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	struct Tl_epoch_entry *e;
 	int size;
+	unsigned int fault_type;
 	Drbd_BlockRequest_Packet *p = (Drbd_BlockRequest_Packet*)h;
 
 	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
@@ -1501,9 +1502,11 @@
 	switch (h->command) {
 	case DataRequest:
 		e->w.cb = w_e_end_data_req;
+		fault_type = DRBD_FAULT_DT_RD;
 		break;
 	case RSDataRequest:
 		e->w.cb = w_e_end_rsdata_req;
+		fault_type = DRBD_FAULT_RS_RD;
 		/* Eventually this should become asynchrously. Currently it
 		 * blocks the whole receiver just to delay the reading of a
 		 * resync data block.
@@ -1519,11 +1522,12 @@
 		}
 		break;
 	default:; /* avoid compiler warning */
+		fault_type = DRBD_FAULT_MAX;
 	}
 
 	inc_unacked(mdev);
 	/* FIXME actually, it could be a READA originating from the peer ... */
-	drbd_generic_make_request(READ,e->private_bio);
+	drbd_generic_make_request(READ,fault_type,e->private_bio);
 	maybe_kick_lo(mdev);
 
 	return TRUE;
@@ -1753,15 +1757,22 @@
 	//WARN("uuid_compare()=%d\n",hg);
 
 	if (hg == 100) {
-		if ( mdev->state.role==Secondary && peer_role==Secondary ) {
+		int pcount = (mdev->state.role==Primary) + (peer_role==Primary);
+
+		switch (pcount) {
+		case 0:
 			hg = drbd_asb_recover_0p(mdev);
-		} else if (mdev->state.role==Primary && peer_role==Primary) {
+			break;
+		case 1:
+			hg = drbd_asb_recover_1p(mdev);
+			break;
+		case 2:
 			hg = drbd_asb_recover_2p(mdev);
-		} else {
-			hg = drbd_asb_recover_1p(mdev);
+			break;
 		}
 		if ( abs(hg) < 100 ) {
-			WARN("Split-Brain detected, automatically solved.\n");
+			WARN("Split-Brain detected, %d primaries, automatically solved. Sync from %s node\n",
+			     pcount, (hg < 0) ? "peer":"this");
 		}
 	}
 
@@ -1774,7 +1785,8 @@
 		}
 
 		if ( abs(hg) < 100 ) {
-			WARN("Split-Brain detected, manually solved.\n");
+			WARN("Split-Brain detected, manually solved. Sync from %s node\n",
+			     (hg < 0) ? "peer":"this");
 		}
 	}
 	
@@ -1794,6 +1806,8 @@
 
 	if (hg == -100) {
 		ALERT("Split-Brain detected, dropping connection!\n");
+		drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid);
+		drbd_uuid_dump(mdev,"peer",mdev->p_uuid);
 		drbd_force_state(mdev,NS(conn,StandAlone));
 		drbd_thread_stop_nowait(&mdev->receiver);
 		return conn_mask;
@@ -2413,8 +2427,6 @@
 		else
 			handler = NULL;
 
-		dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__);
-
 		if (unlikely(!handler)) {
 			ERR("unknown packet type %d, l: %d!\n",
 			    header->command, header->length);
@@ -2425,6 +2437,8 @@
 			    cmdname(header->command), header->length);
 			break;
 		}
+
+		dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__);
 	}
 }
 
@@ -2893,17 +2907,15 @@
 		return FALSE;
 	}
 
-	/* FIXME what for ?? list_del(&req->w.list); */
+	ERR("Got NegDReply; Sector %llx, len %x; Fail original request.\n",
+	    (unsigned long long)sector,be32_to_cpu(p->blksize));
+
 	_req_mod(req, neg_acked);
 	spin_unlock_irq(&mdev->req_lock);
 
 // warning LGE "ugly and wrong"
 	drbd_khelper(mdev,"pri-on-incon-degr");
-	drbd_panic("Got NegDReply. WE ARE LOST. We lost our up-to-date disk.\n");
 
-	// THINK do we have other options, but panic?
-	//       what about bio_endio, in case we don't panic ??
-
 	return TRUE;
 }
 

Modified: trunk/drbd/drbd_req.c
===================================================================
--- trunk/drbd/drbd_req.c	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_req.c	2006-09-15 13:14:25 UTC (rev 2427)
@@ -116,7 +116,7 @@
 		/*
 		 * figure out whether to report success or failure.
 		 *
-		 * report success when at least one of the oprations suceeded.
+		 * report success when at least one of the operations suceeded.
 		 * or, to put the other way,
 		 * only report failure, when both operations failed.
 		 *
@@ -302,7 +302,7 @@
 				      "	[DISCARD L] new: %llu +%d; pending: %llu +%d\n",
 				      current->comm, current->pid,
 				      (unsigned long long)sector, size,
-				      e->sector, e->size);
+				      (unsigned long long)e->sector, e->size);
 				return 1;
 			}
 		}
@@ -399,10 +399,10 @@
 		req->private_bio = NULL;
 		dec_local(mdev);
 		ALERT("Local WRITE failed sec=%llu size=%u\n",
-					req->sector, req->size);
+		      (unsigned long long)req->sector, req->size);
 		/* and now: check how to handle local io error.
 		 * FIXME see comment below in read_completed_with_error */
-		__drbd_chk_io_error(mdev);
+		__drbd_chk_io_error(mdev,FALSE);
 		_req_may_be_done(req);
 		break;
 
@@ -419,7 +419,7 @@
 			break;
 		/* else */
 		ALERT("Local READ failed sec=%llu size=%u\n",
-					req->sector, req->size);
+		      (unsigned long long)req->sector, req->size);
 		/* _req_mod(req,to_be_send); oops, recursion in static inline */
 		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 		req->rq_state |= RQ_NET_PENDING;
@@ -434,7 +434,7 @@
 		 * private bio then, and round the offset and size so
 		 * we get back enough data to be able to clear the bits again.
 		 */
-		__drbd_chk_io_error(mdev);
+		__drbd_chk_io_error(mdev,FALSE);
 		/* fall through: _req_mod(req,queue_for_net_read); */
 
 	case queue_for_net_read:
@@ -943,7 +943,11 @@
 
 	if (local) {
 		req->private_bio->bi_bdev = mdev->bc->backing_bdev;
-		generic_make_request(req->private_bio);
+
+		if (FAULT_ACTIVE(rw==WRITE? DRBD_FAULT_DT_WR : DRBD_FAULT_DT_RD))
+			bio_endio(req->private_bio, req->private_bio->bi_size, -EIO);
+		else
+			generic_make_request(req->private_bio);
 	}
 	return 0;
 
@@ -991,7 +995,6 @@
 		ERR("Sorry, I have no access to good data anymore.\n");
 		/*
 		 * FIXME suspend, loop waiting on cstate wait?
-		 * panic?
 		 */
 		return 1;
 	}

Modified: trunk/drbd/drbd_worker.c
===================================================================
--- trunk/drbd/drbd_worker.c	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/drbd_worker.c	2006-09-15 13:14:25 UTC (rev 2427)
@@ -87,7 +87,7 @@
 	if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait);
 	spin_unlock_irqrestore(&mdev->req_lock,flags);
 
-	drbd_chk_io_error(mdev,error);
+	drbd_chk_io_error(mdev,error,FALSE);
 	drbd_queue_work(&mdev->data.work,&e->w);
 	dec_local(mdev);
 	return 0;
@@ -129,7 +129,7 @@
 		? list_empty(&mdev->sync_ee)
 		: list_empty(&mdev->active_ee);
 
-	if (error) __drbd_chk_io_error(mdev);
+	if (error) __drbd_chk_io_error(mdev,FALSE);
 	spin_unlock_irqrestore(&mdev->req_lock,flags);
 
 	if (do_wake) wake_up(&mdev->ee_wait);
@@ -184,7 +184,7 @@
 
 	if(unlikely(cancel)) return 1;
 
-	ok = drbd_io_error(mdev);
+	ok = drbd_io_error(mdev, FALSE);
 	if(unlikely(!ok)) ERR("Sending in w_io_error() failed\n");
 	return ok;
 }
@@ -208,7 +208,7 @@
 	/* FIXME this is ugly. we should not detach for read io-error,
 	 * but try to WRITE the DataReply to the failed location,
 	 * to give the disk the chance to relocate that block */
-	drbd_io_error(mdev); /* tries to schedule a detach and notifies peer */
+	drbd_io_error(mdev,FALSE); /* tries to schedule a detach and notifies peer */
 	return w_send_read_req(mdev,w,0);
 }
 
@@ -440,7 +440,7 @@
 		/* FIXME we should not detach for read io-errors, in particular
 		 * not now: when the peer asked us for our data, we are likely
 		 * the only remaining disk... */
-		drbd_io_error(mdev);
+		drbd_io_error(mdev,FALSE);
 	}
 
 	dec_unacked(mdev);
@@ -486,8 +486,8 @@
 	} else {
 		ok=drbd_send_ack(mdev,NegRSDReply,e);
 		if (DRBD_ratelimit(5*HZ,5))
-			ERR("Sending NegDReply. I guess it gets messy.\n");
-		drbd_io_error(mdev);
+			ERR("Sending NegRSDReply. I guess it gets messy.\n");
+		drbd_io_error(mdev, FALSE);
 	}
 
 	dec_unacked(mdev);

Modified: trunk/drbd/linux/drbd_config.h
===================================================================
--- trunk/drbd/linux/drbd_config.h	2006-09-15 11:45:26 UTC (rev 2426)
+++ trunk/drbd/linux/drbd_config.h	2006-09-15 13:14:25 UTC (rev 2427)
@@ -48,4 +48,7 @@
 // for troubles.
 // #define DRBD_DISABLE_SENDPAGE
 
+// Enable fault insertion code
+//#define DRBD_ENABLE_FAULTS
+
 #endif