Index: drbd/drbd_receiver.c
===================================================================
--- drbd/drbd_receiver.c	(.../trunk)	(revision 4048)
+++ drbd/drbd_receiver.c	(.../branches/drbd-panic)	(revision 4048)
@@ -1000,7 +1000,7 @@
 		set_bit(SYNC_STARTED,&mdev->flags);
 	} else {
 		ok = drbd_send_ack(mdev,NegAck,e);
-		ok&= drbd_io_error(mdev);
+		ok&= drbd_io_error(mdev, FALSE);
 	}
 	dec_unacked(mdev);
 
@@ -1027,7 +1027,7 @@
 	list_add(&e->w.list,&mdev->sync_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
-	drbd_generic_make_request(WRITE,e->private_bio);
+	drbd_generic_make_request(WRITE,DRBD_FAULT_RS_WR,e->private_bio);
 	/* accounting done in endio */
 
 	maybe_kick_lo(mdev);
@@ -1154,14 +1154,14 @@
 			 * may not list_del() the ee before this callback did run...
 			 * maybe even move the list_del(e) in here... */
 			ok = drbd_send_ack(mdev,NegAck,e);
-			ok&= drbd_io_error(mdev);
+			ok&= drbd_io_error(mdev, FALSE);
 			/* we expect it to be marked out of sync anyways...
 			 * maybe assert this?  */
 		}
 		dec_unacked(mdev);
 		return ok;
 	} else if(unlikely(!drbd_bio_uptodate(e->private_bio))) {
-		ok = drbd_io_error(mdev);
+		ok = drbd_io_error(mdev, FALSE);
 	}
 
 // warning LGE "FIXME code missing"
@@ -1428,7 +1428,7 @@
 	}
 
 	/* FIXME drbd_al_begin_io in case we have two primaries... */
-	drbd_generic_make_request(WRITE,e->private_bio);
+	drbd_generic_make_request(WRITE,DRBD_FAULT_DT_WR,e->private_bio);
 	/* accounting done in endio */
 
 	maybe_kick_lo(mdev);
@@ -1449,6 +1449,7 @@
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	struct Tl_epoch_entry *e;
 	int size;
+	unsigned int fault_type;
 	Drbd_BlockRequest_Packet *p = (Drbd_BlockRequest_Packet*)h;
 
 	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
@@ -1493,9 +1494,11 @@
 	switch (h->command) {
 	case DataRequest:
 		e->w.cb = w_e_end_data_req;
+		fault_type = DRBD_FAULT_DT_RD;
 		break;
 	case RSDataRequest:
 		e->w.cb = w_e_end_rsdata_req;
+		fault_type = DRBD_FAULT_RS_RD;
 		/* Eventually this should become asynchrously. Currently it
 		 * blocks the whole receiver just to delay the reading of a
 		 * resync data block.
@@ -1511,11 +1514,12 @@
 		}
 		break;
 	default:; /* avoid compiler warning */
+		fault_type = DRBD_FAULT_MAX;
 	}
 
 	inc_unacked(mdev);
 	/* FIXME actually, it could be a READA originating from the peer ... */
-	drbd_generic_make_request(READ,e->private_bio);
+	drbd_generic_make_request(READ,fault_type,e->private_bio);
 	maybe_kick_lo(mdev);
 
 	return TRUE;
@@ -1745,15 +1749,22 @@
 	//WARN("uuid_compare()=%d\n",hg);
 
 	if (hg == 100) {
-		if ( mdev->state.role==Secondary && peer_role==Secondary ) {
+		int pcount = (mdev->state.role==Primary) + (peer_role==Primary);
+
+		switch (pcount) {
+		case 0:
 			hg = drbd_asb_recover_0p(mdev);
-		} else if (mdev->state.role==Primary && peer_role==Primary) {
+			break;
+		case 1:
+			hg = drbd_asb_recover_1p(mdev);
+			break;
+		case 2:
 			hg = drbd_asb_recover_2p(mdev);
-		} else {
-			hg = drbd_asb_recover_1p(mdev);
+			break;
 		}
 		if ( abs(hg) < 100 ) {
-			WARN("Split-Brain detected, automatically solved.\n");
+			WARN("Split-Brain detected, %d primaries, automatically solved. Sync from %s node\n",
+			     pcount, (hg < 0) ? "peer":"this");
 		}
 	}
 
@@ -1766,7 +1777,8 @@
 		}
 
 		if ( abs(hg) < 100 ) {
-			WARN("Split-Brain detected, manually solved.\n");
+			WARN("Split-Brain detected, manually solved. Sync from %s node\n",
+			     (hg < 0) ? "peer":"this");
 		}
 	}
 	
@@ -1786,6 +1798,8 @@
 
 	if (hg == -100) {
 		ALERT("Split-Brain detected, dropping connection!\n");
+		drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid);
+		drbd_uuid_dump(mdev,"peer",mdev->p_uuid);
 		drbd_force_state(mdev,NS(conn,StandAlone));
 		drbd_thread_stop_nowait(&mdev->receiver);
 		return conn_mask;
@@ -2407,8 +2421,6 @@
 		else
 			handler = NULL;
 
-		dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__);
-
 		if (unlikely(!handler)) {
 			ERR("unknown packet type %d, l: %d!\n",
 			    header->command, header->length);
@@ -2419,6 +2431,8 @@
 			    cmdname(header->command), header->length);
 			break;
 		}
+
+		dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__);
 	}
 }
 
@@ -2887,17 +2901,15 @@
 		return FALSE;
 	}
 
-	/* FIXME what for ?? list_del(&req->w.list); */
+	ERR("Got NegDReply; Sector %llx, len %x; Fail original request.\n",
+	    (unsigned long long)sector,be32_to_cpu(p->blksize));
+
 	_req_mod(req, neg_acked);
 	spin_unlock_irq(&mdev->req_lock);
 
 // warning LGE "ugly and wrong"
 	drbd_khelper(mdev,"pri-on-incon-degr");
-	drbd_panic("Got NegDReply. WE ARE LOST. We lost our up-to-date disk.\n");
 
-	// THINK do we have other options, but panic?
-	//       what about bio_endio, in case we don't panic ??
-
 	return TRUE;
 }
 
Index: drbd/drbd_nl.c
===================================================================
--- drbd/drbd_nl.c	(.../trunk)	(revision 4048)
+++ drbd/drbd_nl.c	(.../branches/drbd-panic)	(revision 4048)
@@ -738,7 +738,7 @@
 		goto release_bdev3_fail;
 	}
 
-	// Since ware are diskless, fix the AL first...
+	// Since we are are diskless, fix the AL first...
 	if (drbd_check_al_size(mdev)) {
 		retcode = KMallocFailed;
 		goto release_bdev3_fail;
@@ -877,8 +877,10 @@
 	return 0;
 
  release_bdev3_fail:
+	nbc = NULL; /* will be freed by state change below */
 	drbd_force_state(mdev,NS(disk,Diskless));
 	drbd_md_sync(mdev);
+	goto fail;
  release_bdev2_fail:
 	bd_release(nbc->md_bdev);
  release_bdev_fail:
Index: drbd/drbd_actlog.c
===================================================================
--- drbd/drbd_actlog.c	(.../trunk)	(revision 4048)
+++ drbd/drbd_actlog.c	(.../branches/drbd-panic)	(revision 4048)
@@ -49,12 +49,18 @@
 	bio->bi_private = &event;
 	bio->bi_end_io = drbd_md_io_complete;
 
+	if (FAULT_ACTIVE((rw & WRITE)? DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio,bio->bi_size,-EIO);
+	}
+	else {
 #ifdef BIO_RW_SYNC
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 #else
-	submit_bio(rw, bio);
-	drbd_blk_run_queue(bdev_get_queue(bdev->md_bdev));
+		submit_bio(rw, bio);
+		drbd_blk_run_queue(bdev_get_queue(bdev->md_bdev));
 #endif
+	}
 	wait_for_completion(&event);
 
 	ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -108,7 +114,11 @@
 			ok = _drbd_md_sync_page_io(bdev,iop,
 						   sector,READ,hardsect);
 
-			if (unlikely(!ok)) return 0;
+			if (unlikely(!ok)) {
+				ERR("drbd_md_sync_page_io(,%llu,READ [hardsect!=512]) failed!\n",
+				    (unsigned long long)sector);
+				return 0;
+			}
 
 			memcpy(hp + offset*MD_HARDSECT , p, MD_HARDSECT);
 		}
@@ -130,6 +140,7 @@
 	if (unlikely(!ok)) {
 		ERR("drbd_md_sync_page_io(,%llu,%s) failed!\n",
 		    (unsigned long long)sector,rw ? "WRITE" : "READ");
+		return 0;
 	}
 
 	if( hardsect != MD_HARDSECT && rw == READ ) {
@@ -326,8 +337,8 @@
 	sector = mdev->bc->md.md_offset + mdev->bc->md.al_offset + mdev->al_tr_pos;
 
 	if(!drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) {
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 	}
 
 	if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) {
@@ -361,6 +372,8 @@
 	sector = bdev->md.md_offset + bdev->md.al_offset + index;
 
 	if(!drbd_md_sync_page_io(mdev,bdev,sector,READ)) {
+		// Dont process error normally as this is done before
+		// disk is atached!
 		return -1;
 	}
 
@@ -500,22 +513,20 @@
 
 	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 
-	i=inc_local_if_state(mdev,Attaching);
-	D_ASSERT( i ); // Assertions should not have side effects.
-	// I do not want to have D_ASSERT( inc_md_only(mdev,Attaching) );
+	if (inc_local_if_state(mdev,Attaching)) {
+		for(i=0;i<mdev->act_log->nr_elements;i++) {
+			enr = lc_entry(mdev->act_log,i)->lc_number;
+			if(enr == LC_FREE) continue;
+			/* TODO encapsulate and optimize within drbd_bitmap
+			 * currently, if we have al-extents 16..19 active,
+			 * sector 4 will be written four times! */
+			drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT );
+		}
 
-	for(i=0;i<mdev->act_log->nr_elements;i++) {
-		enr = lc_entry(mdev->act_log,i)->lc_number;
-		if(enr == LC_FREE) continue;
-		/* TODO encapsulate and optimize within drbd_bitmap
-		 * currently, if we have al-extents 16..19 active,
-		 * sector 4 will be written four times! */
-		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT );
+		lc_unlock(mdev->act_log);
+		wake_up(&mdev->al_wait);
+		dec_local(mdev);
 	}
-
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-	dec_local(mdev);
 }
 
 /**
Index: drbd/drbd_worker.c
===================================================================
--- drbd/drbd_worker.c	(.../trunk)	(revision 4048)
+++ drbd/drbd_worker.c	(.../branches/drbd-panic)	(revision 4048)
@@ -87,7 +87,7 @@
 	if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait);
 	spin_unlock_irqrestore(&mdev->req_lock,flags);
 
-	drbd_chk_io_error(mdev,error);
+	drbd_chk_io_error(mdev,error,FALSE);
 	drbd_queue_work(&mdev->data.work,&e->w);
 	dec_local(mdev);
 	return 0;
@@ -129,7 +129,7 @@
 		? list_empty(&mdev->sync_ee)
 		: list_empty(&mdev->active_ee);
 
-	if (error) __drbd_chk_io_error(mdev);
+	if (error) __drbd_chk_io_error(mdev,FALSE);
 	spin_unlock_irqrestore(&mdev->req_lock,flags);
 
 	if (do_wake) wake_up(&mdev->ee_wait);
@@ -182,7 +182,7 @@
 
 	if(unlikely(cancel)) return 1;
 
-	ok = drbd_io_error(mdev);
+	ok = drbd_io_error(mdev, FALSE);
 	if(unlikely(!ok)) ERR("Sending in w_io_error() failed\n");
 	return ok;
 }
@@ -206,7 +206,7 @@
 	/* FIXME this is ugly. we should not detach for read io-error,
 	 * but try to WRITE the DataReply to the failed location,
 	 * to give the disk the chance to relocate that block */
-	drbd_io_error(mdev); /* tries to schedule a detach and notifies peer */
+	drbd_io_error(mdev,FALSE); /* tries to schedule a detach and notifies peer */
 	return w_send_read_req(mdev,w,0);
 }
 
@@ -438,7 +438,7 @@
 		/* FIXME we should not detach for read io-errors, in particular
 		 * not now: when the peer asked us for our data, we are likely
 		 * the only remaining disk... */
-		drbd_io_error(mdev);
+		drbd_io_error(mdev,FALSE);
 	}
 
 	dec_unacked(mdev);
@@ -484,8 +484,8 @@
 	} else {
 		ok=drbd_send_ack(mdev,NegRSDReply,e);
 		if (DRBD_ratelimit(5*HZ,5))
-			ERR("Sending NegDReply. I guess it gets messy.\n");
-		drbd_io_error(mdev);
+			ERR("Sending NegRSDReply. I guess it gets messy.\n");
+		drbd_io_error(mdev, FALSE);
 	}
 
 	dec_unacked(mdev);
Index: drbd/drbd_compat_wrappers.h
===================================================================
--- drbd/drbd_compat_wrappers.h	(.../trunk)	(revision 4048)
+++ drbd/drbd_compat_wrappers.h	(.../branches/drbd-panic)	(revision 4048)
@@ -129,7 +129,7 @@
 /*
  * used to submit our private bio
  */
-static inline void drbd_generic_make_request(int rw, struct bio *bio)
+static inline void drbd_generic_make_request(int rw, int fault_type, struct bio *bio)
 {
 	bio->bi_rw = rw; // on the receiver side, e->..rw was not yet defined.
 
@@ -140,7 +140,10 @@
 		return;
 	}
 
-	generic_make_request(bio);
+	if (FAULT_ACTIVE(fault_type))
+		bio_endio(bio,bio->bi_size,-EIO);
+	else
+		generic_make_request(bio);
 }
 
 static inline void drbd_plug_device(drbd_dev *mdev)
Index: drbd/drbd_bitmap.c
===================================================================
--- drbd/drbd_bitmap.c	(.../trunk)	(revision 4048)
+++ drbd/drbd_bitmap.c	(.../branches/drbd-panic)	(revision 4048)
@@ -653,7 +653,13 @@
 	bio_add_page(bio, page, len, 0);
 	bio->bi_private = b;
 	bio->bi_end_io = drbd_bm_async_io_complete;
-	submit_bio(rw, bio);
+
+	if (FAULT_ACTIVE((rw&WRITE)?DRBD_FAULT_MD_WR:DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio,bio->bi_size,-EIO);
+	}
+	else
+		submit_bio(rw, bio);
 }
 /* read one sector of the on disk bitmap into memory.
  * on disk bitmap is little endian.
@@ -684,8 +690,8 @@
 		ERR( "IO ERROR reading bitmap sector %lu "
 		     "(meta-disk sector %llu)\n",
 		     enr, (unsigned long long)on_disk_sector );
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
 			drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i);
 	}
@@ -755,6 +761,8 @@
 
 	now = jiffies;
 	atomic_set(&b->bm_async_io, num_pages);
+	__clear_bit(BM_MD_IO_ERROR,&b->bm_flags);
+
 	for (i = 0; i < num_pages; i++) {
 		/* let the layers below us try to merge these bios... */
 		drbd_bm_page_io_async(mdev,b,i,rw);
@@ -770,8 +778,8 @@
 		 * detach?
 		 */
 		ALERT("we had at least one MD IO ERROR during bitmap IO\n");
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 	}
 
 	now = jiffies;
@@ -836,8 +844,8 @@
 		ERR( "IO ERROR writing bitmap sector %lu "
 		     "(meta-disk sector %lu)\n",
 		     enr, (unsigned long)on_disk_sector );
-		drbd_chk_io_error(mdev, 1);
-		drbd_io_error(mdev);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
 			drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i);
 	}
Index: drbd/linux/drbd_config.h
===================================================================
--- drbd/linux/drbd_config.h	(.../trunk)	(revision 4048)
+++ drbd/linux/drbd_config.h	(.../branches/drbd-panic)	(revision 4048)
@@ -48,4 +48,7 @@
 // for troubles.
 // #define DRBD_DISABLE_SENDPAGE
 
+// Enable fault insertion code
+//#define DRBD_ENABLE_FAULTS
+
 #endif
Index: drbd/drbd_main.c
===================================================================
--- drbd/drbd_main.c	(.../trunk)	(revision 4048)
+++ drbd/drbd_main.c	(.../branches/drbd-panic)	(revision 4048)
@@ -50,6 +50,7 @@
 #include <linux/random.h>
 #include <linux/reboot.h>
 #include <linux/notifier.h>
+#include <linux/byteorder/swabb.h>
 
 #define __KERNEL_SYSCALLS__
 #include <linux/unistd.h>
@@ -106,6 +107,13 @@
 module_param(minor_count,      int,0);
 module_param(disable_bd_claim,bool,0);
 
+#ifdef DRBD_ENABLE_FAULTS
+int enable_faults = 0;
+int fault_rate;
+module_param(enable_faults,int,0664);	// bitmap of enabled faults
+module_param(fault_rate,int,0664);	// fault rate % value - applies to all enabled faults
+#endif
+
 // module parameter, defined
 int major_nr = LANANA_DRBD_MAJOR;
 int minor_count = 32;
@@ -340,17 +348,22 @@
  * unlikely(!drbd_bio_uptodate(e->bio)) case from kernel thread context.
  * See also drbd_chk_io_error
  *
- * NOTE: we set ourselves DISKLESS here.
- * But we try to write the "need full sync bit" here anyways.  This is to make sure
- * that you get a resynchronisation of the full device the next time you
- * connect.
+ * NOTE: we set ourselves FAILED here if on_io_error is Detach or Panic OR
+ *	 if the forcedetach flag is set. This flag is set when failures
+ *	 occur writing the meta data portion of the disk as they are
+ *	 not recoverable. We also try to write the "need full sync bit" here 
+ *	 anyways.  This is to make sure that you get a resynchronisation of 
+ *	 the full device the next time you connect.
  */
-int drbd_io_error(drbd_dev* mdev)
+int drbd_io_error(drbd_dev* mdev, int forcedetach)
 {
 	unsigned long flags;
 	int send,ok=1;
 
-	if(mdev->bc->dc.on_io_error != Panic && mdev->bc->dc.on_io_error != Detach) return 1;
+	if(!forcedetach &&
+	   mdev->bc->dc.on_io_error != Panic && 
+	   mdev->bc->dc.on_io_error != Detach) 
+		return 1;
 
 	spin_lock_irqsave(&mdev->req_lock,flags);
 	if( (send = (mdev->state.disk == Failed)) ) {
@@ -365,9 +378,18 @@
 	if (ok) WARN("Notified peer that my disk is broken.\n");
 	else ERR("Sending state in drbd_io_error() failed\n");
 
+#if 0
+// warning SPG
+// This code seems wrong -- we only get here if we are set to
+// detach in which case we have no local disk, so there's no
+// point asserting that a full sync is needed.
+// Flushing the meta data is probably also wrong -- we want
+// this node to appear out of date so we should deliberately
+// NOT update the meta data with the latest epoch info!
 	D_ASSERT(drbd_md_test_flag(mdev->bc,MDF_FullSync));
 	D_ASSERT(!drbd_md_test_flag(mdev->bc,MDF_Consistent));
 	drbd_md_sync(mdev);
+#endif
 
 	/* Releasing the backing device is done in after_state_ch() */
 
@@ -750,13 +772,6 @@
 		D_ASSERT(i);
 	}
 
-	if ( ns.role == Primary && ns.conn < Connected &&
-	     ns.disk < Consistent ) {
-// warning LGE "ugly and wrong"
-// warning LGE "FIXME code missing"
-		drbd_panic("No access to good data anymore.\n");
-	}
-
 	if( flags & ScheduleAfter ) {
 		struct after_state_chg_work* ascw;
 
@@ -853,23 +868,26 @@
 			kfree(mdev->p_uuid);
 			mdev->p_uuid = NULL;
 		}
-		if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 ) {
-			/* Only do it if we have not yet done it... */
-			INFO("Creating new current UUID\n");
-			drbd_uuid_new_current(mdev);
+		if (inc_local(mdev)) {
+			if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 ) {
+				/* Only do it if we have not yet done it... */
+				INFO("Creating new current UUID\n");
+				drbd_uuid_new_current(mdev);
+			}
+			if (ns.peer == Primary ) { 
+				/* Note: The condition ns.peer == Primary implies
+				   that we are connected. Otherwise it would
+				   be ns.peer == Unknown. */
+				/* Our peer lost its disk.
+				   Not rotation into BitMap-UUID! A FullSync is 
+				   required after a primary detached from it disk! */
+				u64 uuid;
+				INFO("Creating new current UUID [no BitMap]\n");
+				get_random_bytes(&uuid, sizeof(u64));
+				drbd_uuid_set(mdev, Current, uuid);
+			}
+			dec_local(mdev);
 		}
-		if (ns.peer == Primary ) { 
- 			/* Note: The condition ns.peer == Primary implies
-			         that we are connected. Otherwise it would
-				 be ns.peer == Unknown. */
- 			/* Our peer lost its disk.
-			   Not rotation into BitMap-UUID! A FullSync is 
-			   required after a primary detached from it disk! */
-			u64 uuid;
-			INFO("Creating new current UUID [no BitMap]\n");
-			get_random_bytes(&uuid, sizeof(u64));
-			drbd_uuid_set(mdev, Current, uuid);
-		}
 	}
 
 	/* We want to pause resync, tell peer. */
@@ -1013,7 +1031,7 @@
 
 	spin_lock(&thi->t_lock);
 
-	/* INFO("%s [%d]: %s %d -> Running\n",
+	/* INFO("drbd_thread_start: %s [%d]: %s %d -> Running\n",
 	     current->comm, current->pid,
 	     thi == &mdev->receiver ? "receiver" :
              thi == &mdev->asender  ? "asender"  :
@@ -1046,7 +1064,7 @@
 
 	spin_lock(&thi->t_lock);
 
-	/* INFO("%s [%d]: %s %d -> %d; %d\n",
+	/* INFO("drbd_thread_stop: %s [%d]: %s %d -> %d; %d\n",
 	     current->comm, current->pid,
 	     thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */
 
@@ -1073,7 +1091,6 @@
 			force_sig(DRBD_SIGKILL,thi->task);
 		else
 			D_ASSERT(!wait);
-
 	}
 	spin_unlock(&thi->t_lock);
 
@@ -1313,16 +1330,15 @@
 		drbd_bm_set_all(mdev);
 		drbd_bm_write(mdev);
 		if (unlikely(mdev->state.disk <= Failed )) {
-			/* write_bm did fail! panic.
-			 * FIXME can we do something better than panic?
-			 */
-// warning LGE "ugly and wrong"
-			drbd_panic("Failed to write bitmap to disk\n!");
-			ok = FALSE;
-			goto out;
+			/* write_bm did fail! Leave full sync flag set in Meta Data
+			 * but otherwise process as per normal - need to tell other
+			 * side that a full resync is required! */
+			ERR("Failed to write bitmap to disk!\n");
 		}
-		drbd_md_clear_flag(mdev,MDF_FullSync);
-		drbd_md_sync(mdev);
+		else {
+			drbd_md_clear_flag(mdev,MDF_FullSync);
+			drbd_md_sync(mdev);
+		}
 	}
 
 	/*
@@ -1340,7 +1356,6 @@
 		bm_i += num_words;
 	} while (ok && want);
 
-  out:
 	vfree(p);
 	return ok;
 }
@@ -2503,17 +2518,11 @@
 	if (drbd_md_sync_page_io(mdev,mdev->bc,sector,WRITE)) {
 		clear_bit(MD_DIRTY,&mdev->flags);
 	} else {
-		if (mdev->state.disk <= Failed) {
-			/* this was a try anyways ... */
-			ERR("meta data update failed!\n");
-		} else {
-			/* If we cannot write our meta data,
-			 * but we are supposed to be able to,
-			 * tough!
-			 */
-// warning LGE "ugly and wrong"
-			drbd_panic("meta data update failed!\n");
-		}
+		/* this was a try anyways ... */
+		ERR("meta data update failed!\n");
+
+		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_io_error(mdev, TRUE);
 	}
 
 	// Update mdev->bc->md.la_size_sect, since we updated it on metadata.
@@ -2541,6 +2550,8 @@
 	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
 
 	if ( ! drbd_md_sync_page_io(mdev,bdev,bdev->md.md_offset,READ) ) {
+		/* NOTE: cant do normal error processing here as this is
+		   called BEFORE disk is attached */
 		ERR("Error while reading metadata.\n");
 		rv = MDIOError;
 		goto err;
@@ -2705,6 +2716,65 @@
 	return 1;
 }
 
+#ifdef DRBD_ENABLE_FAULTS
+// Fault insertion support including random number generator shamelessly
+// stolen from kernel/rcutorture.c
+struct fault_random_state {
+	unsigned long state;
+	unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD	479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+	long refresh;
+
+	if (--rsp->count < 0) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rsp->state += refresh;
+		rsp->count = FAULT_RANDOM_REFRESH;
+	}
+	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+	return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+    static char *_faults[] = {
+	"Meta-data write",
+	"Meta-data read",
+	"Resync write",
+	"Resync read",
+	"Data write",
+	"Data read",
+    };
+
+    return (type < DRBD_FAULT_MAX)? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(unsigned int type)
+{
+    static struct fault_random_state rrs = {0,0};
+
+    unsigned int rnd = ((_drbd_fault_random(&rrs) % 100) + 1);
+    unsigned int ret = (rnd <= fault_rate);
+
+    if (ret)
+	printk(KERN_ALERT "Simulating %s failure\n", _drbd_fault_str(type));
+
+    return ret;
+}
+#endif
+
 #ifdef DUMP_EACH_PACKET
 #define PSM(A) \
 do { \
Index: drbd/drbd_req.c
===================================================================
--- drbd/drbd_req.c	(.../trunk)	(revision 4048)
+++ drbd/drbd_req.c	(.../branches/drbd-panic)	(revision 4048)
@@ -116,7 +116,7 @@
 		/*
 		 * figure out whether to report success or failure.
 		 *
-		 * report success when at least one of the oprations suceeded.
+		 * report success when at least one of the operations suceeded.
 		 * or, to put the other way,
 		 * only report failure, when both operations failed.
 		 *
@@ -302,7 +302,7 @@
 				      "	[DISCARD L] new: %llu +%d; pending: %llu +%d\n",
 				      current->comm, current->pid,
 				      (unsigned long long)sector, size,
-				      e->sector, e->size);
+				      (unsigned long long)e->sector, e->size);
 				return 1;
 			}
 		}
@@ -399,10 +399,10 @@
 		req->private_bio = NULL;
 		dec_local(mdev);
 		ALERT("Local WRITE failed sec=%llu size=%u\n",
-					req->sector, req->size);
+		      (unsigned long long)req->sector, req->size);
 		/* and now: check how to handle local io error.
 		 * FIXME see comment below in read_completed_with_error */
-		__drbd_chk_io_error(mdev);
+		__drbd_chk_io_error(mdev,FALSE);
 		_req_may_be_done(req);
 		break;
 
@@ -419,7 +419,7 @@
 			break;
 		/* else */
 		ALERT("Local READ failed sec=%llu size=%u\n",
-					req->sector, req->size);
+		      (unsigned long long)req->sector, req->size);
 		/* _req_mod(req,to_be_send); oops, recursion in static inline */
 		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 		req->rq_state |= RQ_NET_PENDING;
@@ -434,7 +434,7 @@
 		 * private bio then, and round the offset and size so
 		 * we get back enough data to be able to clear the bits again.
 		 */
-		__drbd_chk_io_error(mdev);
+		__drbd_chk_io_error(mdev,FALSE);
 		/* fall through: _req_mod(req,queue_for_net_read); */
 
 	case queue_for_net_read:
@@ -945,7 +945,11 @@
 
 	if (local) {
 		req->private_bio->bi_bdev = mdev->bc->backing_bdev;
-		generic_make_request(req->private_bio);
+
+		if (FAULT_ACTIVE(rw==WRITE? DRBD_FAULT_DT_WR : DRBD_FAULT_DT_RD))
+			bio_endio(req->private_bio, req->private_bio->bi_size, -EIO);
+		else
+			generic_make_request(req->private_bio);
 	}
 	return 0;
 
@@ -993,7 +997,6 @@
 		ERR("Sorry, I have no access to good data anymore.\n");
 		/*
 		 * FIXME suspend, loop waiting on cstate wait?
-		 * panic?
 		 */
 		return 1;
 	}
Index: drbd/drbd_int.h
===================================================================
--- drbd/drbd_int.h	(.../trunk)	(revision 4048)
+++ drbd/drbd_int.h	(.../branches/drbd-panic)	(revision 4048)
@@ -43,6 +43,11 @@
 extern int major_nr;
 extern int use_nbd_major;
 
+#ifdef DRBD_ENABLE_FAULTS
+extern int enable_faults;
+extern int fault_rate;
+#endif
+
 #include <linux/major.h>
 #ifdef DRBD_MAJOR
 # warning "FIXME. DRBD_MAJOR is now officially defined in major.h"
@@ -187,6 +192,27 @@
 	 _b; \
 	}))
 
+// Defines to control fault insertion
+enum {
+    DRBD_FAULT_MD_WR = 0,
+    DRBD_FAULT_MD_RD,
+    DRBD_FAULT_RS_WR,
+    DRBD_FAULT_RS_RD,
+    DRBD_FAULT_DT_WR,
+    DRBD_FAULT_DT_RD,
+
+    DRBD_FAULT_MAX,
+};
+
+#ifdef DRBD_ENABLE_FAULTS
+#define FAULT_ACTIVE(_t) \
+    (fault_rate && (enable_faults & (1<<(_t))) && _drbd_insert_fault(_t))
+
+extern unsigned int _drbd_insert_fault(unsigned int type);
+#else
+#define FAULT_ACTIVE(_t) (0)
+#endif
+
 #include <linux/stringify.h>
 // integer division, round _UP_ to the next integer
 #define div_ceil(A,B) ( (A)/(B) + ((A)%(B) ? 1 : 0) )
@@ -961,7 +987,7 @@
 extern int drbd_send_discard(drbd_dev *mdev, drbd_request_t *req);
 extern int drbd_send_sr_reply(drbd_dev *mdev, int retcode);
 extern void drbd_free_bc(struct drbd_backing_dev* bc);
-extern int drbd_io_error(drbd_dev* mdev);
+extern int drbd_io_error(drbd_dev* mdev, int forcedetach);
 extern void drbd_mdev_cleanup(drbd_dev *mdev);
 
 // drbd_meta-data.c (still in drbd_main.c)
@@ -1309,13 +1335,22 @@
  * drbd_chk_io_error: Handles the on_io_error setting, should be called from
  * all io completion handlers. See also drbd_io_error().
  */
-static inline void __drbd_chk_io_error(drbd_dev* mdev)
+static inline void __drbd_chk_io_error(drbd_dev* mdev, int forcedetach)
 {
 	/* FIXME cleanup the messages here */
 	switch(mdev->bc->dc.on_io_error) {
-	case PassOn: /* FIXME should the better be named "Ignore"? */
-		ERR("Ignoring local IO error!\n");
+	case PassOn: /* FIXME would this be better named "Ignore"? */
+	    if (!forcedetach) {
+		ERR("Local IO failed. Passing error on...\n");
 		break;
+	    }
+	    /* NOTE fall through to detach case if forcedetach set */
+	case Detach:
+		if (_drbd_set_state(mdev,_NS(disk,Failed),ChgStateHard) 
+		    == SS_Success) {
+			ERR("Local IO failed. Detaching...\n");
+		}
+		break;
 	case Panic:
 		_drbd_set_state(mdev,_NS(disk,Failed),ChgStateHard);
 		/* FIXME this is very ugly anyways.
@@ -1323,21 +1358,15 @@
 		 * while holding the req_lock hand with irq disabled. */
 		drbd_panic("IO error on backing device!\n");
 		break;
-	case Detach:
-		if (_drbd_set_state(mdev,_NS(disk,Failed),ChgStateHard) 
-		    == SS_Success) {
-			ERR("Local IO failed. Detaching...\n");
-		}
-		break;
 	}
 }
 
-static inline void drbd_chk_io_error(drbd_dev* mdev, int error)
+static inline void drbd_chk_io_error(drbd_dev* mdev, int error, int forcedetach)
 {
 	if (error) {
 		unsigned long flags;
 		spin_lock_irqsave(&mdev->req_lock,flags);
-		__drbd_chk_io_error(mdev);
+		__drbd_chk_io_error(mdev,forcedetach);
 		spin_unlock_irqrestore(&mdev->req_lock,flags);
 	}
 }