[DRBD-cvs] svn commit by phil - r2602 - in trunk: . drbd drbd/linux scripts user - Did item 41 of the ROADMAP. Right resync after a cluste

Sun Nov 19 13:54:52 CET 2006

Author: phil
Date: 2006-11-19 13:54:50 +0100 (Sun, 19 Nov 2006)
New Revision: 2602

Modified:
   trunk/ROADMAP
   trunk/drbd/drbd_int.h
   trunk/drbd/drbd_main.c
   trunk/drbd/drbd_receiver.c
   trunk/drbd/linux/drbd.h
   trunk/drbd/linux/drbd_limits.h
   trunk/drbd/linux/drbd_nl.h
   trunk/scripts/drbd.conf
   trunk/user/drbdadm_scanner.fl
   trunk/user/drbdsetup.c
Log:
Did item 41 of the ROADMAP.
 Right resync after a cluster crash or common power failure.


Modified: trunk/ROADMAP
===================================================================

--- trunk/ROADMAP	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/ROADMAP	2006-11-19 12:54:50 UTC (rev 2602)
@@ -65,7 +65,7 @@
      "consensus"  ... discard the version of the secondary if the outcome
                       of the "after-sb-0pri" algorithm would also destroy 
                       the current secondary's data. Otherwise disconnect.
-     "violently-as0p" Alsways take the decission of the "after-sb-0pri"
+     "violently-as0p" Always take the decission of the "after-sb-0pri"
                       algorithm. Even if that causes case an erratic change
 		      of the primarie's view of the data.
 	              This is only usefull if you use an 1node FS (i.e.
@@ -87,7 +87,7 @@
   In case both nodes are primary you control DRBD's strategy by
   the "after-sb-2pri" option.
      "disconnect" ... Go to StandAlone mode on both sides.
-     "violently-as0p" Alsways take the decission of the "after-sb-0pri"
+     "violently-as0p" Always take the decission of the "after-sb-0pri"
                       algorithm. Even if that causes case an erratic change
 		      of the primarie's view of the data.
 	              This is only usefull if you use an 1node FS (i.e.
@@ -521,7 +521,11 @@
   1.  C=JC   C=JC   No Sync
   2.  C=JC   C!=JC  I am SyncTarget setting BM
   3. C!=JC   C=JC   I am SyncSource setting BM
-  4.   C   =   C    No Sync
+  4.   C   =   C    Common power [off|failure](Examine the roles at crash time)
+  4.1  sec   sec    Common power off, no sync.
+  4.2  pri   sec    Common power failure, I am SyncSource using BM
+  4.3  sec   pri    Common power failure, I am SyncTarget using BM
+  4.4  pri   pri    Common power failure, resync in arbitrary direction.
   5.   C   =   B    I am SyncTarget using BM
   6.   C   = H1|H2  I am SyncTarget setting BM
   7.   B   =   C    I am SyncSource using BM
@@ -867,9 +871,30 @@
 
 40 Do something with FLUSHBUFS ioctl.
 
-41 Fix DRBD's behaviour in case of a cluster crash when both nodes
-   are in primary state.
+41 Fix DRBD's behaviour in case of a common power failure and when
+   both nodes were in primary role.
 
+   See the algorithm of item 16, sections 4 to 4.4.
+
+   Further, we need a resync-role conflict ("rr-conflict") strategy
+   option with the following values:
+
+   The available options are:
+     "disconnect" ... No automatic resynchronisation, simply disconnect.
+     "violently" .... Sync to the primary node is allowed, violating the 
+	              assumption that data on a block device is stable
+		      for one of the nodes. DANGEROUS, DO NOT USE.
+     "call-pri-lost"
+                      Call this helper program on one of the machines.
+                      This program is expected to halt or reboot the
+                      machine.
+
+   An exception, of course, is a primary diskless node that gets a disk
+   attached. Such a node becomes sync target, but since it does not
+   experience a violent data change, this state transition is always allowed.
+
+   99% DONE
+
 42 Forward port the abilitiy to resume the TL after IO was frozen,
    in case the connection is reestablished again.
 

Modified: trunk/drbd/drbd_int.h
===================================================================
--- trunk/drbd/drbd_int.h	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/drbd/drbd_int.h	2006-11-19 12:54:50 UTC (rev 2602)
@@ -684,7 +684,9 @@
 	CLUSTER_ST_CHANGE,      // Cluster wide state change going on...
 	CL_ST_CHG_SUCCESS,
 	CL_ST_CHG_FAIL,
-	CRASHED_PRIMARY,	// This node was a crashed primary
+	CRASHED_PRIMARY,	// This node was a crashed primary. Gets
+	                        // cleared when the state.conn  goes into 
+	                        // Connected state.
 	WRITE_BM_AFTER_RESYNC	// A kmalloc() during resync failed
 };
 

Modified: trunk/drbd/drbd_main.c
===================================================================
--- trunk/drbd/drbd_main.c	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/drbd/drbd_main.c	2006-11-19 12:54:50 UTC (rev 2602)
@@ -834,8 +834,7 @@
 	enum fencing_policy fp;
 	u32 mdf;
 
-	if ( (os.role != Primary && ns.role == Primary)    ||
-	     (os.conn != Connected && ns.conn == Connected) ) {
+	if ( (os.conn != Connected && ns.conn == Connected) ) {
 		clear_bit(CRASHED_PRIMARY, &mdev->flags);
 	}
 
@@ -1278,6 +1277,7 @@
 {
 	Drbd_GenCnt_Packet p;
 	int i;
+	u64 uuid_flags = 0;
 
 	if(!inc_local_if_state(mdev,Negotiating)) return 1; // ok.
 
@@ -1289,7 +1289,9 @@
 	}
 
 	p.uuid[UUID_SIZE] = cpu_to_be64(drbd_bm_total_weight(mdev));
-	p.uuid[UUID_FLAGS] = cpu_to_be64(mdev->net_conf->want_lose);
+	uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
+	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
+	p.uuid[UUID_FLAGS] = cpu_to_be64(uuid_flags);
 
 	dec_local(mdev);
 

Modified: trunk/drbd/drbd_receiver.c
===================================================================
--- trunk/drbd/drbd_receiver.c	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/drbd/drbd_receiver.c	2006-11-19 12:54:50 UTC (rev 2602)
@@ -1737,7 +1737,7 @@
 		if( hg == -1 && mdev->state.role==Secondary) rv=hg;
 		if( hg == 1  && mdev->state.role==Primary)   rv=hg;
 		break;
-	case ViolentlyAS0Pri:
+	case Violently:
 		rv = drbd_asb_recover_0p(mdev);
 		break;
 	case DiscardSecondary:
@@ -1775,7 +1775,7 @@
 	case DiscardSecondary:
 		ERR("Configuration error.\n");
 		break;
-	case ViolentlyAS0Pri:
+	case Violently:
 		rv = drbd_asb_recover_0p(mdev);
 		break;
 	case Disconnect:
@@ -1833,8 +1833,23 @@
 	if (self != UUID_JUST_CREATED &&
 	    peer == UUID_JUST_CREATED) return 2;
 
-	if (self == peer) return 0;
+	if (self == peer) { // Common power [off|failure]
+		int rct; // roles at crash time
 
+		rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
+			( mdev->p_uuid[UUID_FLAGS] & 2 );
+		// lowest bit is set when we were primary
+		// next bit (weight 2) is set when peer was primary
+
+		switch(rct) {
+		case 0: /* !self_pri && !peer_pri */ return 0;
+		case 1: /*  self_pri && !peer_pri */ return 1;
+		case 2: /* !self_pri &&  peer_pri */ return -1;
+		case 3: /*  self_pri &&  peer_pri */ 
+			return test_bit(DISCARD_CONCURRENT,&mdev->flags) ? -1 : 1;
+		}
+	}
+
 	peer = mdev->p_uuid[Bitmap] & ~((u64)1);
 	if (self == peer) return -1;
 
@@ -1925,9 +1940,9 @@
 		}
 	}
 	
-	if (hg == 0) {
+	if (abs(hg) < 100) {
 		// This is needed in case someone does an invalidate on an
-		// disconnected node.
+		// disconnected node. This has priority.
 		if(mydisk==Inconsistent && peer_disk>Inconsistent) hg=-1;
 		if(mydisk>Inconsistent && peer_disk==Inconsistent) hg= 1;
 	}
@@ -1952,8 +1967,21 @@
 		return conn_mask;
 	}
 
-	/* The check "I shall become SyncTarget, but I am primary"
-	   got removed since, we now do live attachs on primary nodes! */
+	if (hg < 0 && // by intention we do not use mydisk here.
+	    mdev->state.role == Primary && mdev->state.disk >= Consistent ) {
+		switch(mdev->net_conf->rr_conflict) {
+		case CallHelper:
+			drbd_khelper(mdev,"pri-lost");
+			// fall through
+		case Disconnect:
+			ERR("I shall become SyncTarget, but I am primary!\n");
+			drbd_force_state(mdev,NS(conn,Disconnecting));
+			return conn_mask;
+		case Violently:
+			WARN("Becoming SyncTarget, violating the stable-data"
+			     "assumption\n");
+		}
+	}
 
 	if (abs(hg) >= 2) {
 		drbd_md_set_flag(mdev,MDF_FullSync);
@@ -1976,17 +2004,9 @@
 		rv = WFBitMapT;
 	} else {
 		rv = Connected;
-		drbd_bm_lock(mdev);   // {
 		if(drbd_bm_total_weight(mdev)) {
-			/* FIXME for two-primaries this is wrong and may lead
-			 * to diverging data sets! */
-			INFO("No resync -> clearing bit map.\n");
-			drbd_bm_clear_all(mdev);
-			drbd_uuid_set_bm(mdev,0UL);
-			if (unlikely(drbd_bm_write(mdev) < 0))
-				return conn_mask;
+			INFO("No resync, but bits in bitmap!\n");
 		}
-		drbd_bm_unlock(mdev); // }
 	}
 
 	return rv;

Modified: trunk/drbd/linux/drbd.h
===================================================================
--- trunk/drbd/linux/drbd.h	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/drbd/linux/drbd.h	2006-11-19 12:54:50 UTC (rev 2602)
@@ -62,7 +62,7 @@
 	Consensus,
 	DiscardSecondary,
 	CallHelper,
-	ViolentlyAS0Pri
+	Violently
 };
 
 /* KEEP the order, do not delete or insert!

Modified: trunk/drbd/linux/drbd_limits.h
===================================================================
--- trunk/drbd/linux/drbd_limits.h	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/drbd/linux/drbd_limits.h	2006-11-19 12:54:50 UTC (rev 2602)
@@ -113,6 +113,7 @@
 #define DRBD_AFTER_SB_0P_DEF Disconnect
 #define DRBD_AFTER_SB_1P_DEF Disconnect
 #define DRBD_AFTER_SB_2P_DEF Disconnect
+#define DRBD_RR_CONFLICT_DEF Disconnect
 
 #undef RANGE
 #endif

Modified: trunk/drbd/linux/drbd_nl.h
===================================================================
--- trunk/drbd/linux/drbd_nl.h	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/drbd/linux/drbd_nl.h	2006-11-19 12:54:50 UTC (rev 2602)
@@ -46,6 +46,7 @@
 	INTEGER(	24,	T_MAY_IGNORE,	after_sb_0p)
 	INTEGER(	25,	T_MAY_IGNORE,	after_sb_1p)
 	INTEGER(	26,	T_MAY_IGNORE,	after_sb_2p)
+	INTEGER(	39,	T_MAY_IGNORE,	rr_conflict)
 	BIT(		27,	T_MAY_IGNORE,	want_lose)
 	BIT(		28,	T_MAY_IGNORE,	two_primaries)
 )

Modified: trunk/scripts/drbd.conf
===================================================================
--- trunk/scripts/drbd.conf	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/scripts/drbd.conf	2006-11-19 12:54:50 UTC (rev 2602)
@@ -299,6 +299,14 @@
     #    "consensus"  ... discard the version of the secondary if the outcome
     #                     of the "after-sb-0pri" algorithm would also destroy 
     #                     the current secondary's data. Otherwise disconnect.
+    #    "violently-as0p" Always take the decision of the "after-sb-0pri"
+    #                     algorithm, even if that causes an erratic change
+    #		          of the primary's view of the data.
+    #                     This is only useful if you use a single-node FS (i.e.
+    #		          not OCFS2 or GFS) with the allow-two-primaries
+    #		          flag, _AND_ you really know what you are doing.
+    #		          This is DANGEROUS and MAY CRASH YOUR MACHINE if you
+    #		          have a FS mounted on the primary node.
     #    "discard-secondary"     
     #                     discard the version of the secondary.
     #    "panic-primary"  Always honour the outcome of the "after-sb-0pri"
@@ -311,10 +319,22 @@
     # In case both nodes are primary you control DRBD's strategy by
     # the "after-sb-2pri" option.
     #    "disconnect" ... Go to StandAlone mode on both sides.
+    #    "violently-as0p" Always take the decision of the "after-sb-0pri" algorithm.
     #    "panic"      ... Honor the outcome of the "after-sb-0pri" algorithm
     #                     and panic the other node.
     after-sb-2pri disconnect;
 
+    # To resolve the cases where the outcome of the resync decision is
+    # incompatible with the current role assignment in the cluster.
+    #    "disconnect" ... No automatic resynchronisation, simply disconnect.
+    #    "violently" .... Sync to the primary node is allowed, violating the 
+    #	                  assumption that data on a block device is stable
+    #		          for one of the nodes. DANGEROUS, DO NOT USE.
+    #    "call-pri-lost"  Call the "pri-lost" helper program on one of the 
+    #	                  machines. This program is expected to reboot the
+    #                     machine. (I.e. make it secondary.)
+    rr-conflict disconnect;
+
     # DRBD-0.7's behaviour is equivalent to 
     #   after-sb-0pri discard-younger-primary;
     #   after-sb-1pri consensus;

Modified: trunk/user/drbdadm_scanner.fl
===================================================================
--- trunk/user/drbdadm_scanner.fl	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/user/drbdadm_scanner.fl	2006-11-19 12:54:50 UTC (rev 2602)
@@ -78,7 +78,8 @@
 cram-hmac-alg		{ DP; CP; return TK_NET_OPTION;		}
 shared-secret		{ DP; CP; return TK_NET_OPTION;		}
 max-epoch-size		{ DP; CP; RC(MAX_EPOCH_SIZE); return TK_NET_OPTION;}
-after-sb-[012]pri	{ DP; CP; return TK_NET_OPTION;		}
+after-sb-[012]pri	{ DP; CP; return TK_NET_OPTION;		} 
+rr-conflict 		{ DP; CP; return TK_NET_OPTION;		} 
 unplug-watermark	{ DP; CP; return TK_NET_OPTION;         }
 allow-two-primaries	{ DP; CP; return TK_NET_SWITCH;		}
 rate			{ DP; CP; RC(RATE); return TK_SYNCER_OPTION;	}
@@ -88,6 +89,7 @@
 degr-wfc-timeout	{ DP; CP; RC(DEGR_WFC_TIMEOUT); return TK_STARTUP_OPTION;}
 pri-on-incon-degr	{ DP; CP; return TK_HANDLER_OPTION;	}
 pri-lost-after-sb	{ DP; CP; return TK_HANDLER_OPTION;	}
+pri-lost	        { DP; CP; return TK_HANDLER_OPTION;     }
 outdate-peer		{ DP; CP; return TK_HANDLER_OPTION;	}
 local-io-error		{ DP; CP; return TK_HANDLER_OPTION;     }
 {IPV4ADDR}		{ DP; CP; return TK_IPADDR;		}

Modified: trunk/user/drbdsetup.c
===================================================================
--- trunk/user/drbdsetup.c	2006-11-18 10:46:55 UTC (rev 2601)
+++ trunk/user/drbdsetup.c	2006-11-19 12:54:50 UTC (rev 2602)
@@ -206,17 +206,23 @@
 const char *asb1p_n[] = {
 	[Disconnect]        = "disconnect",
 	[Consensus]         = "consensus",
-	[ViolentlyAS0Pri]   = "violently-as0p",
+	[Violently]         = "violently-as0p",
 	[DiscardSecondary]  = "discard-secondary",
 	[CallHelper]        = "call-pri-lost-after-sb"
 };
 
 const char *asb2p_n[] = {
 	[Disconnect]        = "disconnect",
-	[ViolentlyAS0Pri]   = "violently-as0p",
+	[Violently]         = "violently-as0p",
 	[CallHelper]        = "call-pri-lost-after-sb"
 };
 
+const char *rrcf_n[] = {
+	[Disconnect]        = "disconnect",
+	[Violently]         = "violently",
+	[CallHelper]        = "call-pri-lost"
+};
+
 struct option wait_cmds_options[] = {
 	{ "wfc-timeout",required_argument, 0, 't' },
 	{ "degr-wfc-timeout",required_argument,0,'d'},
@@ -283,6 +289,7 @@
 		 { "after-sb-0pri",'A',	T_after_sb_0p,EH(asb0p_n,AFTER_SB_0P) },
 		 { "after-sb-1pri",'B',	T_after_sb_1p,EH(asb1p_n,AFTER_SB_1P) },
 		 { "after-sb-2pri",'C',	T_after_sb_2p,EH(asb2p_n,AFTER_SB_2P) },
+		 { "rr-conflict",'R',	T_rr_conflict,EH(rrcf_n,RR_CONFLICT) },
 		 { "discard-my-data",'D', T_want_lose,     EB },
 		 CLOSE_OPTIONS }} }, },