[DRBD-cvs] svn commit by phil - r2205 - in trunk: . drbd drbd/linux - Implemented "cluster wide state changes". For now, that method is used only for becoming primary.

drbd-cvs at lists.linbit.com drbd-cvs at lists.linbit.com
Mon May 22 17:10:55 CEST 2006


Author: phil
Date: 2006-05-22 17:10:53 +0200 (Mon, 22 May 2006)
New Revision: 2205

Modified:
   trunk/ROADMAP
   trunk/drbd/drbd_fs.c
   trunk/drbd/drbd_int.h
   trunk/drbd/drbd_main.c
   trunk/drbd/drbd_receiver.c
   trunk/drbd/drbd_strings.c
   trunk/drbd/linux/drbd.h
Log:
Implemented "cluster wide state changes". For now, that method
is used only for becoming primary. (It should also be used for
the "invalidate" command.)
It seems to work so far.


Modified: trunk/ROADMAP
===================================================================
--- trunk/ROADMAP	2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/ROADMAP	2006-05-22 15:10:53 UTC (rev 2205)
@@ -775,11 +775,11 @@
 	* A wait queue.
 
    TODOS: 
-    Make sure it is used for getting PRIMARY.
     Evaluate if it is possible to use it for starting resync. (invalidate)
     Evaluate it for the other cases...
 
-  50 % Is implemented, needs testing etc...
+  60 % Is implemented. Changing the role to primary already uses this 
+       mechanism. Seems to work.
 
 34 Improve the initial hand-shake, to identify the sockets (and TCP-
    links) by an initial message, and not only by the connection timming.

Modified: trunk/drbd/drbd_fs.c
===================================================================
--- trunk/drbd/drbd_fs.c	2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_fs.c	2006-05-22 15:10:53 UTC (rev 2205)
@@ -851,8 +851,8 @@
 int drbd_set_role(drbd_dev *mdev, int* arg)
 {
 	drbd_role_t newstate = *arg;
-	int rv,r,forced = 0;
-	drbd_state_t os,ns,rs;
+	int rv,r,forced = 0, try=0;
+	drbd_state_t mask, val;
 	drbd_disks_t nps;
 
 	D_ASSERT(semaphore_is_locked(&mdev->device_mutex));
@@ -875,42 +875,42 @@
 			bd_release(mdev->this_bdev);
 	}
 
-	nps = disk_mask;
- retry:
-	spin_lock_irq(&mdev->req_lock);
-	os = mdev->state;
-	rs.i = os.i;
-	rs.role = newstate & role_mask;
-	if(nps != disk_mask) rs.pdsk = nps;
-	r = _drbd_set_state(mdev, rs, 0);
+	mask.i = 0; mask.role = role_mask;
+	val.i  = 0; val.role  = newstate & role_mask;
 
-	if ( r == SS_NoConsistnetDisk ) {
-		if ( newstate & DontBlameDrbd && mdev->state.disk<UpToDate) {
-			rs.disk = UpToDate;
+	while (try++ < 3) {
+		r = _drbd_request_state(mdev,mask,val,0);
+		if( r == SS_NoConsistnetDisk && (newstate & DontBlameDrbd) && 
+		    mdev->state.disk < UpToDate) {
+			mask.disk = disk_mask;
+			val.disk  = UpToDate;
 			forced = 1;
-			r = _drbd_set_state(mdev, rs, 0);
+			continue;
 		}
-	}
+		if ( r == SS_NothingToDo ) { rv = 0; goto fail; }
+		if ( r == SS_PrimaryNOP ) {
+			nps = drbd_try_outdate_peer(mdev);
 
-	ns = mdev->state;
-	spin_unlock_irq(&mdev->req_lock);
+			if ( newstate & DontBlameDrbd && nps > Outdated ) {
+				WARN("Forced into split brain situation!\n");
+				nps = Outdated;
+			}
 
-	if ( r == SS_NothingToDo ) { rv = 0; goto fail; }
-	if ( r == SS_PrimaryNOP && nps == disk_mask ) {
-		nps = drbd_try_outdate_peer(mdev);
-		if ( newstate & DontBlameDrbd && nps > Outdated ) {
-			WARN("Forced into split brain situation!\n");
-			nps = Outdated;
+			mask.pdsk = disk_mask;
+			val.pdsk  = nps;
+
+			continue;
 		}
-		goto retry;
+
+		if ( r < SS_Success ) {
+			r = drbd_request_state(mdev,mask,val); // Be verbose.
+			if( r < SS_Success ) {
+				rv = -EIO;
+				goto fail;
+			}
+		}
+		break;
 	}
-	if ( r < SS_Success ) {
-		print_st_err(mdev,os,rs,r);
-		*arg = r;
-		rv = -EIO;
-		goto fail;
-	}
-	after_state_ch(mdev,os,ns);
 
 	if(forced) WARN("Forced to conisder local data as UpToDate!\n");
 

Modified: trunk/drbd/drbd_int.h
===================================================================
--- trunk/drbd/drbd_int.h	2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_int.h	2006-05-22 15:10:53 UTC (rev 2205)
@@ -315,6 +315,7 @@
 	AuthResponse,
 	OutdateRequest,
 	OutdatedReply,
+	StateChgRequest,
 
 	Ping,         // These are sent on the meta socket...
 	PingAck,
@@ -325,6 +326,7 @@
 	NegRSDReply,  // Local disk is broken...
 	BarrierAck,
 	DiscardNote,
+	StateChgReply,
 
 	MAX_CMD,
 	MayIgnore = 0x100, // Flag only to test if (cmd > MayIgnore) ...
@@ -370,6 +372,8 @@
 		[NegRSDReply]      = "NegRSDReply",
 		[BarrierAck]       = "BarrierAck",
 		[DiscardNote]      = "DiscardNote",
+		[StateChgRequest]  = "StateChgRequest",
+		[StateChgReply]    = "StateChgReply"
 	};
 
 	if (cmd == HandShake) return "HandShake";
@@ -515,6 +519,17 @@
 } __attribute((packed)) Drbd_State_Packet;
 
 typedef struct {
+	Drbd_Header head;
+	u32         mask;
+	u32         val;
+} __attribute((packed)) Drbd_Req_State_Packet;
+
+typedef struct {
+	Drbd_Header head;
+	u32         retcode;
+} __attribute((packed)) Drbd_RqS_Reply_Packet;
+
+typedef struct {
 	u64       size;
 	u32       state;
 	u32       blksize;
@@ -649,7 +664,10 @@
 	MD_DIRTY,		// current gen counts and flags not yet on disk
 	SYNC_STARTED,		// Needed to agree on the exact point in time..
 	UNIQUE,                 // Set on one node, cleared on the peer!
-	USE_DEGR_WFC_T		// Use degr-wfc-timeout instead of wfc-timeout.
+	USE_DEGR_WFC_T,		// Use degr-wfc-timeout instead of wfc-timeout.
+	CLUSTER_ST_CHANGE,      // Cluster wide state change going on...
+	CL_ST_CHG_SUCCESS,
+	CL_ST_CHG_FAIL
 };
 
 struct drbd_bitmap; // opaque for Drbd_Conf
@@ -831,6 +849,11 @@
 	ScheduleAfter   = 4,
 };
 
+extern int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f,
+			     drbd_state_t mask, drbd_state_t val);
+extern void drbd_force_state(drbd_dev*, drbd_state_t, drbd_state_t);
+extern int _drbd_request_state(drbd_dev*, drbd_state_t, drbd_state_t, 
+			       enum chg_state_flags);
 extern int _drbd_set_state(drbd_dev*, drbd_state_t, enum chg_state_flags );
 extern void print_st_err(drbd_dev*, drbd_state_t, drbd_state_t, int );
 extern void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns);
@@ -874,6 +897,7 @@
 extern int drbd_send_bitmap(drbd_dev *mdev);
 extern int _drbd_send_bitmap(drbd_dev *mdev);
 extern int drbd_send_discard(drbd_dev *mdev, drbd_request_t *req);
+extern int drbd_send_sr_reply(drbd_dev *mdev, int retcode);
 extern void drbd_free_bc(struct drbd_backing_dev* bc);
 extern int drbd_io_error(drbd_dev* mdev);
 extern void drbd_mdev_cleanup(drbd_dev *mdev);
@@ -1187,34 +1211,22 @@
                 ({drbd_state_t ns; ns.i = mdev->state.i; ns.T1 = (S1); \
                 ns.T2 = (S2); ns.T3 = (S3); ns;})
 
-static inline int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f,
-				    drbd_state_t mask, drbd_state_t val)
+static inline void drbd_state_lock(drbd_dev *mdev)
 {
-	unsigned long flags;
-	drbd_state_t os,ns;
-	int rv;
-
-	spin_lock_irqsave(&mdev->req_lock,flags);
-	os = mdev->state;
-	ns.i = (os.i & ~mask.i) | val.i;
-	rv = _drbd_set_state(mdev, ns, f);
-	ns = mdev->state;
-	spin_unlock_irqrestore(&mdev->req_lock,flags);
-	after_state_ch(mdev,os,ns);
-
-	return rv;
+	wait_event(mdev->cstate_wait,
+		   !test_and_set_bit(CLUSTER_ST_CHANGE,&mdev->flags));
 }
 
-static inline void drbd_force_state(drbd_dev* mdev,
-				    drbd_state_t mask, drbd_state_t val)
+static inline void drbd_state_unlock(drbd_dev *mdev)
 {
-	drbd_change_state(mdev,ChgStateHard,mask,val);
+	clear_bit(CLUSTER_ST_CHANGE,&mdev->flags);
+	wake_up(&mdev->cstate_wait);
 }
 
-static inline int drbd_request_state(drbd_dev* mdev,
-				    drbd_state_t mask, drbd_state_t val)
+static inline int drbd_request_state(drbd_dev* mdev, drbd_state_t mask,
+				     drbd_state_t val)
 {
-	return drbd_change_state(mdev,ChgStateVerbose,mask,val);
+	return _drbd_request_state(mdev, mask, val, ChgStateVerbose);
 }
 
 static inline void drbd_req_free(drbd_request_t *req)

Modified: trunk/drbd/drbd_main.c
===================================================================
--- trunk/drbd/drbd_main.c	2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_main.c	2006-05-22 15:10:53 UTC (rev 2205)
@@ -527,6 +527,104 @@
 	return ok;
 }
 
+/** 
+ * cl_wide_st_chg:
+ * Returns TRUE if this state change should be performed as a cluster wide
+ * transaction. 
+ */ 
+STATIC int cl_wide_st_chg(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns)
+{
+	return ( ns.conn >= Connected &&
+		 ( ( os.role != Primary && ns.role == Primary ) ||
+		   // ( os.conn != SyncSource && ns.role == SyncSource ) ||
+		   // ( os.conn != SyncTarget && ns.role == SyncTarget ) ||
+		   // ( os.disk != Diskless && ns.role == Diskless ) ||
+		   // ( os.conn != TearDown && ns.conn == TearDown ) ||
+		   0
+		   ) );
+}
+
+int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f,
+		      drbd_state_t mask, drbd_state_t val)
+{
+	unsigned long flags;
+	drbd_state_t os,ns;
+	int rv;
+
+	spin_lock_irqsave(&mdev->req_lock,flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	rv = _drbd_set_state(mdev, ns, f);
+	ns = mdev->state;
+	spin_unlock_irqrestore(&mdev->req_lock,flags);
+	after_state_ch(mdev,os,ns);
+
+	return rv;
+}
+
+void drbd_force_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val)
+{
+	drbd_change_state(mdev,ChgStateHard,mask,val);
+}
+
+static inline enum { REQS_SUCCESS=1, REQS_FAIL=2, REQS_NO_NEED=3 } 
+_req_st_cond(drbd_dev* mdev,drbd_state_t mask, drbd_state_t val)
+{
+	drbd_state_t os,ns;
+
+	if(test_and_clear_bit(CL_ST_CHG_SUCCESS,&mdev->flags)) return REQS_SUCCESS;
+	if(test_and_clear_bit(CL_ST_CHG_FAIL,&mdev->flags)) return REQS_FAIL;
+
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+
+	if(!cl_wide_st_chg(mdev,os,ns)) return REQS_NO_NEED;
+
+	return 0;
+}
+
+STATIC int drbd_send_state_req(drbd_dev *, drbd_state_t, drbd_state_t);
+
+int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val,
+		       enum chg_state_flags f)
+{
+	unsigned long flags;
+	drbd_state_t os,ns;
+	int rv;
+
+	spin_lock_irqsave(&mdev->req_lock,flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+
+	if(cl_wide_st_chg(mdev,os,ns)) {
+		// TODO do the pre checks here as well ;
+		spin_unlock_irqrestore(&mdev->req_lock,flags);
+
+		drbd_state_lock(mdev);
+		drbd_send_state_req(mdev,mask,val);
+
+		wait_event(mdev->cstate_wait,(rv=_req_st_cond(mdev,mask,val)));
+
+		if(rv == REQS_FAIL) {
+			drbd_state_unlock(mdev);
+			return SS_FailedByPeer; // Nearly dead code ;)
+		}
+		spin_lock_irqsave(&mdev->req_lock,flags);
+		os = mdev->state;
+		ns.i = (os.i & ~mask.i) | val.i;
+		drbd_state_unlock(mdev);
+	}
+
+	rv = _drbd_set_state(mdev, ns, f);
+	ns = mdev->state;
+	spin_unlock_irqrestore(&mdev->req_lock,flags);
+
+	if (rv == SS_Success) after_state_ch(mdev,os,ns);
+
+	return rv;
+}
+
+
 static void print_st(drbd_dev* mdev, char *name, drbd_state_t ns)
 {
 	ERR(" %s = { cs:%s st:%s/%s ds:%s/%s %c%c%c%c }\n",
@@ -615,7 +713,6 @@
 	return rv;
 }
 
-
 int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags)
 {
 	drbd_state_t os;
@@ -1225,6 +1322,28 @@
 			     (Drbd_Header*)&p,sizeof(p));
 }
 
+STATIC int drbd_send_state_req(drbd_dev *mdev, drbd_state_t mask, drbd_state_t val)
+{
+	Drbd_Req_State_Packet p;
+
+	p.mask    = cpu_to_be32(mask.i);
+	p.val     = cpu_to_be32(val.i);
+
+	return drbd_send_cmd(mdev,mdev->data.socket,StateChgRequest,
+			     (Drbd_Header*)&p,sizeof(p));
+}
+
+int drbd_send_sr_reply(drbd_dev *mdev, int retcode)
+{
+	Drbd_RqS_Reply_Packet p;
+
+	p.retcode    = cpu_to_be32(retcode);
+
+	return drbd_send_cmd(mdev,mdev->meta.socket,StateChgReply,
+			     (Drbd_Header*)&p,sizeof(p));
+}
+
+
 /* See the comment at receive_bitmap() */
 int _drbd_send_bitmap(drbd_dev *mdev)
 {

Modified: trunk/drbd/drbd_receiver.c
===================================================================
--- trunk/drbd/drbd_receiver.c	2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_receiver.c	2006-05-22 15:10:53 UTC (rev 2205)
@@ -1831,8 +1831,9 @@
 /* warn if the arguments differ by more than 12.5% */
 static void warn_if_differ_considerably(drbd_dev *mdev, const char *s, sector_t a, sector_t b)
 {
+	sector_t d;
 	if (a == 0 || b == 0) return;
-	sector_t d = (a > b) ? (a - b) : (b - a);
+	d = (a > b) ? (a - b) : (b - a);
 	if ( d > (a>>3) || d > (b>>3)) {
 		WARN("Considerable difference in %s: %llu vs. %llu\n", s,
 		     (unsigned long long)a, (unsigned long long)b);
@@ -1961,7 +1962,67 @@
 	return TRUE;
 }
 
+/** 
+ * convert_state:
+ * Switches the view of the state.
+ */ 
+STATIC drbd_state_t convert_state(drbd_state_t ps)
+{
+	drbd_state_t ms;
 
+	static drbd_conns_t c_tab[] = {
+		[Connected] = Connected,
+		[SkippedSyncS] = SkippedSyncT,
+		[SkippedSyncT] = SkippedSyncS,
+		[WFBitMapS] = WFBitMapT,
+		[WFBitMapT] = WFBitMapS,
+		[WFSyncUUID] = SyncSource,
+		[SyncSource] = SyncTarget,
+		[SyncTarget] = WFSyncUUID,
+		[PausedSyncS] = PausedSyncT,
+		[PausedSyncT] = PausedSyncS,
+		[conn_mask]   = conn_mask,
+	};
+
+	ms.i = ps.i;
+
+	ms.conn = c_tab[ps.conn];
+	ms.peer = ps.role;
+	ms.role = ps.peer;
+	ms.pdsk = ps.disk;
+	ms.disk = ps.pdsk;
+	ms.peer_isp = ( ps.aftr_isp | ps.user_isp );
+
+	return ms;
+}
+
+STATIC int receive_req_state(drbd_dev *mdev, Drbd_Header *h)
+{
+	Drbd_Req_State_Packet *p = (Drbd_Req_State_Packet*)h;
+	drbd_state_t mask,val;
+	int rv;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	mask.i = be32_to_cpu(p->mask);
+	val.i = be32_to_cpu(p->val);
+
+	if (test_bit(UNIQUE,&mdev->flags)) drbd_state_lock(mdev);
+
+	mask = convert_state(mask);
+	val = convert_state(val);
+
+	rv = drbd_change_state(mdev,ChgStateVerbose,mask,val);
+
+	if (test_bit(UNIQUE,&mdev->flags)) drbd_state_unlock(mdev);
+
+	drbd_send_sr_reply(mdev,rv);
+
+	return TRUE;
+}
+
 STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h)
 {
 	Drbd_State_Packet *p = (Drbd_State_Packet*)h;
@@ -2269,6 +2330,7 @@
 	[ReportUUIDs]      = receive_uuids,
 	[ReportSizes]      = receive_sizes,
 	[ReportState]      = receive_state,
+	[StateChgRequest]  = receive_req_state,
 	[ReportSyncUUID]   = receive_sync_uuid,
 	[PauseResync]      = receive_pause_resync,
 	[ResumeResync]     = receive_resume_resync,
@@ -2751,6 +2813,24 @@
 
 /* ********* acknowledge sender ******** */
 
+STATIC int got_RqSReply(drbd_dev *mdev, Drbd_Header* h)
+{
+	Drbd_RqS_Reply_Packet *p = (Drbd_RqS_Reply_Packet*)h;
+
+	int retcode = be32_to_cpu(p->retcode);
+
+	if(retcode >= SS_Success) {
+		set_bit(CL_ST_CHG_SUCCESS,&mdev->flags);
+	} else {
+		set_bit(CL_ST_CHG_FAIL,&mdev->flags);
+		ERR("Requested state change failed by peer: %s\n",
+		    set_st_err_name(retcode));
+	}
+	wake_up(&mdev->cstate_wait);
+
+	return TRUE;
+}
+
 STATIC int got_Ping(drbd_dev *mdev, Drbd_Header* h)
 {
 	return drbd_send_ping_ack(mdev);
@@ -2940,6 +3020,7 @@
 		[NegRSDReply]={sizeof(Drbd_BlockAck_Packet),  got_NegRSDReply},
 		[BarrierAck]={ sizeof(Drbd_BarrierAck_Packet),got_BarrierAck },
 		[DiscardNote]={sizeof(Drbd_Discard_Packet),   got_Discard },
+		[StateChgReply]={sizeof(Drbd_RqS_Reply_Packet),got_RqSReply },
 	};
 
 	sprintf(current->comm, "drbd%d_asender", (int)(mdev-drbd_conf));

Modified: trunk/drbd/drbd_strings.c
===================================================================
--- trunk/drbd/drbd_strings.c	2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_strings.c	2006-05-22 15:10:53 UTC (rev 2205)
@@ -68,6 +68,7 @@
 	[-SS_SyncingDiskless] = "Refusing to be syncing and diskless",
 	[-SS_ConnectedOutdates] = "Refusing to be Outdated while Connected",
 	[-SS_PrimaryNOP] = "Refusing to be Primary while peer is not outdated",
+	[-SS_FailedByPeer] = "State changed was refused by peer node"
 };
 
 const char* conns_to_name(drbd_conns_t s) {

Modified: trunk/drbd/linux/drbd.h
===================================================================
--- trunk/drbd/linux/drbd.h	2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/linux/drbd.h	2006-05-22 15:10:53 UTC (rev 2205)
@@ -217,7 +217,7 @@
 	NetworkFailure,
 	WFConnection,
 	WFReportParams, // we have a socket
-	TearDown, 
+	TearDown,
 	Connected,      // we have introduced each other
 	SkippedSyncS,   // we should have synced, but user said no
 	SkippedSyncT,
@@ -268,7 +268,8 @@
 	SS_BothInconsistent=-4,
 	SS_SyncingDiskless=-5,
 	SS_ConnectedOutdates=-6,
-	SS_PrimaryNOP=-7
+	SS_PrimaryNOP=-7,
+	SS_FailedByPeer=-8
 } set_st_err_t;
 
 /* from drbd_strings.c */



More information about the drbd-cvs mailing list