[DRBD-cvs] svn commit by phil - r2205 - in trunk: . drbd drbd/linux
- Implemented "cluster wide state changes". For now that
drbd-cvs at lists.linbit.com
drbd-cvs at lists.linbit.com
Mon May 22 17:10:55 CEST 2006
Author: phil
Date: 2006-05-22 17:10:53 +0200 (Mon, 22 May 2006)
New Revision: 2205
Modified:
trunk/ROADMAP
trunk/drbd/drbd_fs.c
trunk/drbd/drbd_int.h
trunk/drbd/drbd_main.c
trunk/drbd/drbd_receiver.c
trunk/drbd/drbd_strings.c
trunk/drbd/linux/drbd.h
Log:
Implemented "cluster wide state changes". For now that method
is only used for becoming primary. (Should also be used for
the "invalidate" command).
Seems to work so far.
Modified: trunk/ROADMAP
===================================================================
--- trunk/ROADMAP 2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/ROADMAP 2006-05-22 15:10:53 UTC (rev 2205)
@@ -775,11 +775,11 @@
* A wait queue.
TODOS:
- Make sure it is used for getting PRIMARY.
Evaluate if it is possible to use it for starting resync. (invalidate)
Evaluate it for the other cases...
- 50 % Is implemented, needs testing etc...
+ 60 % Is implemented. Changing the role to primary already uses this
+ mechanism. Seems to work.
34 Improve the initial hand-shake, to identify the sockets (and TCP-
links) by an initial message, and not only by the connection timing.
Modified: trunk/drbd/drbd_fs.c
===================================================================
--- trunk/drbd/drbd_fs.c 2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_fs.c 2006-05-22 15:10:53 UTC (rev 2205)
@@ -851,8 +851,8 @@
int drbd_set_role(drbd_dev *mdev, int* arg)
{
drbd_role_t newstate = *arg;
- int rv,r,forced = 0;
- drbd_state_t os,ns,rs;
+ int rv,r,forced = 0, try=0;
+ drbd_state_t mask, val;
drbd_disks_t nps;
D_ASSERT(semaphore_is_locked(&mdev->device_mutex));
@@ -875,42 +875,42 @@
bd_release(mdev->this_bdev);
}
- nps = disk_mask;
- retry:
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
- rs.i = os.i;
- rs.role = newstate & role_mask;
- if(nps != disk_mask) rs.pdsk = nps;
- r = _drbd_set_state(mdev, rs, 0);
+ mask.i = 0; mask.role = role_mask;
+ val.i = 0; val.role = newstate & role_mask;
- if ( r == SS_NoConsistnetDisk ) {
- if ( newstate & DontBlameDrbd && mdev->state.disk<UpToDate) {
- rs.disk = UpToDate;
+ while (try++ < 3) {
+ r = _drbd_request_state(mdev,mask,val,0);
+ if( r == SS_NoConsistnetDisk && (newstate & DontBlameDrbd) &&
+ mdev->state.disk < UpToDate) {
+ mask.disk = disk_mask;
+ val.disk = UpToDate;
forced = 1;
- r = _drbd_set_state(mdev, rs, 0);
+ continue;
}
- }
+ if ( r == SS_NothingToDo ) { rv = 0; goto fail; }
+ if ( r == SS_PrimaryNOP ) {
+ nps = drbd_try_outdate_peer(mdev);
- ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ if ( newstate & DontBlameDrbd && nps > Outdated ) {
+ WARN("Forced into split brain situation!\n");
+ nps = Outdated;
+ }
- if ( r == SS_NothingToDo ) { rv = 0; goto fail; }
- if ( r == SS_PrimaryNOP && nps == disk_mask ) {
- nps = drbd_try_outdate_peer(mdev);
- if ( newstate & DontBlameDrbd && nps > Outdated ) {
- WARN("Forced into split brain situation!\n");
- nps = Outdated;
+ mask.pdsk = disk_mask;
+ val.pdsk = nps;
+
+ continue;
}
- goto retry;
+
+ if ( r < SS_Success ) {
+ r = drbd_request_state(mdev,mask,val); // Be verbose.
+ if( r < SS_Success ) {
+ rv = -EIO;
+ goto fail;
+ }
+ }
+ break;
}
- if ( r < SS_Success ) {
- print_st_err(mdev,os,rs,r);
- *arg = r;
- rv = -EIO;
- goto fail;
- }
- after_state_ch(mdev,os,ns);
if(forced) WARN("Forced to conisder local data as UpToDate!\n");
Modified: trunk/drbd/drbd_int.h
===================================================================
--- trunk/drbd/drbd_int.h 2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_int.h 2006-05-22 15:10:53 UTC (rev 2205)
@@ -315,6 +315,7 @@
AuthResponse,
OutdateRequest,
OutdatedReply,
+ StateChgRequest,
Ping, // These are sent on the meta socket...
PingAck,
@@ -325,6 +326,7 @@
NegRSDReply, // Local disk is broken...
BarrierAck,
DiscardNote,
+ StateChgReply,
MAX_CMD,
MayIgnore = 0x100, // Flag only to test if (cmd > MayIgnore) ...
@@ -370,6 +372,8 @@
[NegRSDReply] = "NegRSDReply",
[BarrierAck] = "BarrierAck",
[DiscardNote] = "DiscardNote",
+ [StateChgRequest] = "StateChgRequest",
+ [StateChgReply] = "StateChgReply"
};
if (cmd == HandShake) return "HandShake";
@@ -515,6 +519,17 @@
} __attribute((packed)) Drbd_State_Packet;
typedef struct {
+ Drbd_Header head; /* StateChgRequest packet, built by drbd_send_state_req() */
+ u32 mask; /* mask.i of the requested change, stored with cpu_to_be32() */
+ u32 val; /* val.i of the requested change, stored with cpu_to_be32() */
+} __attribute((packed)) Drbd_Req_State_Packet;
+/*
+ * StateChgReply packet, built by drbd_send_sr_reply() and decoded by
+ * got_RqSReply().
+ */
+typedef struct {
+ Drbd_Header head;
+ u32 retcode; /* result of the requested change, stored with cpu_to_be32() */
+} __attribute((packed)) Drbd_RqS_Reply_Packet;
+
+typedef struct {
u64 size;
u32 state;
u32 blksize;
@@ -649,7 +664,10 @@
MD_DIRTY, // current gen counts and flags not yet on disk
SYNC_STARTED, // Needed to agree on the exact point in time..
UNIQUE, // Set on one node, cleared on the peer!
- USE_DEGR_WFC_T // Use degr-wfc-timeout instead of wfc-timeout.
+ USE_DEGR_WFC_T, // Use degr-wfc-timeout instead of wfc-timeout.
+ CLUSTER_ST_CHANGE, // Cluster wide state change going on...
+ CL_ST_CHG_SUCCESS,
+ CL_ST_CHG_FAIL
};
struct drbd_bitmap; // opaque for Drbd_Conf
@@ -831,6 +849,11 @@
ScheduleAfter = 4,
};
+extern int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f,
+ drbd_state_t mask, drbd_state_t val);
+extern void drbd_force_state(drbd_dev*, drbd_state_t, drbd_state_t);
+extern int _drbd_request_state(drbd_dev*, drbd_state_t, drbd_state_t,
+ enum chg_state_flags);
extern int _drbd_set_state(drbd_dev*, drbd_state_t, enum chg_state_flags );
extern void print_st_err(drbd_dev*, drbd_state_t, drbd_state_t, int );
extern void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns);
@@ -874,6 +897,7 @@
extern int drbd_send_bitmap(drbd_dev *mdev);
extern int _drbd_send_bitmap(drbd_dev *mdev);
extern int drbd_send_discard(drbd_dev *mdev, drbd_request_t *req);
+extern int drbd_send_sr_reply(drbd_dev *mdev, int retcode);
extern void drbd_free_bc(struct drbd_backing_dev* bc);
extern int drbd_io_error(drbd_dev* mdev);
extern void drbd_mdev_cleanup(drbd_dev *mdev);
@@ -1187,34 +1211,22 @@
({drbd_state_t ns; ns.i = mdev->state.i; ns.T1 = (S1); \
ns.T2 = (S2); ns.T3 = (S3); ns;})
-static inline int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f,
- drbd_state_t mask, drbd_state_t val)
+static inline void drbd_state_lock(drbd_dev *mdev)
{
- unsigned long flags;
- drbd_state_t os,ns;
- int rv;
-
- spin_lock_irqsave(&mdev->req_lock,flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f);
- ns = mdev->state;
- spin_unlock_irqrestore(&mdev->req_lock,flags);
- after_state_ch(mdev,os,ns);
-
- return rv;
+ wait_event(mdev->cstate_wait,
+ !test_and_set_bit(CLUSTER_ST_CHANGE,&mdev->flags));
}
-static inline void drbd_force_state(drbd_dev* mdev,
- drbd_state_t mask, drbd_state_t val)
+static inline void drbd_state_unlock(drbd_dev *mdev)
{
- drbd_change_state(mdev,ChgStateHard,mask,val);
+ clear_bit(CLUSTER_ST_CHANGE,&mdev->flags);
+ wake_up(&mdev->cstate_wait);
}
-static inline int drbd_request_state(drbd_dev* mdev,
- drbd_state_t mask, drbd_state_t val)
+static inline int drbd_request_state(drbd_dev* mdev, drbd_state_t mask,
+ drbd_state_t val)
{
- return drbd_change_state(mdev,ChgStateVerbose,mask,val);
+ return _drbd_request_state(mdev, mask, val, ChgStateVerbose);
}
static inline void drbd_req_free(drbd_request_t *req)
Modified: trunk/drbd/drbd_main.c
===================================================================
--- trunk/drbd/drbd_main.c 2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_main.c 2006-05-22 15:10:53 UTC (rev 2205)
@@ -527,6 +527,104 @@
return ok;
}
+/**
+ * cl_wide_st_chg:
+ * Returns TRUE if this state change should be performed as a cluster wide
+ * transaction.
+ *
+ * @mdev: the device (not referenced by the body).
+ * @os:   state before the change.
+ * @ns:   requested state after the change.
+ *
+ * The only condition currently in effect is: ns.conn >= Connected and the
+ * role goes from something other than Primary to Primary. All further
+ * candidate conditions are still commented out.
+ *
+ * NOTE(review): the disabled SyncSource/SyncTarget/Diskless lines test
+ * ns.role although the matching os test uses .conn/.disk -- presumably
+ * ns.conn/ns.disk is intended; confirm before enabling them.
+ */
+STATIC int cl_wide_st_chg(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns)
+{
+ return ( ns.conn >= Connected &&
+ ( ( os.role != Primary && ns.role == Primary ) ||
+ // ( os.conn != SyncSource && ns.role == SyncSource ) ||
+ // ( os.conn != SyncTarget && ns.role == SyncTarget ) ||
+ // ( os.disk != Diskless && ns.role == Diskless ) ||
+ // ( os.conn != TearDown && ns.conn == TearDown ) ||
+ 0
+ ) );
+}
+/**
+ * drbd_change_state:
+ * Merges @val into the bits of mdev->state selected by @mask and hands the
+ * result to _drbd_set_state(), all while holding req_lock.
+ *
+ * @f is passed through to _drbd_set_state() unchanged.
+ * after_state_ch() is then called with the state read before the attempt
+ * and the state found after it, whatever the result was.
+ *
+ * This function itself does not send a StateChgRequest to the peer.
+ * Returns the result of _drbd_set_state().
+ */
+int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f,
+ drbd_state_t mask, drbd_state_t val)
+{
+ unsigned long flags;
+ drbd_state_t os,ns;
+ int rv;
+
+ spin_lock_irqsave(&mdev->req_lock,flags);
+ os = mdev->state;
+ ns.i = (os.i & ~mask.i) | val.i; /* clear the masked bits, then OR in val */
+ rv = _drbd_set_state(mdev, ns, f);
+ ns = mdev->state; /* re-read the state after the attempt */
+ spin_unlock_irqrestore(&mdev->req_lock,flags);
+ after_state_ch(mdev,os,ns); /* outside req_lock, not gated on rv */
+
+ return rv;
+}
+/**
+ * drbd_force_state:
+ * Calls drbd_change_state() with the ChgStateHard flag and discards its
+ * return value.
+ */
+void drbd_force_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val)
+{
+ drbd_change_state(mdev,ChgStateHard,mask,val);
+}
+/**
+ * _req_st_cond:
+ * Wake-up condition evaluated by _drbd_request_state() in its wait_event().
+ *
+ * Returns REQS_SUCCESS or REQS_FAIL when the matching CL_ST_CHG_* flag was
+ * set (the test clears the flag again; got_RqSReply() is what sets them),
+ * REQS_NO_NEED when applying @mask/@val to the current state is no longer
+ * a cluster wide change according to cl_wide_st_chg(), and 0 to keep
+ * waiting.
+ *
+ * mdev->state is read here without taking req_lock.
+ */
+static inline enum { REQS_SUCCESS=1, REQS_FAIL=2, REQS_NO_NEED=3 }
+_req_st_cond(drbd_dev* mdev,drbd_state_t mask, drbd_state_t val)
+{
+ drbd_state_t os,ns;
+
+ if(test_and_clear_bit(CL_ST_CHG_SUCCESS,&mdev->flags)) return REQS_SUCCESS;
+ if(test_and_clear_bit(CL_ST_CHG_FAIL,&mdev->flags)) return REQS_FAIL;
+
+ os = mdev->state;
+ ns.i = (os.i & ~mask.i) | val.i; /* the state the request would produce now */
+
+ if(!cl_wide_st_chg(mdev,os,ns)) return REQS_NO_NEED;
+
+ return 0; /* no reply yet and the request is still cluster wide */
+}
+
+STATIC int drbd_send_state_req(drbd_dev *, drbd_state_t, drbd_state_t);
+/**
+ * _drbd_request_state:
+ * Applies a masked state change; when cl_wide_st_chg() says so, the peer
+ * is asked first.
+ *
+ * @mask/@val: bits to replace in mdev->state and their new values.
+ * @f:         flags passed through to _drbd_set_state().
+ *
+ * Cluster wide path: req_lock is dropped, drbd_state_lock() is taken, a
+ * StateChgRequest is sent and the caller sleeps on cstate_wait until
+ * _req_st_cond() returns non-zero. On REQS_FAIL the function returns
+ * SS_FailedByPeer without touching the local state. Otherwise os/ns are
+ * recomputed from the then-current state under req_lock before the
+ * change is applied locally.
+ *
+ * Returns SS_FailedByPeer or the result of _drbd_set_state().
+ *
+ * NOTE(review): the return value of drbd_send_state_req() is ignored.
+ * NOTE(review): after_state_ch() runs only for rv == SS_Success, whereas
+ * drbd_change_state() calls it unconditionally -- confirm this is intended.
+ * NOTE(review): uses wait_event(), so presumably must not be called from
+ * atomic context.
+ */
+int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val,
+ enum chg_state_flags f)
+{
+ unsigned long flags;
+ drbd_state_t os,ns;
+ int rv;
+
+ spin_lock_irqsave(&mdev->req_lock,flags);
+ os = mdev->state;
+ ns.i = (os.i & ~mask.i) | val.i;
+
+ if(cl_wide_st_chg(mdev,os,ns)) {
+ // TODO do the pre checks here as well ;
+ spin_unlock_irqrestore(&mdev->req_lock,flags); /* the steps below can sleep */
+
+ drbd_state_lock(mdev);
+ drbd_send_state_req(mdev,mask,val);
+
+ wait_event(mdev->cstate_wait,(rv=_req_st_cond(mdev,mask,val))); /* rv is a REQS_* value here */
+
+ if(rv == REQS_FAIL) {
+ drbd_state_unlock(mdev);
+ return SS_FailedByPeer; // Nearly dead code ;)
+ }
+ spin_lock_irqsave(&mdev->req_lock,flags);
+ os = mdev->state; /* state may have changed while req_lock was dropped */
+ ns.i = (os.i & ~mask.i) | val.i;
+ drbd_state_unlock(mdev);
+ }
+
+ rv = _drbd_set_state(mdev, ns, f); /* req_lock is held on both paths */
+ ns = mdev->state;
+ spin_unlock_irqrestore(&mdev->req_lock,flags);
+
+ if (rv == SS_Success) after_state_ch(mdev,os,ns);
+
+ return rv;
+}
+
+
static void print_st(drbd_dev* mdev, char *name, drbd_state_t ns)
{
ERR(" %s = { cs:%s st:%s/%s ds:%s/%s %c%c%c%c }\n",
@@ -615,7 +713,6 @@
return rv;
}
-
int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags)
{
drbd_state_t os;
@@ -1225,6 +1322,28 @@
(Drbd_Header*)&p,sizeof(p));
}
+STATIC int drbd_send_state_req(drbd_dev *mdev, drbd_state_t mask, drbd_state_t val)
+{
+ Drbd_Req_State_Packet p;
+ /*
+  * Sends a StateChgRequest carrying mask.i and val.i, both converted to
+  * big-endian, over the data socket. Returns what drbd_send_cmd()
+  * returns. p.head is not filled in here -- presumably drbd_send_cmd()
+  * does that.
+  */
+ p.mask = cpu_to_be32(mask.i);
+ p.val = cpu_to_be32(val.i);
+
+ return drbd_send_cmd(mdev,mdev->data.socket,StateChgRequest,
+ (Drbd_Header*)&p,sizeof(p));
+}
+/**
+ * drbd_send_sr_reply:
+ * Sends a StateChgReply carrying @retcode (converted to big-endian) over
+ * the meta socket. Returns what drbd_send_cmd() returns.
+ * p.head is not filled in here -- presumably drbd_send_cmd() does that.
+ */
+int drbd_send_sr_reply(drbd_dev *mdev, int retcode)
+{
+ Drbd_RqS_Reply_Packet p;
+
+ p.retcode = cpu_to_be32(retcode);
+
+ return drbd_send_cmd(mdev,mdev->meta.socket,StateChgReply,
+ (Drbd_Header*)&p,sizeof(p));
+}
+
+
/* See the comment at receive_bitmap() */
int _drbd_send_bitmap(drbd_dev *mdev)
{
Modified: trunk/drbd/drbd_receiver.c
===================================================================
--- trunk/drbd/drbd_receiver.c 2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_receiver.c 2006-05-22 15:10:53 UTC (rev 2205)
@@ -1831,8 +1831,9 @@
/* warn if the arguments differ by more than 12.5% */
static void warn_if_differ_considerably(drbd_dev *mdev, const char *s, sector_t a, sector_t b)
{
+ sector_t d;
if (a == 0 || b == 0) return;
- sector_t d = (a > b) ? (a - b) : (b - a);
+ d = (a > b) ? (a - b) : (b - a);
if ( d > (a>>3) || d > (b>>3)) {
WARN("Considerable difference in %s: %llu vs. %llu\n", s,
(unsigned long long)a, (unsigned long long)b);
@@ -1961,7 +1962,67 @@
return TRUE;
}
+/**
+ * convert_state:
+ * Switches the view of the state.
+ *
+ * Starts from a copy of @ps, then swaps role with peer and disk with pdsk,
+ * translates conn through c_tab and sets peer_isp to
+ * (aftr_isp | user_isp) of @ps. All remaining bits are taken over
+ * unchanged. receive_req_state() applies this to both the mask and the
+ * value of an incoming request.
+ *
+ * c_tab entries without an initializer are 0, so any conn value not
+ * listed below converts to 0.
+ *
+ * NOTE(review): WFSyncUUID -> SyncSource -> SyncTarget -> WFSyncUUID is a
+ * three-way cycle, unlike the pairwise swaps of the other entries;
+ * confirm the mapping is as intended.
+ */
+STATIC drbd_state_t convert_state(drbd_state_t ps)
+{
+ drbd_state_t ms;
+ static drbd_conns_t c_tab[] = {
+ [Connected] = Connected,
+ [SkippedSyncS] = SkippedSyncT,
+ [SkippedSyncT] = SkippedSyncS,
+ [WFBitMapS] = WFBitMapT,
+ [WFBitMapT] = WFBitMapS,
+ [WFSyncUUID] = SyncSource,
+ [SyncSource] = SyncTarget,
+ [SyncTarget] = WFSyncUUID,
+ [PausedSyncS] = PausedSyncT,
+ [PausedSyncT] = PausedSyncS,
+ [conn_mask] = conn_mask, /* presumably keeps a full conn field of a mask intact */
+ };
+
+ ms.i = ps.i;
+
+ ms.conn = c_tab[ps.conn]; /* assumes ps.conn <= conn_mask, the largest index -- TODO confirm */
+ ms.peer = ps.role;
+ ms.role = ps.peer;
+ ms.pdsk = ps.disk;
+ ms.disk = ps.pdsk;
+ ms.peer_isp = ( ps.aftr_isp | ps.user_isp );
+
+ return ms;
+}
+/**
+ * receive_req_state:
+ * Receiver handler for StateChgRequest packets.
+ *
+ * Reads the payload, decodes mask and val from big-endian, converts both
+ * with convert_state(), applies them through drbd_change_state() with
+ * ChgStateVerbose and sends the result back with drbd_send_sr_reply().
+ * The change is wrapped in drbd_state_lock()/drbd_state_unlock() only when
+ * the UNIQUE flag is set on this node.
+ *
+ * Returns FALSE if the payload length check fails or the payload cannot
+ * be read in full, TRUE otherwise; the result of drbd_send_sr_reply() is
+ * not checked.
+ */
+STATIC int receive_req_state(drbd_dev *mdev, Drbd_Header *h)
+{
+ Drbd_Req_State_Packet *p = (Drbd_Req_State_Packet*)h;
+ drbd_state_t mask,val;
+ int rv;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; /* payload must be exactly mask + val */
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(UNIQUE,&mdev->flags)) drbd_state_lock(mdev);
+
+ mask = convert_state(mask); /* the request is expressed from the sender's point of view */
+ val = convert_state(val);
+
+ rv = drbd_change_state(mdev,ChgStateVerbose,mask,val);
+
+ if (test_bit(UNIQUE,&mdev->flags)) drbd_state_unlock(mdev);
+
+ drbd_send_sr_reply(mdev,rv); /* reply is sent for success and failure alike */
+
+ return TRUE;
+}
+
STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h)
{
Drbd_State_Packet *p = (Drbd_State_Packet*)h;
@@ -2269,6 +2330,7 @@
[ReportUUIDs] = receive_uuids,
[ReportSizes] = receive_sizes,
[ReportState] = receive_state,
+ [StateChgRequest] = receive_req_state,
[ReportSyncUUID] = receive_sync_uuid,
[PauseResync] = receive_pause_resync,
[ResumeResync] = receive_resume_resync,
@@ -2751,6 +2813,24 @@
/* ********* acknowledge sender ******** */
+STATIC int got_RqSReply(drbd_dev *mdev, Drbd_Header* h)
+{
+ Drbd_RqS_Reply_Packet *p = (Drbd_RqS_Reply_Packet*)h;
+ /*
+  * Handler for StateChgReply packets: records the peer's verdict in the
+  * CL_ST_CHG_SUCCESS or CL_ST_CHG_FAIL flag and wakes everyone sleeping
+  * on cstate_wait, where _req_st_cond() picks the flag up. Always
+  * returns TRUE.
+  */
+ int retcode = be32_to_cpu(p->retcode);
+
+ if(retcode >= SS_Success) {
+ set_bit(CL_ST_CHG_SUCCESS,&mdev->flags);
+ } else {
+ set_bit(CL_ST_CHG_FAIL,&mdev->flags);
+ ERR("Requested state change failed by peer: %s\n",
+ set_st_err_name(retcode));
+ }
+ wake_up(&mdev->cstate_wait);
+
+ return TRUE;
+}
+
STATIC int got_Ping(drbd_dev *mdev, Drbd_Header* h)
{
return drbd_send_ping_ack(mdev);
@@ -2940,6 +3020,7 @@
[NegRSDReply]={sizeof(Drbd_BlockAck_Packet), got_NegRSDReply},
[BarrierAck]={ sizeof(Drbd_BarrierAck_Packet),got_BarrierAck },
[DiscardNote]={sizeof(Drbd_Discard_Packet), got_Discard },
+ [StateChgReply]={sizeof(Drbd_RqS_Reply_Packet),got_RqSReply },
};
sprintf(current->comm, "drbd%d_asender", (int)(mdev-drbd_conf));
Modified: trunk/drbd/drbd_strings.c
===================================================================
--- trunk/drbd/drbd_strings.c 2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/drbd_strings.c 2006-05-22 15:10:53 UTC (rev 2205)
@@ -68,6 +68,7 @@
[-SS_SyncingDiskless] = "Refusing to be syncing and diskless",
[-SS_ConnectedOutdates] = "Refusing to be Outdated while Connected",
[-SS_PrimaryNOP] = "Refusing to be Primary while peer is not outdated",
+ [-SS_FailedByPeer] = "State changed was refused by peer node"
};
const char* conns_to_name(drbd_conns_t s) {
Modified: trunk/drbd/linux/drbd.h
===================================================================
--- trunk/drbd/linux/drbd.h 2006-05-22 11:27:56 UTC (rev 2204)
+++ trunk/drbd/linux/drbd.h 2006-05-22 15:10:53 UTC (rev 2205)
@@ -217,7 +217,7 @@
NetworkFailure,
WFConnection,
WFReportParams, // we have a socket
- TearDown,
+ TearDown,
Connected, // we have introduced each other
SkippedSyncS, // we should have synced, but user said no
SkippedSyncT,
@@ -268,7 +268,8 @@
SS_BothInconsistent=-4,
SS_SyncingDiskless=-5,
SS_ConnectedOutdates=-6,
- SS_PrimaryNOP=-7
+ SS_PrimaryNOP=-7,
+ SS_FailedByPeer=-8
} set_st_err_t;
/* from drbd_strings.c */
More information about the drbd-cvs
mailing list