[DRBD-cvs] r1687 - in trunk: . drbd scripts user

svn at svn.drbd.org svn at svn.drbd.org
Thu Dec 23 15:18:11 CET 2004


Author: phil
Date: 2004-12-23 15:18:08 +0100 (Thu, 23 Dec 2004)
New Revision: 1687

Modified:
   trunk/ROADMAP
   trunk/drbd/drbd_fs.c
   trunk/drbd/drbd_main.c
   trunk/scripts/Makefile
   trunk/scripts/drbd.conf
   trunk/scripts/outdate-peer.sh
   trunk/user/drbdadm_main.c
   trunk/user/drbdsetup.c
Log:
Some testing of the "outdate-peer" user-land-callback...
This is really great stuff!


Modified: trunk/ROADMAP
===================================================================
--- trunk/ROADMAP	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/ROADMAP	2004-12-23 14:18:08 UTC (rev 1687)
@@ -92,23 +92,36 @@
   99% DONE
 
 7 Handle split brain situations; Support IO fencing; 
-  introduce the "Dead" peer state (o_state)
-
+  
   New commands:
-    drbdadm resume r0
     drbdadm outdate r0
-    drbdadm suspend r0
 
+    When the device is configured this works via an ioctl() call.
+    In the other case it modifies the meta data directly by 
+    calling drbdmeta.
+
   remove option value: on-disconnect=suspend_io
 
+  New meta-data flag: "Outdated"
+
   introduce:
+  disk {
     split-brain-fix;
-    suspend-io-split-brain;pdsk-unknown;
-    peer-state-unknown=suspend_io
-    peer-state-unknown=continue_io
+  }
 
-  New meta-data flag: "Outdated"
+  handlers {
+    outdate-peer "some script";
+  }
 
+  If the disk state of the peer is unknown, drbd calls this 
+  handler (yes a call to userspace from kernel space). The handler's
+  returncodes are:
+
+  3 -> peer is inconsistent
+  4 -> peer is outdated (presumably this handler outdated it)
+  5 -> peer was down / unreachable
+  6 -> peer is primary
+
   Let us assume that we have two boxes (N1 and N2) and that these
   two boxes are connected by two networks (net and cnet [ clinets'-net ]).
 
@@ -127,18 +140,21 @@
                     N1 asks N2 to fence itself from the storage via cnet.
                     HB calls "drbdadm outdate r0" on N2.
                     N2 replies to N1 that fencing is done via cnet.
-                    N1 calls "drbdadm resume r0".
+                    The outdate-peer script on N1 returns success to DRBD.
    P/D - -  S/?     N1 thaws IO
 
   N2 got the the "Outdated" flag set in its meta-data, by the outdate 
-  command. The suspend command is here to make the interface 
-  complete, and to reach the freezed state without th need to
-  disconnect the peer. It might turn out to be usefull for other
-  tasks as well.
+  command. 
 
-  Eventually introduce on-disconnent-cmd "command";
-  33% DONE
+  The "split-brain-fix" option enables this behaviour. If this option is
+  omitted, the handler is not called and IO is not frozen on disconnect.
 
+  Eventually introduce a "suspend" and a "resume" command
+  to reach the frozen state without the need to disconnect the peer.
+  It might turn out to be useful for other tasks as well.
+
+  66% DONE / TODO: IO freezing is not done yet.
+
 8 New command drbdmeta
 
   We move the read_gc.pl/write_gc.pl to the user directory. 

Modified: trunk/drbd/drbd_fs.c
===================================================================
--- trunk/drbd/drbd_fs.c	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/drbd/drbd_fs.c	2004-12-23 14:18:08 UTC (rev 1687)
@@ -623,7 +623,7 @@
 
 	r=drbd_khelper(mdev,"outdate-peer");
 
-	switch( (r>>8) && 0xff ) {
+	switch( (r>>8) & 0xff ) {
 	case 3: /* peer is inconsistent */
 		nps = Inconsistent;
 		break;
@@ -634,14 +634,15 @@
 		drbd_md_inc(mdev,ConnectedCnt);
 		nps = Outdated;
 		break;
-	case 6: /* Peer is primary */
+	case 6: /* Peer is primary, voluntarily outdate myself */
 		nps = DUnknown;
+		drbd_request_state(mdev,NS(disk,Outdated));
 		break;
 	default:
 		/* The script is broken ... */
 		drbd_md_inc(mdev,ConnectedCnt);
 		nps = DUnknown;
-		ERR("outdate-peer helper returned %d (%d)\n",(r>>8)&&0xff,r);
+		ERR("outdate-peer helper returned %d (%d)\n",(r>>8)&0xff,r);
 	}
 
 	return nps;
@@ -700,12 +701,9 @@
 	if ( r == -7 ) {
 		drbd_disks_t nps = drbd_try_outdate_peer(mdev);
 		r = drbd_request_state(mdev,NS2(role,newstate & 0x3,pdsk,nps));
-	}
-	if ( r <= 0) { 
-		print_st_err(mdev,os,ns,r);
-		return -EACCES; 
-	}
+	} else if ( r <= 0 ) print_st_err(mdev,os,ns,r);
 
+	if ( r <= 0 ) return -EACCES; 
 
 	if (mdev->state.s.conn >= Connected) {
 		/* do NOT increase the Human count if we are connected,

Modified: trunk/drbd/drbd_main.c
===================================================================
--- trunk/drbd/drbd_main.c	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/drbd/drbd_main.c	2004-12-23 14:18:08 UTC (rev 1687)
@@ -503,7 +503,7 @@
 			ns.s.pdsk = Outdated;
 			break;
 		case SyncSource:
-			ns.s.disk = Inconsistent;
+			ns.s.pdsk = Inconsistent;
 			WARN("Implicit set pdsk Inconsistent!\n");
 			break;
 		}
@@ -521,7 +521,7 @@
 			 ns.s.disk <= Outdated ) rv=-2;
 
 		else if( ns.s.role == Primary && ns.s.conn < Connected &&
-			 ns.s.pdsk >= Unknown ) rv=-7;
+			 ns.s.pdsk >= DUnknown ) rv=-7;
 
 		else if( ns.s.role == Primary && ns.s.disk <= Inconsistent && 
 			 ns.s.pdsk <= Inconsistent ) rv=-2;

Modified: trunk/scripts/Makefile
===================================================================
--- trunk/scripts/Makefile	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/scripts/Makefile	2004-12-23 14:18:08 UTC (rev 1687)
@@ -57,6 +57,8 @@
 		install -m 644 drbd.conf $(PREFIX)/etc/; \
 	fi
 	install -m 755 drbddisk $(PREFIX)/etc/ha.d/resource.d
+	install -d $(PREFIX)/usr/lib/drbd
+	install -m 755 outdate-peer.sh $(PREFIX)/usr/lib/drbd
 ifeq ($(DIST),suselike)
 	ln -sf ../etc/init.d/drbd $(PREFIX)/sbin/rcdrbd
 endif

Modified: trunk/scripts/drbd.conf
===================================================================
--- trunk/scripts/drbd.conf	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/scripts/drbd.conf	2004-12-23 14:18:08 UTC (rev 1687)
@@ -127,8 +127,11 @@
     # than we have, and we are primary.
     pri-sees-sec-with-higher-gc "halt -f";
 
-    # Commands to run in case we loose connection. Use this script in 
-    # in conjunction with the on-disconnect=suspend_io configuration.
+    # Commands to run in case we need to downgrade the peer's disk 
+    # state to "Outdated". Should be implemented by the superior
+    # communication possibilities of our cluster manager.
+    # The provided script uses ssh, and is for demonstration/development
+    # purposes.
     outdate-peer "/usr/lib/drbd/outdate-peer.sh on amd 192.168.22.11 192.168.23.11 on alf 192.168.22.12 192.168.23.12";
   }
 
@@ -159,6 +162,14 @@
     #                 continues in disk less mode.
     #
     on-io-error   detach;
+
+    # Enables the use of the outdate-peer handler, as well as freezing
+    # of IO while we are primary and the peer's disk state is unknown.
+    #  The outdate-peer handler is then used to resolve the situation
+    #  as quickly as possible.
+    # BTW, becoming primary on a disconnected node may also trigger the
+    # execution of the outdate-peer handler.
+    # split-brain-fix; 
   }
 
   net {

Modified: trunk/scripts/outdate-peer.sh
===================================================================
--- trunk/scripts/outdate-peer.sh	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/scripts/outdate-peer.sh	2004-12-23 14:18:08 UTC (rev 1687)
@@ -41,24 +41,25 @@
 TIMEOUT=6
 
 for P in "$@"; do
-    if [ "$EXP_PEER_IP" = "1" ]; then 
-	PEER_IP="$PEER_IP $P"
-    fi;
-    if [ "$EXP_OWN_IP" = "1" ]; then 
-	OWN_IP="$OWN_IP $P"
-    fi;
-    if [ "$EXP_HOST_NAME" = "1" ]; then 
-	if [ "$P" != `uname -n` ]; then 
-	    EXP_PEER_IP=1
-	else
-	    EXP_OWN_IP=1
-	fi
-	EXP_HOST_NAME=0
-    fi
     if [ "$P" = "on" ]; then 
 	EXP_HOST_NAME=1
 	EXP_PEER_IP=0
 	EXP_OWN_IP=0
+    else
+	if [ "$EXP_PEER_IP" = "1" ]; then 
+	    PEER_IP="$PEER_IP $P"
+	fi;
+	if [ "$EXP_OWN_IP" = "1" ]; then 
+	    OWN_IP="$OWN_IP $P"
+	fi;
+	if [ "$EXP_HOST_NAME" = "1" ]; then 
+	    if [ "$P" != `uname -n` ]; then 
+		EXP_PEER_IP=1
+	    else
+		EXP_OWN_IP=1
+	    fi
+	    EXP_HOST_NAME=0
+	fi
     fi
 done
 
@@ -74,12 +75,13 @@
 
 
 SSH_CMDS_RUNNING=1
-while [ "$SSH_CMDS_RUNNING" = "1" ] ; do
+while [ "$SSH_CMDS_RUNNING" = "1" ] && [ $TIMEOUT -gt 0 ]; do
     sleep 1
     SSH_CMDS_RUNNING=0
     for P in $SSH_PID; do
 	if [ -d /proc/$P ]; then SSH_CMDS_RUNNING=1; fi
     done
+    TIMEOUT=$(( $TIMEOUT - 1 ))
 done
 
 RV=5
@@ -94,11 +96,11 @@
 	# exit codes of drbdmeata outdate:
 	# 5  -> is inconsistent
 	# 0  -> is outdated
-	# 20 -> outdate failed because peer is primary.
+	# 17 -> outdate failed because peer is primary.
 	# Unfortunately 20 can have other reasons too....
 
 	if [ $EXIT_CODE -eq 5 ]; then RV=3; else
-	    if [ $EXIT_CODE -eq 27 ]; then RV=6; else
+	    if [ $EXIT_CODE -eq 17 ]; then RV=6; else
 		if [ $EXIT_CODE -eq 0 ]; then RV=4; else
 		    echo "do not know about this exit code"
 		fi

Modified: trunk/user/drbdadm_main.c
===================================================================
--- trunk/user/drbdadm_main.c	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/user/drbdadm_main.c	2004-12-23 14:18:08 UTC (rev 1687)
@@ -613,7 +613,11 @@
 static int adm_generic_b(struct d_resource* res,const char* cmd)
 {
   int rv;
-  if( (rv=adm_generic(res,cmd,SLEEPS_SHORT|SUPRESS_STDERR)) ) {
+
+  rv=adm_generic(res,cmd,SLEEPS_SHORT|SUPRESS_STDERR);
+  if(rv == 17) return rv;
+
+  if( rv ) {
     rv = admm_generic(res,cmd);
   }
   return rv;
@@ -1311,9 +1315,9 @@
 	  fprintf(stderr,"'%s' not defined in your config.\n",argv[i]);
 	  exit(E_usage);
 	found:
-	  if( (rv=cmd->function(res,cmd->name)) >= 10 ) {
+	  if( (rv=cmd->function(res,cmd->name)) >= 20 ) {
 	    fprintf(stderr,"drbdadm aborting\n");
-	    exit(E_exec_error);
+	    exit(rv);
 	  }
 	}
       }

Modified: trunk/user/drbdsetup.c
===================================================================
--- trunk/user/drbdsetup.c	2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/user/drbdsetup.c	2004-12-23 14:18:08 UTC (rev 1687)
@@ -980,6 +980,7 @@
       if(err==EIO) 
 	{
 	  fprintf(stderr,"%s\n",set_st_err_name(reason));
+	  if(reason == -2) return 17;
 	}
       return 20;
     }



More information about the drbd-cvs mailing list