[DRBD-cvs] r1687 - in trunk: . drbd scripts user
svn at svn.drbd.org
svn at svn.drbd.org
Thu Dec 23 15:18:11 CET 2004
Author: phil
Date: 2004-12-23 15:18:08 +0100 (Thu, 23 Dec 2004)
New Revision: 1687
Modified:
trunk/ROADMAP
trunk/drbd/drbd_fs.c
trunk/drbd/drbd_main.c
trunk/scripts/Makefile
trunk/scripts/drbd.conf
trunk/scripts/outdate-peer.sh
trunk/user/drbdadm_main.c
trunk/user/drbdsetup.c
Log:
Some testing of the "outdate-peer" user-land-callback...
This is really great stuff!
Modified: trunk/ROADMAP
===================================================================
--- trunk/ROADMAP 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/ROADMAP 2004-12-23 14:18:08 UTC (rev 1687)
@@ -92,23 +92,36 @@
99% DONE
7 Handle split brain situations; Support IO fencing;
- introduce the "Dead" peer state (o_state)
-
+
New commands:
- drbdadm resume r0
drbdadm outdate r0
- drbdadm suspend r0
+ When the device is configured this works via an ioctl() call.
+ In the other case it modifies the meta data directly by
+ calling drbdmeta.
+
remove option value: on-disconnect=suspend_io
+ New meta-data flag: "Outdated"
+
introduce:
+ disk {
split-brain-fix;
- suspend-io-split-brain;pdsk-unknown;
- peer-state-unknown=suspend_io
- peer-state-unknown=continue_io
+ }
- New meta-data flag: "Outdated"
+ handlers {
+ outdate-peer "some script";
+ }
+ If the disk state of the peer is unknown, drbd calls this
+ handler (yes a call to userspace from kernel space). The handler's
+ returncodes are:
+
+ 3 -> peer is inconsistent
+ 4 -> peer is outdated (presumably this handler outdated it)
+ 5 -> peer was down / unreachable
+ 6 -> peer is primary
+
Let us assume that we have two boxes (N1 and N2) and that these
two boxes are connected by two networks (net and cnet [ clients'-net ]).
@@ -127,18 +140,21 @@
N1 asks N2 to fence itself from the storage via cnet.
HB calls "drbdadm outdate r0" on N2.
N2 replies to N1 that fencing is done via cnet.
- N1 calls "drbdadm resume r0".
+ The outdate-peer script on N1 returns success to DRBD.
P/D - - S/? N1 thaws IO
N2 got the "Outdated" flag set in its meta-data, by the outdate
- command. The suspend command is here to make the interface
- complete, and to reach the freezed state without th need to
- disconnect the peer. It might turn out to be usefull for other
- tasks as well.
+ command.
- Eventually introduce on-disconnent-cmd "command";
- 33% DONE
+ The "split-brain-fix" option enables this behaviour. If this option is
+ omitted, the handler is not called and IO is not frozen on disconnect.
+ Eventually introduce a "suspend" and a "resume" command to
+ reach the frozen state without the need to disconnect the peer.
+ It might turn out to be useful for other tasks as well.
+
+ 66% DONE / TODO: IO freezing is not done yet.
+
8 New command drbdmeta
We move the read_gc.pl/write_gc.pl to the user directory.
Modified: trunk/drbd/drbd_fs.c
===================================================================
--- trunk/drbd/drbd_fs.c 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/drbd/drbd_fs.c 2004-12-23 14:18:08 UTC (rev 1687)
@@ -623,7 +623,7 @@
r=drbd_khelper(mdev,"outdate-peer");
- switch( (r>>8) && 0xff ) {
+ switch( (r>>8) & 0xff ) {
case 3: /* peer is inconsistent */
nps = Inconsistent;
break;
@@ -634,14 +634,15 @@
drbd_md_inc(mdev,ConnectedCnt);
nps = Outdated;
break;
- case 6: /* Peer is primary */
+ case 6: /* Peer is primary, voluntarily outdate myself */
nps = DUnknown;
+ drbd_request_state(mdev,NS(disk,Outdated));
break;
default:
/* The script is broken ... */
drbd_md_inc(mdev,ConnectedCnt);
nps = DUnknown;
- ERR("outdate-peer helper returned %d (%d)\n",(r>>8)&&0xff,r);
+ ERR("outdate-peer helper returned %d (%d)\n",(r>>8)&0xff,r);
}
return nps;
@@ -700,12 +701,9 @@
if ( r == -7 ) {
drbd_disks_t nps = drbd_try_outdate_peer(mdev);
r = drbd_request_state(mdev,NS2(role,newstate & 0x3,pdsk,nps));
- }
- if ( r <= 0) {
- print_st_err(mdev,os,ns,r);
- return -EACCES;
- }
+ } else if ( r <= 0 ) print_st_err(mdev,os,ns,r);
+ if ( r <= 0 ) return -EACCES;
if (mdev->state.s.conn >= Connected) {
/* do NOT increase the Human count if we are connected,
Modified: trunk/drbd/drbd_main.c
===================================================================
--- trunk/drbd/drbd_main.c 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/drbd/drbd_main.c 2004-12-23 14:18:08 UTC (rev 1687)
@@ -503,7 +503,7 @@
ns.s.pdsk = Outdated;
break;
case SyncSource:
- ns.s.disk = Inconsistent;
+ ns.s.pdsk = Inconsistent;
WARN("Implicit set pdsk Inconsistent!\n");
break;
}
@@ -521,7 +521,7 @@
ns.s.disk <= Outdated ) rv=-2;
else if( ns.s.role == Primary && ns.s.conn < Connected &&
- ns.s.pdsk >= Unknown ) rv=-7;
+ ns.s.pdsk >= DUnknown ) rv=-7;
else if( ns.s.role == Primary && ns.s.disk <= Inconsistent &&
ns.s.pdsk <= Inconsistent ) rv=-2;
Modified: trunk/scripts/Makefile
===================================================================
--- trunk/scripts/Makefile 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/scripts/Makefile 2004-12-23 14:18:08 UTC (rev 1687)
@@ -57,6 +57,8 @@
install -m 644 drbd.conf $(PREFIX)/etc/; \
fi
install -m 755 drbddisk $(PREFIX)/etc/ha.d/resource.d
+ install -d $(PREFIX)/usr/lib/drbd
+ install -m 755 outdate-peer.sh $(PREFIX)/usr/lib/drbd
ifeq ($(DIST),suselike)
ln -sf ../etc/init.d/drbd $(PREFIX)/sbin/rcdrbd
endif
Modified: trunk/scripts/drbd.conf
===================================================================
--- trunk/scripts/drbd.conf 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/scripts/drbd.conf 2004-12-23 14:18:08 UTC (rev 1687)
@@ -127,8 +127,11 @@
# than we have, and we are primary.
pri-sees-sec-with-higher-gc "halt -f";
- # Commands to run in case we loose connection. Use this script in
- # in conjunction with the on-disconnect=suspend_io configuration.
+ # Commands to run in case we need to downgrade the peer's disk
+ # state to "Outdated". Should be implemented by the superior
+ # communication possibilities of our cluster manager.
+ # The provided script uses ssh, and is for demonstration/development
+ # purposes.
outdate-peer "/usr/lib/drbd/outdate-peer.sh on amd 192.168.22.11 192.168.23.11 on alf 192.168.22.12 192.168.23.12";
}
@@ -159,6 +162,14 @@
# continues in disk less mode.
#
on-io-error detach;
+
+ # Enables the use of the outdate-peer handler, as well as freezing
+ # of IO while we are primary and the peer's disk state is unknown.
+ # The outdate-peer handler is then used to resolve the situation
+ # as quickly as possible.
+ # BTW, becoming primary on a disconnected node may also trigger the
+ # execution of the outdate-peer handler.
+ # split-brain-fix;
}
net {
Modified: trunk/scripts/outdate-peer.sh
===================================================================
--- trunk/scripts/outdate-peer.sh 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/scripts/outdate-peer.sh 2004-12-23 14:18:08 UTC (rev 1687)
@@ -41,24 +41,25 @@
TIMEOUT=6
for P in "$@"; do
- if [ "$EXP_PEER_IP" = "1" ]; then
- PEER_IP="$PEER_IP $P"
- fi;
- if [ "$EXP_OWN_IP" = "1" ]; then
- OWN_IP="$OWN_IP $P"
- fi;
- if [ "$EXP_HOST_NAME" = "1" ]; then
- if [ "$P" != `uname -n` ]; then
- EXP_PEER_IP=1
- else
- EXP_OWN_IP=1
- fi
- EXP_HOST_NAME=0
- fi
if [ "$P" = "on" ]; then
EXP_HOST_NAME=1
EXP_PEER_IP=0
EXP_OWN_IP=0
+ else
+ if [ "$EXP_PEER_IP" = "1" ]; then
+ PEER_IP="$PEER_IP $P"
+ fi;
+ if [ "$EXP_OWN_IP" = "1" ]; then
+ OWN_IP="$OWN_IP $P"
+ fi;
+ if [ "$EXP_HOST_NAME" = "1" ]; then
+ if [ "$P" != `uname -n` ]; then
+ EXP_PEER_IP=1
+ else
+ EXP_OWN_IP=1
+ fi
+ EXP_HOST_NAME=0
+ fi
fi
done
@@ -74,12 +75,13 @@
SSH_CMDS_RUNNING=1
-while [ "$SSH_CMDS_RUNNING" = "1" ] ; do
+while [ "$SSH_CMDS_RUNNING" = "1" ] && [ $TIMEOUT -gt 0 ]; do
sleep 1
SSH_CMDS_RUNNING=0
for P in $SSH_PID; do
if [ -d /proc/$P ]; then SSH_CMDS_RUNNING=1; fi
done
+ TIMEOUT=$(( $TIMEOUT - 1 ))
done
RV=5
@@ -94,11 +96,11 @@
# exit codes of drbdmeta outdate:
# 5 -> is inconsistent
# 0 -> is outdated
- # 20 -> outdate failed because peer is primary.
+ # 17 -> outdate failed because peer is primary.
# Unfortunately 20 can have other reasons too....
if [ $EXIT_CODE -eq 5 ]; then RV=3; else
- if [ $EXIT_CODE -eq 27 ]; then RV=6; else
+ if [ $EXIT_CODE -eq 17 ]; then RV=6; else
if [ $EXIT_CODE -eq 0 ]; then RV=4; else
echo "do not know about this exit code"
fi
Modified: trunk/user/drbdadm_main.c
===================================================================
--- trunk/user/drbdadm_main.c 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/user/drbdadm_main.c 2004-12-23 14:18:08 UTC (rev 1687)
@@ -613,7 +613,11 @@
static int adm_generic_b(struct d_resource* res,const char* cmd)
{
int rv;
- if( (rv=adm_generic(res,cmd,SLEEPS_SHORT|SUPRESS_STDERR)) ) {
+
+ rv=adm_generic(res,cmd,SLEEPS_SHORT|SUPRESS_STDERR);
+ if(rv == 17) return rv;
+
+ if( rv ) {
rv = admm_generic(res,cmd);
}
return rv;
@@ -1311,9 +1315,9 @@
fprintf(stderr,"'%s' not defined in your config.\n",argv[i]);
exit(E_usage);
found:
- if( (rv=cmd->function(res,cmd->name)) >= 10 ) {
+ if( (rv=cmd->function(res,cmd->name)) >= 20 ) {
fprintf(stderr,"drbdadm aborting\n");
- exit(E_exec_error);
+ exit(rv);
}
}
}
Modified: trunk/user/drbdsetup.c
===================================================================
--- trunk/user/drbdsetup.c 2004-12-23 11:06:45 UTC (rev 1686)
+++ trunk/user/drbdsetup.c 2004-12-23 14:18:08 UTC (rev 1687)
@@ -980,6 +980,7 @@
if(err==EIO)
{
fprintf(stderr,"%s\n",set_st_err_name(reason));
+ if(reason == -2) return 17;
}
return 20;
}
More information about the drbd-cvs
mailing list