[DRBD-cvs] r1664 - in trunk: . drbd user

Mon Nov 29 16:32:44 CET 2004

Author: phil
Date: 2004-11-29 16:32:41 +0100 (Mon, 29 Nov 2004)
New Revision: 1664

Modified:
   trunk/
   trunk/Makefile
   trunk/drbd/Makefile-2.6
   trunk/drbd/drbd_bitmap.c
   trunk/drbd/drbd_int.h
   trunk/drbd/drbd_main.c
   trunk/drbd/drbd_receiver.c
   trunk/user/drbdadm_main.c
Log:
svnp run. Investigated 1654 to 1664

r1656 by lars on 2004-11-25 01:52:01 +0100 (Thu, 25 Nov 2004) 
  Changed paths:
     M /branches/drbd-0.7/Makefile
  
  removed debian/ subdir from .filelist,
  and thus from tarball and tgz make targets

r1657 by phil on 2004-11-25 09:09:10 +0100 (Thu, 25 Nov 2004) 
  Changed paths:
     M /branches/drbd-0.7/drbd/drbd_bitmap.c
  
  drbd_bm_total_weight() is called from several
       places with IRQs disabled! Therefore it has to
       use the spin_*_irqsave|irqrestore() variants.
       ...from  __drbd_set_in_sync(), _drbd_rs_resume(),
       _drbd_rs_pause()
  

r1658 by phil on 2004-11-26 14:54:36 +0100 (Fri, 26 Nov 2004) 
  Changed paths:
     M /branches/drbd-0.7/drbd/drbd_receiver.c
  
  There was this schedule_timeout() without set_current_state().
  
  

r1659 by phil on 2004-11-26 15:46:08 +0100 (Fri, 26 Nov 2004) 
  Changed paths:
     M /branches/drbd-0.7/drbd/drbd_int.h
     M /branches/drbd-0.7/drbd/drbd_main.c
     M /branches/drbd-0.7/drbd/drbd_receiver.c
  
  The test for (rs_total != 0) was not okay. If sync was
  started with the "invalidate" command on the Secondary,
  it (rs_total) is set to a positive value, just after we
  send the BecomeSyncSource packet to the peer. 
  
  The peer(Primary) writes its whole bitmap and sets 
  rs_total to a positive value afterwards. 
  
  Writes that happen on the Primary while it writes its
  bitmap are mirrored to the Secondary (where they are
  marked as clean in the BitMap, since rs_total is already
  set here), but when the ACK comes back to the Primary
  rs_total is still 0 here.
  
  With this patch that problem is fixed: both nodes consider
  the sync to be started when the SYNC_STARTED bit is
  set on both nodes, and this happens with the first
  ACK of the first RSDATA packet... So they agree on the
  same logical point in time.
  
  

r1660 by phil on 2004-11-29 09:46:01 +0100 (Mon, 29 Nov 2004) 
  Changed paths:
     M /branches/drbd-0.7/user/drbdadm_main.c
  
  Well, there are cases where the 60 seconds are not
  enough. Made it 120 Seconds...
  
  

r1661 by phil on 2004-11-29 12:15:50 +0100 (Mon, 29 Nov 2004) 
  Changed paths:
     M /branches/drbd-0.7/drbd/Makefile-2.6
     M /branches/drbd-0.7/drbd/drbd_receiver.c
  
  Removed the "struct list_head *head" parameter from 
  the drbd_process_ee() function, added a "be_sleepy"
  flag argument. 
  
  With this flag, we avoid that the asender
  thread eventually sleeps in drbd_process_ee().
  
  This change makes a strange lockup condition go
  away... where receiver and asender were sleeping
  in drbd_process_ee().
  
  I do not know exactly how the lockup begins, but
  with this change it can not lock up.
  
  



Property changes on: trunk
___________________________________________________________________
Name: propagate:at
   - 1654
   + 1664

Modified: trunk/Makefile
===================================================================

--- trunk/Makefile	2004-11-29 12:45:29 UTC (rev 1663)
+++ trunk/Makefile	2004-11-29 15:32:41 UTC (rev 1664)
@@ -120,7 +120,7 @@
 .PHONY: .filelist
 .filelist:
 	@ svn info >/dev/null || { echo "you need a svn checkout to do this." ; false ; }
-	@find $$(svn st -vq | sed 's/^.\{8\} \+[0-9]\+ \+[0-9]\+ [a-z]\+ *//;') \
+	@find $$(svn st -v | sed '/^?/d;s/^.\{8\} \+[0-9]\+ \+[0-9]\+ [a-z]\+ *//;/^debian/d' ) \
 	\! -type d -maxdepth 0 |\
 	sed 's:^:drbd-$(DIST_VERSION)/:' > .filelist
 	@[ -s .filelist ] # assert there is something in .filelist now

Modified: trunk/drbd/Makefile-2.6
===================================================================
--- trunk/drbd/Makefile-2.6	2004-11-29 12:45:29 UTC (rev 1663)
+++ trunk/drbd/Makefile-2.6	2004-11-29 15:32:41 UTC (rev 1664)
@@ -1,4 +1,4 @@
-CFLAGS_drbd_sizeof_sanity_check.o = -Wpadded # -Werror
+#CFLAGS_drbd_sizeof_sanity_check.o = -Wpadded # -Werror
 
 drbd-objs  :=	drbd_sizeof_sanity_check.o \
 		drbd_buildtag.o drbd_bitmap.o drbd_fs.o drbd_proc.o \

Modified: trunk/drbd/drbd_bitmap.c
===================================================================
--- trunk/drbd/drbd_bitmap.c	2004-11-29 12:45:29 UTC (rev 1663)
+++ trunk/drbd/drbd_bitmap.c	2004-11-29 15:32:41 UTC (rev 1664)
@@ -385,13 +385,14 @@
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long s;
+	unsigned long flags;
 
 	D_BUG_ON(!(b && b->bm));
 	// MUST_BE_LOCKED(); well. yes. but ...
 
-	spin_lock_irq(&b->bm_lock);
+	spin_lock_irqsave(&b->bm_lock,flags);
 	s = b->bm_set;
-	spin_unlock_irq(&b->bm_lock);
+	spin_unlock_irqrestore(&b->bm_lock,flags);
 
 	return s;
 }

Modified: trunk/drbd/drbd_int.h
===================================================================
--- trunk/drbd/drbd_int.h	2004-11-29 12:45:29 UTC (rev 1663)
+++ trunk/drbd/drbd_int.h	2004-11-29 15:32:41 UTC (rev 1664)
@@ -625,6 +625,7 @@
 	MD_IO_ALLOWED,		// EXPLAIN
 	SENT_DISK_FAILURE,	// sending it once is enough
 	MD_DIRTY,		// current gen counts and flags not yet on disk
+	SYNC_STARTED,		// Needed to agree on the exact point in time..
 };
 
 struct drbd_bitmap; // opaque for Drbd_Conf

Modified: trunk/drbd/drbd_main.c
===================================================================
--- trunk/drbd/drbd_main.c	2004-11-29 12:45:29 UTC (rev 1663)
+++ trunk/drbd/drbd_main.c	2004-11-29 15:32:41 UTC (rev 1664)
@@ -525,6 +525,7 @@
 
 	/**   post-state-change actions   **/
 	if ( os.s.conn >= SyncSource   && ns.s.conn <= Connected ) {
+		clear_bit(SYNC_STARTED,&mdev->flags);
 		set_bit(STOP_SYNC_TIMER,&mdev->flags);
 		mod_timer(&mdev->resync_timer,jiffies);
 	}

Modified: trunk/drbd/drbd_receiver.c
===================================================================
--- trunk/drbd/drbd_receiver.c	2004-11-29 12:45:29 UTC (rev 1663)
+++ trunk/drbd/drbd_receiver.c	2004-11-29 15:32:41 UTC (rev 1664)
@@ -237,7 +237,7 @@
 
 #define GFP_TRY	( __GFP_HIGHMEM | __GFP_NOWARN )
 
-STATIC int _drbd_process_ee(drbd_dev *mdev,struct list_head *head);
+STATIC int _drbd_process_ee(drbd_dev *mdev, int be_sleepy);
 
 /**
  * drbd_get_ee: Returns an Tl_epoch_entry; might sleep. Fails only if
@@ -257,7 +257,7 @@
 		spin_lock_irq(&mdev->ee_lock);
 	}
 
-	if(list_empty(&mdev->free_ee)) _drbd_process_ee(mdev,&mdev->done_ee);
+	if(list_empty(&mdev->free_ee)) _drbd_process_ee(mdev,1);
 
 	if(list_empty(&mdev->free_ee)) {
 		for (;;) {
@@ -282,7 +282,7 @@
 			}
 			// finish wait is inside, so that we are TASK_RUNNING 
 			// in _drbd_process_ee (which might sleep by itself.)
-			_drbd_process_ee(mdev,&mdev->done_ee);
+			_drbd_process_ee(mdev,1);
 		}
 		finish_wait(&mdev->ee_wait, &wait); 
 	}
@@ -353,9 +353,10 @@
    from this function. Note, this function is called from all three
    threads (receiver, worker and asender). To ensure this I only allow
    one thread at a time in the body of the function */
-STATIC int _drbd_process_ee(drbd_dev *mdev,struct list_head *head)
+STATIC int _drbd_process_ee(drbd_dev *mdev, int be_sleepy)
 {
 	struct Tl_epoch_entry *e;
+	struct list_head *head = &mdev->done_ee;
 	struct list_head *le;
 	int ok=1;
 	int got_sig;
@@ -365,6 +366,10 @@
 	reclaim_net_ee(mdev);
 
 	if( test_and_set_bit(PROCESS_EE_RUNNING,&mdev->flags) ) {
+		if(!be_sleepy) {
+			clear_bit(PROCESS_EE_RUNNING,&mdev->flags);
+			return 3;
+		}
 		spin_unlock_irq(&mdev->ee_lock);
 		got_sig = wait_event_interruptible(mdev->ee_wait,
 		       test_and_set_bit(PROCESS_EE_RUNNING,&mdev->flags) == 0);
@@ -388,11 +393,11 @@
 	return ok;
 }
 
-STATIC int drbd_process_ee(drbd_dev *mdev,struct list_head *head)
+STATIC int drbd_process_ee(drbd_dev *mdev, int be_sleepy)
 {
 	int rv;
 	spin_lock_irq(&mdev->ee_lock);
-	rv=_drbd_process_ee(mdev,head);
+	rv=_drbd_process_ee(mdev,be_sleepy);
 	spin_unlock_irq(&mdev->ee_lock);
 	return rv;
 }
@@ -638,7 +643,7 @@
 				for (retry=1; retry <= 10; retry++) {
 					// give the other side time to call
 					// bind() & listen()
-					current->state = TASK_INTERRUPTIBLE;
+					set_current_state(TASK_INTERRUPTIBLE);
 					schedule_timeout(HZ / 10);
 					msock=drbd_try_connect(mdev);
 					if(msock) goto connected;
@@ -754,7 +759,7 @@
 	drbd_wait_ee(mdev,&mdev->active_ee);
 
 	spin_lock_irq(&mdev->ee_lock);
-	rv = _drbd_process_ee(mdev,&mdev->done_ee);
+	rv = _drbd_process_ee(mdev,1);
 
 	epoch_size=mdev->epoch_size;
 	mdev->epoch_size=0;
@@ -851,6 +856,7 @@
 			 */
 		}
 		ok = drbd_send_ack(mdev,WriteAck,e);
+		__set_bit(SYNC_STARTED,&mdev->flags);
 	} else {
 		ok = drbd_send_ack(mdev,NegAck,e);
 		ok&= drbd_io_error(mdev);
@@ -971,7 +977,7 @@
 	if(mdev->conf.wire_protocol == DRBD_PROT_C) {
 		if(likely(drbd_bio_uptodate(&e->private_bio))) {
 			ok=drbd_send_ack(mdev,WriteAck,e);
-			if (ok && mdev->rs_total)
+			if (ok && test_bit(SYNC_STARTED,&mdev->flags) )
 				drbd_set_in_sync(mdev,sector,drbd_ee_get_size(e));
 		} else {
 			ok = drbd_send_ack(mdev,NegAck,e);
@@ -1682,6 +1688,7 @@
 			break;
 		} else {
 			spin_unlock(&mdev->send_task_lock);
+			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ / 10);
 		}
 	}
@@ -1928,6 +1935,7 @@
 
 		if( is_syncer_blk(mdev,p->block_id)) {
 			drbd_set_in_sync(mdev,sector,blksize);
+			__set_bit(SYNC_STARTED,&mdev->flags);
 		} else {
 			req=(drbd_request_t*)(long)p->block_id;
 
@@ -1935,7 +1943,7 @@
 
 			drbd_end_req(req, RQ_DRBD_SENT, 1, sector);
 
-			if (mdev->rs_total &&
+			if (test_bit(SYNC_STARTED,&mdev->flags) &&
 			    mdev->conf.wire_protocol == DRBD_PROT_C)
 				drbd_set_in_sync(mdev,sector,blksize);
 		}
@@ -2071,7 +2079,7 @@
 		 */
 		set_bit(SIGNAL_ASENDER, &mdev->flags);
 
-		if (!drbd_process_ee(mdev,&mdev->done_ee)) goto err;
+		if (!drbd_process_ee(mdev,0)) goto err;
 
 		rv = drbd_recv_short(mdev,buf,expect-received);
 		clear_bit(SIGNAL_ASENDER, &mdev->flags);

Modified: trunk/user/drbdadm_main.c
===================================================================
--- trunk/user/drbdadm_main.c	2004-11-29 12:45:29 UTC (rev 1663)
+++ trunk/user/drbdadm_main.c	2004-11-29 15:32:41 UTC (rev 1664)
@@ -398,7 +398,7 @@
     alarm_raised=0;
     switch(flags) {
     case SLEEPS_SHORT:     timeout = 5; break;
-    case SLEEPS_LONG:      timeout = 60; break;
+    case SLEEPS_LONG:      timeout = 120; break;
     case SLEEPS_VERY_LONG: timeout = 600; break;
     default:
 	fprintf(stderr,"logic bug in %s:%d\n",__FILE__,__LINE__);