[DRBD-cvs] svn commit by lars - r2284 - branches/drbd-0.7/drbd -
fixed (?) the "syncer stalled" after reconnect.
drbd-cvs at lists.linbit.com
drbd-cvs at lists.linbit.com
Tue Jul 18 15:34:08 CEST 2006
the res
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Author: lars
Date: 2006-07-18 15:34:07 +0200 (Tue, 18 Jul 2006)
New Revision: 2284
Modified:
branches/drbd-0.7/drbd/drbd_main.c
branches/drbd-0.7/drbd/drbd_receiver.c
branches/drbd-0.7/drbd/drbd_worker.c
Log:
fixed (?) the "syncer stalled" after reconnect.
the resync_timer was not scheduled because of some
paranoia checks in drbd_start_resync.
Those checks could trigger because of a race between
the various places (drbd_disconnect, resync_timer_fn
etc.) during resync cleanup on connection loss.
This patch moves the racy cleanup code from
drbd_disconnect into drbd_worker.
Also drbd_start_resync schedules the resync_timer
anyways (even when those paranoia checks should trigger
for some reason).
The other parts of this patch is "just cosmetics".
Modified: branches/drbd-0.7/drbd/drbd_main.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_main.c 2006-07-18 12:52:26 UTC (rev 2283)
+++ branches/drbd-0.7/drbd/drbd_main.c 2006-07-18 13:34:07 UTC (rev 2284)
@@ -469,10 +469,6 @@
smp_mb();
wake_up(&mdev->cstate_wait);
- /* THINK.
- * was:
- * if ( ( os==SyncSource || os==SyncTarget ) && ns <= Connected ) {
- */
if ( ( os >= SyncSource ) && ns <= Connected ) {
clear_bit(SYNC_STARTED,&mdev->flags);
set_bit(STOP_SYNC_TIMER,&mdev->flags);
Modified: branches/drbd-0.7/drbd/drbd_receiver.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_receiver.c 2006-07-18 12:52:26 UTC (rev 2283)
+++ branches/drbd-0.7/drbd/drbd_receiver.c 2006-07-18 13:34:07 UTC (rev 2284)
@@ -1836,30 +1836,6 @@
D_ASSERT(mdev->cstate < Connected);
mdev->o_state = Unknown;
- /* in case we have been syncing, and then we drop the connection,
- * we need to "w_resume_next_sg", which we try to achieve by
- * setting the STOP_SYNC_TIMER bit, and schedulung the timer for
- * immediate execution.
- * unfortunately we cannot be sure that the timer already triggered.
- *
- * so we del_timer_sync here, and check that bit.
- * if it is still set, we queue w_resume_next_sg anyways,
- * just to be sure.
- */
-
- del_timer_sync(&mdev->resync_timer);
- spin_lock_irq(&mdev->req_lock);
- if (test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags)) {
- mdev->resync_work.cb = w_resume_next_sg;
- if (list_empty(&mdev->resync_work.list))
- _drbd_queue_work(&mdev->data.work,&mdev->resync_work);
- // else: already queued, we only need to release the lock.
- } else {
- D_ASSERT(mdev->resync_work.cb == w_resync_inactive);
- }
- spin_unlock_irq(&mdev->req_lock);
-
-
drbd_thread_stop_nowait(&mdev->worker);
drbd_thread_stop(&mdev->asender);
Modified: branches/drbd-0.7/drbd/drbd_worker.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_worker.c 2006-07-18 12:52:26 UTC (rev 2283)
+++ branches/drbd-0.7/drbd/drbd_worker.c 2006-07-18 13:34:07 UTC (rev 2284)
@@ -690,9 +690,10 @@
_set_cstate(mdev,ns);
if(mdev->cstate == SyncTarget) {
- ERR_IF(test_bit(STOP_SYNC_TIMER,&mdev->flags)) {
+ int stop_syncer_timer_was_set =
+ test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags);
+ ERR_IF(stop_syncer_timer_was_set) {
unsigned long rs_left = drbd_bm_total_weight(mdev);
- clear_bit(STOP_SYNC_TIMER,&mdev->flags);
if (rs_left == 0) {
INFO("rs_left==0 in _drbd_rs_resume\n");
} else {
@@ -778,6 +779,8 @@
{
drbd_dev *odev;
int i,ng=10000;
+ /* FIXME need to document that 10000 somewhere!
+ * maybe better: first iterate to find the max sync group number */
PARANOIA_BUG_ON(w != &mdev->resync_work);
@@ -785,6 +788,9 @@
for (i=0; i < minor_count; i++) {
odev = drbd_conf + i;
+ /* currently we should never reach this while being
+ * mdev->cstate == Sync*, but just incase: skip ourselves */
+ if (odev == mdev) continue;
if ( odev->sync_conf.group <= mdev->sync_conf.group
&& ( odev->cstate == SyncSource ||
odev->cstate == SyncTarget ) ) {
@@ -795,6 +801,7 @@
for (i=0; i < minor_count; i++) { // find next sync group
odev = drbd_conf + i;
+ if (odev == mdev) continue;
if ( odev->sync_conf.group > mdev->sync_conf.group
&& odev->sync_conf.group < ng &&
(odev->cstate==PausedSyncS || odev->cstate==PausedSyncT)){
@@ -804,6 +811,7 @@
for (i=0; i < minor_count; i++) { // resume all devices in next group
odev = drbd_conf + i;
+ if (odev == mdev) continue;
if ( odev->sync_conf.group == ng &&
(odev->cstate==PausedSyncS || odev->cstate==PausedSyncT)){
_drbd_rs_resume(odev);
@@ -811,8 +819,8 @@
}
out:
+ w->cb = w_resync_inactive;
drbd_global_unlock();
- w->cb = w_resync_inactive;
return 1;
}
@@ -884,7 +892,6 @@
else
ERR("resync_work.cb == %p ???, should be w_resync_inactive\n",
mdev->resync_work.cb);
- return;
}
if ( mdev->rs_total == 0 ) {
@@ -905,12 +912,16 @@
*/
drbd_global_unlock();
- if (mdev->cstate == SyncTarget) {
- D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags));
- mod_timer(&mdev->resync_timer,jiffies);
- } else if (mdev->cstate == PausedSyncT) {
- D_ASSERT(test_bit(STOP_SYNC_TIMER,&mdev->flags));
- clear_bit(STOP_SYNC_TIMER,&mdev->flags);
+ /* THINK: for consistency, should this be before the drbd_global_unlock() ? */
+ {
+ int stop_syncer_timer_was_set =
+ test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags);
+ if (mdev->cstate == SyncTarget) {
+ D_ASSERT(!stop_syncer_timer_was_set);
+ mod_timer(&mdev->resync_timer,jiffies);
+ } else if (mdev->cstate == PausedSyncT) {
+ D_ASSERT(stop_syncer_timer_was_set);
+ }
}
}
@@ -960,8 +971,40 @@
drbd_wait_ee(mdev,&mdev->read_ee);
+ /* When we terminate a resync process, either because it finished
+ * sucessfully, or because (like in this case here) we lost
+ * communications, we need to "w_resume_next_sg".
+ * We cannot use del_timer_sync from within _set_cstate, and since the
+ * resync timer may still be scheduled and would then trigger anyways,
+ * we set the STOP_SYNC_TIMER bit, and schedule the timer for immediate
+ * execution from within _set_cstate().
+ * The timer should then clear that bit and queue w_resume_next_sg.
+ *
+ * This is fine for the normal "resync finished" case.
+ *
+ * In this case (worker thread beeing stopped), there is a race:
+ * we cannot be sure that the timer already triggered.
+ *
+ * So we del_timer_sync here, and check that "STOP_SYNC_TIMER" bit.
+ * if it is still set, we queue w_resume_next_sg anyways,
+ * just to be sure.
+ */
+
+ del_timer_sync(&mdev->resync_timer);
+ /* possible paranoia check: the STOP_SYNC_TIMER bit should be set
+ * if and only if del_timer_sync returns true ... */
+
+ spin_lock_irq(&mdev->req_lock);
+ if (test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags)) {
+ mdev->resync_work.cb = w_resume_next_sg;
+ if (list_empty(&mdev->resync_work.list))
+ _drbd_queue_work(&mdev->data.work,&mdev->resync_work);
+ // else: already queued
+ } else {
+ mdev->resync_work.cb = w_resync_inactive;
+ }
+
i = 0;
- spin_lock_irq(&mdev->req_lock);
again:
list_splice_init(&mdev->data.work.q,&work_list);
spin_unlock_irq(&mdev->req_lock);
@@ -970,7 +1013,7 @@
w = list_entry(work_list.next, struct drbd_work,list);
list_del_init(&w->list);
w->cb(mdev,w,1);
- i++;
+ i++; /* dead debugging code */
}
spin_lock_irq(&mdev->req_lock);
More information about the drbd-cvs
mailing list