[Drbd-dev] [CASE-41] After re-connected, despite of OOS remaining primary does not start re-synchronization or continues AHEAD mode.

Jaeheon Kim jhkim at mantech.co.kr
Mon Apr 18 16:05:38 CEST 2016


Dear Phil,

We wrote some code to work around the problem of OOS remaining.
If the replication state is L_AHEAD when conn_try_disconnect() is called,
we have to wait until the state changes to SyncSource and the meta-data
has been flushed by drbd_md_sync().

What do you think about our temporary workaround?

Thanks.



PS. source code:

1. drbd/drbd_int.h ========================================

    struct {/* sender todo per peer_device */
        bool was_ahead;
    } todo;
#ifdef _WIN32_OOS_TEST
    wait_queue_head_t resync_start_wait;
    int resync_start_flag;
#endif
};

struct submit_worker {



2. drbd/drbd_main.c ======================================

    peer_device->resync_finished_pdsk = D_UNKNOWN;

    INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);

#ifdef _WIN32_OOS_TEST
    init_waitqueue_head(&peer_device->resync_start_wait);
    peer_device->resync_start_flag = 0;
#endif
    return peer_device;
}


3. drbd/drbd_nl.c ======================================

static enum drbd_state_rv conn_try_disconnect(struct drbd_connection
*connection, bool force)
{
struct drbd_resource *resource = connection->resource;
enum drbd_state_rv rv;

#ifdef _WIN32_OOS_TEST
// First, Check whether reresync-action occurred or not after AHEAD
{
struct drbd_peer_device *peer_device = conn_peer_device(connection, 0);

DbgPrint("_WIN32_OOS_TEST:conn_try_disconnect: Please check whether
resync-action occurred or not.");
if (peer_device->repl_state[NOW] == L_AHEAD) // check AHEAD status only
{
if (peer_device->resync_start_flag > 0)
{
DbgPrint("_WIN32_OOS_TEST: resync action started already. flag=%d\n",
peer_device->resync_start_flag);
}
else
{
DbgPrint("_WIN32_OOS_TEST: Wait for resync start...\n");// It will update
status and meta data on each node, maybe.

long t = 0; // if 5 seconds exceeding, drbdadm disconnect CLI will be
timeout.
wait_event_timeout(t, peer_device->resync_start_wait,
(peer_device->resync_start_flag > 0), 3000);

if (t == 0)
{
DbgPrint("_WIN32_OOS_TEST: timeout! t=%d. No resync_start_wait event. So,
You can see AHEAD pending problem\n", t);
}
else
{
DbgPrint("_WIN32_OOS_TEST: OK! got it! time=%d\n", t);
}
}
peer_device->resync_start_flag = 0; // reset
}
else
{
DbgPrint("_WIN32_OOS_TEST: repl mode=(%s).",
drbd_repl_str(peer_device->repl_state[NOW]));
}
}
#endif

    repeat:



4. drbd/drbd_sender.c ======================================

Part 1) ---------------------------------------------------------------------------
            drbd_khelper(NULL, connection, "unfence-peer");
    }

#ifdef _WIN32_OOS_TEST
    if (peer_device->resync_start_flag > 0)
    {
        DbgPrint("_WIN32_OOS_TEST:drbd_resync_finished:
resync_start_flag=%d. reset!", peer_device->resync_start_flag);
        peer_device->resync_start_flag = 0;
    }
#endif

    return 1;
}

Part 2) -----------------------------------------------------------------------------

    spin_lock_irq(&device->resource->req_lock);
    repl_state = peer_device->repl_state[NOW];
    spin_unlock_irq(&device->resource->req_lock);

#ifdef _WIN32_OOS_TEST
    int first_state = repl_state;
    DbgPrint("_WIN32_OOS_TEST:(%s) drbd_start_resync: repl(%s) side(%s)",
current->comm, drbd_repl_str(repl_state), drbd_repl_str(side));
#endif

    if (repl_state < L_ESTABLISHED) {
        /* Connection closed meanwhile. */
        return;


Part 3) ----------------------------------------------------------------------------

            mod_timer(&peer_device->resync_timer, jiffies);

        drbd_md_sync(device);

#ifdef _WIN32_OOS_TEST
        if (first_state == L_AHEAD)
        {
            peer_device->resync_start_flag++;
            DbgPrint("_WIN32_OOS_TEST: Resync start at AHEAD status.
flag=%d. drbdadm disconnect CLI maybe wake up if exists",
peer_device->resync_start_flag);
            wake_up(&peer_device->resync_start_wait);
        }
        else
        {
            DbgPrint("_WIN32_OOS_TEST: flag=%d\n",
peer_device->resync_start_flag);
        }
#endif

    }
    put_ldev(device);
    out:


------------------------------------------------------------------------------
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.linbit.com/pipermail/drbd-dev/attachments/20160418/0b7ff24a/attachment.htm>


More information about the drbd-dev mailing list