[Drbd-dev] Fix drbd adding volume dynamically problem

fei luo morphyluo at gmail.com
Mon Jun 21 09:06:54 CEST 2021


When I add a new volume to a DRBD resource (tested with DRBD 9.0.* and
9.1.*), there is a very high probability that the resource gets stuck in
the Consistent state and never leaves it on its own. I compared the logs of
successful and unsuccessful dynamic volume additions and found that the
failure occurs because the correct UUID has not yet been obtained when
may_return_to_up_to_date() is called. I made the patch below, and it fixes
the problem for me. However, because the DRBD state handling is very
complicated, I am not sure whether this change introduces other problems.
I would appreciate any guidance, and I hope this goes some way toward
actually fixing the problem.



---

 drbd-9.1.2/drbd/drbd_state.c | 27 +++++++++++++++++++--------

 1 file changed, 19 insertions(+), 8 deletions(-)



diff --git a/drbd-9.1.2/drbd/drbd_state.c b/drbd-9.1.2/drbd/drbd_state.c

index c45284b..0cc75df 100644

--- a/drbd-9.1.2/drbd/drbd_state.c

+++ b/drbd-9.1.2/drbd/drbd_state.c

@@ -83,7 +83,8 @@ static void sanitize_state(struct drbd_resource
*resource);

    different UUID. This function should be used if the device was
D_UP_TO_DATE

    before.

  */

-static bool may_return_to_up_to_date(struct drbd_device *device, enum
which_state which)

+static bool may_return_to_up_to_date(struct drbd_device *device, enum
which_state which,

+ bool from_md)

 {

  struct drbd_peer_device *peer_device;

  bool rv = true;

@@ -93,6 +94,15 @@ static bool may_return_to_up_to_date(struct drbd_device
*device, enum which_stat

  if (peer_device->disk_state[which] == D_DISKLESS &&

      peer_device->connection->peer_role[which] == R_PRIMARY &&

      peer_device->current_uuid != drbd_current_uuid(device)) {

+ /* here ignore peer_device->current_uuid == 0, hope reverify when

+  * peer_device->current_uuid is set

+  * */

+ if (peer_device->current_uuid == 0 && from_md) {

+ continue;

+ }

  rv = false;

  break;

  }

@@ -108,12 +118,13 @@ static bool may_return_to_up_to_date(struct
drbd_device *device, enum which_stat

  * When fencing is enabled, it may only transition from D_CONSISTENT to
D_UP_TO_DATE

  * when ether all peers are connected, or outdated.

  */

-static bool may_be_up_to_date(struct drbd_device *device, enum which_state
which) __must_hold(local)

+static bool may_be_up_to_date(struct drbd_device *device, enum which_state
which,

+ bool from_md) __must_hold(local)

 {

  bool all_peers_outdated = true;

  int node_id;



- if (!may_return_to_up_to_date(device, which))

+ if (!may_return_to_up_to_date(device, which, from_md))

  return false;



  rcu_read_lock();

@@ -216,7 +227,7 @@ enum drbd_disk_state disk_state_from_md(struct
drbd_device *device) __must_hold(

  else if (!drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))

  disk_state = D_OUTDATED;

  else

- disk_state = may_be_up_to_date(device, NOW) ? D_UP_TO_DATE : D_CONSISTENT;

+ disk_state = may_be_up_to_date(device, NOW, true) ? D_UP_TO_DATE :
D_CONSISTENT;



  return disk_state;

 }

@@ -2103,12 +2114,12 @@ static void sanitize_state(struct drbd_resource
*resource)

  /* The attempted resync made us D_OUTDATED, roll that back in case */

  if (repl_state[OLD] == L_WF_BITMAP_T && repl_state[NEW] == L_OFF &&

      disk_state[NEW] == D_OUTDATED &&

-     stable_up_to_date_neighbor(device) && may_be_up_to_date(device, NEW))

+ stable_up_to_date_neighbor(device) && may_be_up_to_date(device, NEW,
false))

  disk_state[NEW] = D_UP_TO_DATE;



  /* clause intentional here, the D_CONSISTENT form above might trigger
this */

  if (repl_state[OLD] < L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED &&

-     disk_state[NEW] == D_CONSISTENT && may_be_up_to_date(device, NEW))

+ disk_state[NEW] == D_CONSISTENT && may_be_up_to_date(device, NEW, false))

  disk_state[NEW] = D_UP_TO_DATE;



  /* Follow a neighbor that goes from D_CONSISTENT TO D_UP_TO_DATE */

@@ -3699,7 +3710,7 @@ static int w_after_state_change(struct drbd_work *w,
int unused)

  send_new_state_to_all_peer_devices(state_change, n_device);



  if (disk_state[OLD] == D_UP_TO_DATE && disk_state[NEW] == D_CONSISTENT &&

-     may_return_to_up_to_date(device, NOW))

+ may_return_to_up_to_date(device, NOW, false))

  try_become_up_to_date = true;



  if (test_bit(TRY_TO_GET_RESYNC, &device->flags)) {

@@ -4875,7 +4886,7 @@ static bool do_change_from_consistent(struct
change_context *context, enum chang



  idr_for_each_entry(&resource->devices, device, vnr) {

  if (device->disk_state[NOW] == D_CONSISTENT &&

-     may_return_to_up_to_date(device, NOW))

+ may_return_to_up_to_date(device, NOW, false))

  __change_disk_state(device, D_UP_TO_DATE);

  }

  }

-- 

1.8.3.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.linbit.com/pipermail/drbd-dev/attachments/20210621/281e6626/attachment-0001.htm>


More information about the drbd-dev mailing list