[DRBD-user] drbd strange error

Lars Ellenberg Lars.Ellenberg at linbit.com
Mon Aug 2 18:18:55 CEST 2004


/ 2004-08-02 17:47:10 +0200
\ Horvath Szabolcs:
> Hello!
> 
> Thanks for the quick reply.
> 
> Just now I can't log into the cluster, but I remember when
> /etc/init.d/drbd start failed,
> in /proc/drbd I saw lines with "unconfigured"
> (no primary/secondary/unknown words), and
> ps aux | grep drbd didn't show drbd threads.
> 
> > from reading the code there again,
> > we return BUSY for SET_DISK when
> >   - we already have a disk
> >   - we are currently diskless Primary
> >   - we are currently diskless, but already connected
> >   - we have an open_cnt > 1 on the device
> 
> How can I check this?
> 
> Insert a line with "fuser -mv /dev/nb0" before this? ->
> [...]
>             if $DRBDADM adjust $I; then

wait: maybe drbdadm itself is the problem here...
I just noticed that in certain cases we do not properly drain, fclose, and
waitpid for the "drbdsetup show" child process... yes, that is probably it.

patch to drbdadm attached.

	Lars Ellenberg

-- 
please use the "List-Reply" function of your email client.
-------------- next part --------------
Index: user/drbdadm_adjust.c
===================================================================
--- user/drbdadm_adjust.c	(revision 1480)
+++ user/drbdadm_adjust.c	(working copy)
@@ -202,6 +202,13 @@
 }
 
 
+/* NOTE
+ * return before waitpid is a BUG. "goto out;" instead!
+ *
+ * calling drbdsetup again before waitpid("drbdsetup show") has a race with
+ * the next ioctl failing because of the zombie still holding an open_cnt on
+ * the drbd device. so don't do that.
+ */
 int adm_adjust(struct d_resource* res,char* unused)
 {
   char* argv[20];
@@ -218,6 +225,7 @@
 
   struct stat sb;
   int major, minor;
+  int err = 10;
 
   argv[argc++]=drbdsetup;
   argv[argc++]=res->me->device;
@@ -236,11 +244,11 @@
 
   if (stat(res->me->disk, &sb)) {
     PERROR("stat '%s' failed:", res->me->device);
-    return 10;
+    goto out;
   }
   if (!S_ISBLK(sb.st_mode)) {
     fprintf(stderr, "'%s' not a block device!\n", res->me->disk);
-    return 10;
+    goto out;
   }
   rv=m_fscanf(in,"Lower device: %d:%d (%*[^)])\n",&major,&minor);
   if( (rv!=2) || (((major<<8)|minor) != (int)sb.st_rdev)) do_attach=1;
@@ -248,11 +256,11 @@
   if (strcmp("internal", res->me->meta_disk)) {
     if (stat(res->me->meta_disk, &sb)) {
       PERROR("stat '%s' failed:", res->me->meta_disk);
-      return 10;
+      goto out;
     }
     if (!S_ISBLK(sb.st_mode)) {
       fprintf(stderr, "'%s' not a block device!\n", res->me->disk);
-      return 10;
+      goto out;
     }
   } else {
     sb.st_rdev = 0;
@@ -265,7 +273,7 @@
 	do_attach = 1;
     } else {
       fprintf(stderr, "parse error, '%s' read, 'internal' expected\n", str1);
-      return 10;
+      goto out;
     }
   }
   if (rv == 2) {
@@ -278,7 +286,7 @@
 	do_attach = 1;
     } else {
       fprintf(stderr, "parse error\n");
-      return 10;
+      goto out;
     }
   }
 
@@ -345,10 +353,15 @@
     do_syncer |= complete(res->sync_options);
   } else do_syncer=1;
 
+ do_up:
+  err = 0;
+ out:
+  // drain, close, wait for drbdsetup to "officially die".
+  { static char drain[1024]; while (fgets(drain,1024,in)); }
   fclose(in);
   waitpid(pid,0,0);
+  if (err) return err;
 
- do_up:
   if(do_attach) {
     if( (rv=adm_attach(res,0)) ) return rv;
     do_resize=0;


More information about the drbd-user mailing list