[DRBD-cvs] drbd by phil; [Patch by Lars Ellenberg] Integrated the...

drbd-user@lists.linbit.com drbd-user@lists.linbit.com
Tue, 15 Jun 2004 12:07:38 +0200 (CEST)


DRBD CVS committal

Author  : phil
Module  : drbd

Dir     : drbd/drbd


Modified Files:
      Tag: rel-0_7-branch
	Makefile Makefile-2.6 drbd_actlog.c drbd_compat_wrappers.h 
	drbd_dsender.c drbd_fs.c drbd_int.h drbd_main.c drbd_proc.c 
	drbd_receiver.c drbd_req-2.4.c 


Log Message:
[Patch by Lars Ellenberg]
Integrated the new drbd_bitmap.c into all the other files...

===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/Makefile,v
retrieving revision 1.14.2.26
retrieving revision 1.14.2.27
diff -u -3 -r1.14.2.26 -r1.14.2.27
--- Makefile	20 Apr 2004 11:36:53 -0000	1.14.2.26
+++ Makefile	15 Jun 2004 10:07:32 -0000	1.14.2.27
@@ -101,7 +101,7 @@
   ifneq ($(KDIR_Makefile_PATCHLEVEL),6)
     # obsolete in 2.6 ...
     dep:
-	@echo "tying to make dep ..."
+	@echo "trying to make dep ..."
 	@$(MAKE) -s -C $(KDIR) SUBDIRS=$(DRBDSRC) $(ARCH_UM) dep || \
          echo "I'll ignore this error, but this can cause inconsistencies!"
 
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/Attic/Makefile-2.6,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -u -3 -r1.1.2.2 -r1.1.2.3
--- Makefile-2.6	8 Jun 2004 12:04:35 -0000	1.1.2.2
+++ Makefile-2.6	15 Jun 2004 10:07:32 -0000	1.1.2.3
@@ -1,4 +1,4 @@
-drbd-objs  := drbd_fs.o drbd_proc.o drbd_dsender.o \
+drbd-objs  := drbd_bitmap.o drbd_fs.o drbd_proc.o drbd_dsender.o \
 	      drbd_receiver.o drbd_req-2.4.o drbd_actlog.o \
 	      lru_cache.o drbd_main.o
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/Attic/drbd_actlog.c,v
retrieving revision 1.1.2.104
retrieving revision 1.1.2.105
diff -u -3 -r1.1.2.104 -r1.1.2.105
--- drbd_actlog.c	7 Jun 2004 12:58:38 -0000	1.1.2.104
+++ drbd_actlog.c	15 Jun 2004 10:07:32 -0000	1.1.2.105
@@ -38,6 +38,7 @@
 {
 	struct buffer_head bh;
 	struct completion event;
+	int ok = 0;
 
 	if (!mdev->md_bdev) {
 		if (DRBD_ratelimit(5*HZ,5)) {
@@ -66,7 +67,13 @@
 	run_task_queue(&tq_disk);
 	wait_for_completion(&event);
 
-	return test_bit(BH_Uptodate, &bh.b_state);
+	ok = test_bit(BH_Uptodate, &bh.b_state);
+	if (unlikely(!ok)) {
+		ERR("drbd_md_sync_page_io(,%lu,%d) failed!\n",
+				(unsigned long)sector,rw);
+	}
+
+	return ok;
 }
 #else
 int drbd_md_sync_page_io(drbd_dev *mdev, sector_t sector, int rw)
@@ -74,6 +81,8 @@
 	struct bio bio;
 	struct bio_vec vec;
 	struct completion event;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	int ok = 0;
 
 	if (!mdev->md_bdev) {
 		if (DRBD_ratelimit(5*HZ,5)) {
@@ -102,11 +111,18 @@
 	bio.bi_private = &event;
 	bio.bi_end_io = drbd_md_io_complete;
 
-	/*
+#if DUMP_MD >= 3
 	INFO("%s [%d]:%s(,%ld,%s)\n",
 	     current->comm, current->pid, __func__,
 	     sector, rw ? "WRITE" : "READ");
-	*/
+#endif
+	if (sector < drbd_md_ss(mdev)  ||
+	    sector > drbd_md_ss(mdev)+MD_BM_OFFSET+BM_SECT_TO_EXT(capacity)) {
+		ALERT("%s [%d]:%s(,%ld,%s) out of range md access!\n",
+		     current->comm, current->pid, __func__,
+		     sector, rw ? "WRITE" : "READ");
+	}
+
 #ifdef BIO_RW_SYNC
 	submit_bio(rw | (1 << BIO_RW_SYNC), &bio);
 #else
@@ -115,7 +131,13 @@
 #endif
 	wait_for_completion(&event);
 
-	return test_bit(BIO_UPTODATE, &bio.bi_flags);
+	ok = test_bit(BIO_UPTODATE, &bio.bi_flags);
+	if (unlikely(!ok)) {
+		ERR("drbd_md_sync_page_io(,%lu,%s) failed!\n",
+				(unsigned long)sector,rw ? "WRITE" : "READ");
+	}
+
+	return ok;
 }
 #endif
 
@@ -140,10 +162,6 @@
 
 STATIC void drbd_al_write_transaction(struct Drbd_Conf *,struct lc_element *,
 				      unsigned int );
-STATIC void drbd_update_on_disk_bm(struct Drbd_Conf *,unsigned int);
-
-#define SM (BM_EXTENT_SIZE / AL_EXTENT_SIZE)
-
 static inline
 struct lc_element* _al_get(struct Drbd_Conf *mdev, unsigned int enr)
 {
@@ -152,7 +170,7 @@
 	unsigned long     al_flags=0;
 
 	spin_lock_irq(&mdev->al_lock);
-	bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr/SM);
+	bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr/AL_EXT_PER_BM_SECT);
 	if (unlikely(bm_ext!=NULL)) {
 		if(test_bit(BME_NO_WRITES,&bm_ext->flags)) {
 			spin_unlock_irq(&mdev->al_lock);
@@ -191,7 +209,7 @@
 		evicted = al_ext->lc_number;
 
 		if(mdev->cstate < Connected && evicted != LC_FREE ) {
-			drbd_update_on_disk_bm(mdev,evicted);
+			drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT );
 		}
 		drbd_al_write_transaction(mdev,al_ext,enr);
 		mdev->al_writ_cnt++;
@@ -412,8 +430,10 @@
 }
 
 /**
- * drbd_al_to_on_disk_bm: Writes the areas of the bitmap which are covered by
- * the AL.
+ * drbd_al_to_on_disk_bm:
+ * Writes the areas of the bitmap which are covered by the AL.
+ * called when we detach (unconfigure) local storage,
+ * or when we go from Primary to Secondary state.
  */
 void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev)
 {
@@ -429,7 +449,10 @@
 	for(i=0;i<mdev->act_log->nr_elements;i++) {
 		enr = lc_entry(mdev->act_log,i)->lc_number;
 		if(enr == LC_FREE) continue;
-		drbd_update_on_disk_bm(mdev,enr);
+		/* TODO encapsulate and optimize within drbd_bitmap
+		 * currently, if we have al-extents 16..19 active,
+		 * sector 4 will be written four times! */
+		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT );
 	}
 
 	lc_unlock(mdev->act_log);
@@ -452,34 +475,13 @@
 	for(i=0;i<mdev->act_log->nr_elements;i++) {
 		enr = lc_entry(mdev->act_log,i)->lc_number;
 		if(enr == LC_FREE) continue;
-		add += bm_set_bit( mdev, enr << (AL_EXTENT_SIZE_B-9),
-				   AL_EXTENT_SIZE, SS_OUT_OF_SYNC );
+		add += drbd_bm_e_set_all(mdev, enr);
 	}
 
 	lc_unlock(mdev->act_log);
 	wake_up(&mdev->al_wait);
 
-	INFO("Marked additional %lu KB as out-of-sync based on AL.\n",(add+1)/2);
-
-	mdev->rs_total += add;
-}
-
-/**
- * drbd_write_bm: Writes the whole bitmap to its on disk location.
- */
-void drbd_write_bm(struct Drbd_Conf *mdev)
-{
-	unsigned int exts,i;
-
-	if( !inc_local_md_only(mdev) ) return;
-
-	exts = div_ceil(drbd_get_capacity(mdev->this_bdev),
-			BM_EXTENT_SIZE >> 9 );
-
-	for(i=0;i<exts;i++) {
-		drbd_update_on_disk_bm(mdev,i*EXTENTS_PER_SECTOR);
-	}
-	dec_local(mdev);
+	INFO("Marked additional %lu KB as out-of-sync based on AL.\n",add >> 1);
 }
 
 static inline int _try_lc_del(struct Drbd_Conf *mdev,struct lc_element *al_ext)
@@ -626,15 +628,18 @@
 		return 1;
 	}
 
-	drbd_update_on_disk_bm(mdev,udw->enr);
+	drbd_bm_write_sect(mdev, udw->enr );
 	dec_local(mdev);
 
 	kfree(udw);
 
-	if(mdev->rs_left == 0 && 
+	/* FIXME what about PausedSync{S,T} ? */
+	if(drbd_bm_total_weight(mdev) == 0 &&
 	   ( mdev->cstate == SyncSource || mdev->cstate == SyncTarget ) ) {
 		D_ASSERT( mdev->resync_work.cb == w_resync_inactive );
+		drbd_bm_lock(mdev);
 		drbd_resync_finished(mdev);
+		drbd_bm_unlock(mdev);
 	}
 
 	return 1;
@@ -642,7 +647,10 @@
 
 
 /* ATTENTION. The AL's extents are 4MB each, while the extents in the  *
- * resync LRU-cache are 16MB each.                                     */
+ * resync LRU-cache are 16MB each.                                     *
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
 STATIC void drbd_try_clear_on_disk_bm(struct Drbd_Conf *mdev,sector_t sector,
 				      int cleared)
 {
@@ -651,26 +659,35 @@
 	struct update_odbm_work * udw;
 
 	unsigned int enr;
-	unsigned long flags;
+
+	MUST_HOLD(&mdev->al_lock);
 
 	// I simply assume that a sector/size pair never crosses
 	// a 16 MB extent border. (Currently this is true...)
-	enr = (sector >> (BM_EXTENT_SIZE_B-9));
+	enr = BM_SECT_TO_EXT(sector);
 
-	spin_lock_irqsave(&mdev->al_lock,flags);
 	ext = (struct bm_extent *) lc_get(mdev->resync,enr);
 	if (ext) {
 		if( ext->lce.lc_number == enr) {
 			ext->rs_left -= cleared;
-			D_ASSERT(ext->rs_left >= 0);
+			if (ext->rs_left < 0) {
+				ERR("BAD! sector=%lu enr=%u rs_left=%d cleared=%d\n",
+				     (unsigned long)sector,
+				     ext->lce.lc_number, ext->rs_left, cleared);
+				// FIXME brrrgs. should never happen!
+				_set_cstate(mdev,StandAlone);
+				drbd_thread_stop_nowait(&mdev->receiver);
+				return;
+			}
 		} else {
 			//WARN("Recounting sectors in %d (resync LRU too small?)\n", enr);
 			// This element should be in the cache
 			// since drbd_rs_begin_io() pulled it already in.
-			ext->rs_left = bm_count_sectors(mdev->mbds_id,enr);
+			ext->rs_left = drbd_bm_e_weight(mdev,enr);
 			lc_changed(mdev->resync,&ext->lce);
 		}
 		lc_put(mdev->resync,&ext->lce);
+		// no race, we are within the al_lock!
 	} else {
 		ERR("lc_get() failed! locked=%d/%d flags=%lu\n",
 		    atomic_read(&mdev->resync_locked), 
@@ -686,47 +703,135 @@
 				WARN("Could not kmalloc an udw\n");
 				break;
 			}
-			udw->enr = enr*SM;
+			udw->enr = ext->lce.lc_number;
 			udw->w.cb = w_update_odbm;
 			drbd_queue_work_front(mdev,&mdev->data.work,&udw->w);
 			lc_del(mdev->resync,&ext->lce);
 		}
 	}
-
-	spin_unlock_irqrestore(&mdev->al_lock,flags);
-	// just wake_up unconditional now. [various lc_chaged(), lc_put() here]
-	wake_up(&mdev->al_wait);
 }
 
-void drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int blk_size)
+/* clear the bit corresponding to the piece of storage in question:
+ * size bytes of data starting from sector.  Only clear the bits of the affected
+ * one or more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on SyncTarget and receiver on SyncSource.
+ *
+ */
+void drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size)
 {
-	/* Is called by drbd_dio_end possibly from IRQ context, but
-	   from other places in non IRQ */
-	unsigned long flags=0;
-	int cleared;
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr,ebnr,lbnr,bnr;
+	unsigned long count = 0;
+	sector_t esector, nr_sectors;
+
+	if (mdev->cstate < Connected || test_bit(DISKLESS,&mdev->flags)) {
+		ERR("%s:%d: %s flags=0x%02lx\n", __FILE__ , __LINE__ ,
+				cstate_to_name(mdev->cstate), mdev->flags);
+	}
 
-	cleared = bm_set_bit(mdev, sector, blk_size, SS_IN_SYNC);
+	if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) {
+		ERR("drbd_set_in_sync: sector=%lu size=%d nonsense!\n",
+				(unsigned long)sector,size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size>>9) -1;
 
-	if( cleared == 0 ) return;
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
 
-	spin_lock_irqsave(&mdev->al_lock,flags);
-	mdev->rs_left -= cleared;
-	D_ASSERT((long)mdev->rs_left >= 0);
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
 
-	if(jiffies - mdev->rs_mark_time > HZ*10) {
-		mdev->rs_mark_time=jiffies;
-		mdev->rs_mark_left=mdev->rs_left;
+	/* we clear it (in sync).
+	 * round up start sector, round down end sector.  we make sure we only
+	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1)) {
+		return;
+	} else if (unlikely(esector == (nr_sectors-1))) {
+		ebnr = lbnr;
+	} else {
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
 	}
-	spin_unlock_irqrestore(&mdev->al_lock,flags);
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+#ifdef DUMP_EACH_PACKET
+	INFO("drbd_set_in_sync: sector=%lu size=%d sbnr=%lu ebnr=%lu\n",
+			(unsigned long)sector, size, sbnr, ebnr);
+#endif
+
+	if (sbnr > ebnr) return;
 
-	drbd_try_clear_on_disk_bm(mdev,sector,cleared);
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	for(bnr=sbnr; bnr <= ebnr; bnr++) {
+		if (drbd_bm_clear_bit(mdev,bnr)) count++;
+	}
+	if (count) {
+		// we need the lock for drbd_try_clear_on_disk_bm
+		spin_lock_irq(&mdev->al_lock);
+		if(jiffies - mdev->rs_mark_time > HZ*10) {
+			/* should be rolling marks, but we estimate only anyways. */
+			mdev->rs_mark_time = jiffies;
+			mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+		}
+		drbd_try_clear_on_disk_bm(mdev,sector,count);
+		spin_unlock_irq(&mdev->al_lock);
+		/* just wake_up unconditional now,
+		 * various lc_changed(), lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up(&mdev->al_wait);
+	}
 }
 
+/*
+ * this is intended to set one request worth of data out of sync.
+ * affects at least 1 bit, and at most 1+PAGE_SIZE/BM_BLOCK_SIZE bits.
+ *
+ * called by tl_clear and drbd_send_dblock (==drbd_make_request).
+ * so this can be _any_ process.
+ */
+void drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size)
+{
+	unsigned long sbnr,ebnr,lbnr,bnr;
+	sector_t esector, nr_sectors;
+
+	if (mdev->cstate >= Connected) {
+		ERR("%s:%d: %s flags=0x%02lx\n", __FILE__ , __LINE__ ,
+				cstate_to_name(mdev->cstate), mdev->flags);
+	}
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) {
+		ERR("sector: %lu, size: %d\n",(unsigned long)sector,size);
+		return;
+	}
+
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size>>9) -1;
+
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we set it out of sync,
+	 * we do not need to round anything here */
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	for(bnr=sbnr; bnr <= ebnr; bnr++) drbd_bm_set_bit(mdev,bnr);
+}
 
 static inline
 struct bm_extent* _bme_get(struct Drbd_Conf *mdev, unsigned int enr)
 {
 	struct bm_extent  *bm_ext;
+	int wakeup = 0;
 	unsigned long     rs_flags;
 
 	if(atomic_read(&mdev->resync_locked) > mdev->resync->nr_elements-3 ) {
@@ -738,15 +843,16 @@
 	bm_ext = (struct bm_extent*) lc_get(mdev->resync,enr);
 	if (bm_ext) {
 		if(bm_ext->lce.lc_number != enr) {
-			bm_ext->rs_left = bm_count_sectors(mdev->mbds_id,enr);
+			bm_ext->rs_left = drbd_bm_e_weight(mdev,enr);
 			lc_changed(mdev->resync,(struct lc_element*)bm_ext);
-			wake_up(&mdev->al_wait);
+			wakeup = 1;
 		}
 		if(bm_ext->lce.refcnt == 1) atomic_inc(&mdev->resync_locked);
 		set_bit(BME_NO_WRITES,&bm_ext->flags); // within the lock
 	}
 	rs_flags=mdev->resync->flags;
 	spin_unlock_irq(&mdev->al_lock);
+	if (wakeup) wake_up(&mdev->al_wait);
 
 	if(!bm_ext) {
 		if (rs_flags & LC_STARVING) {
@@ -792,20 +898,20 @@
  */
 int drbd_rs_begin_io(drbd_dev* mdev, sector_t sector)
 {
-	unsigned int enr = (sector >> (BM_EXTENT_SIZE_B-9));
+	unsigned int enr = BM_SECT_TO_EXT(sector);
 	struct bm_extent* bm_ext;
-	int i;
+	int i, sig;
 
-	if( wait_event_interruptible(mdev->al_wait, 
-				     (bm_ext = _bme_get(mdev,enr)) ) ) {
-		return 0;
-	}
+	sig = wait_event_interruptible( mdev->al_wait,
+			(bm_ext = _bme_get(mdev,enr)) );
+	if (sig) return 0;
 
 	if(test_bit(BME_LOCKED,&bm_ext->flags)) return 1;
 
-	for(i=0;i<SM;i++) {
-		if( wait_event_interruptible(mdev->al_wait, 
-					     !_is_in_al(mdev,enr*SM+i) ) ) {
+	for(i=0;i<AL_EXT_PER_BM_SECT;i++) {
+		sig = wait_event_interruptible( mdev->al_wait,
+				!_is_in_al(mdev,enr*AL_EXT_PER_BM_SECT+i) );
+		if (sig) {
 			if( lc_put(mdev->resync,&bm_ext->lce) == 0 ) {
 				clear_bit(BME_NO_WRITES,&bm_ext->flags);
 				atomic_dec(&mdev->resync_locked);
@@ -822,7 +928,7 @@
 
 void drbd_rs_complete_io(drbd_dev* mdev, sector_t sector)
 {
-	unsigned int enr = (sector >> (BM_EXTENT_SIZE_B-9));
+	unsigned int enr = BM_SECT_TO_EXT(sector);
 	struct bm_extent* bm_ext;
 	unsigned long flags;
 
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/Attic/drbd_compat_wrappers.h,v
retrieving revision 1.1.2.44
retrieving revision 1.1.2.45
diff -u -3 -r1.1.2.44 -r1.1.2.45
--- drbd_compat_wrappers.h	1 Jun 2004 07:00:57 -0000	1.1.2.44
+++ drbd_compat_wrappers.h	15 Jun 2004 10:07:32 -0000	1.1.2.45
@@ -533,6 +533,7 @@
  */
 static inline void drbd_generic_make_request(int rw, struct bio *bio)
 {
+	drbd_dev *mdev = drbd_conf -1; // for DRBD_ratelimit
 	bio->bi_rw = rw; //??
 
 	if (!bio->bi_bdev) {
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/Attic/drbd_dsender.c,v
retrieving revision 1.1.2.119
retrieving revision 1.1.2.120
diff -u -3 -r1.1.2.119 -r1.1.2.120
--- drbd_dsender.c	8 Jun 2004 12:46:33 -0000	1.1.2.119
+++ drbd_dsender.c	15 Jun 2004 10:07:32 -0000	1.1.2.120
@@ -438,9 +438,13 @@
 	spin_unlock_irqrestore(&mdev->req_lock,flags);
 }
 
+#define SLEEP_TIME (HZ/10)
+
 int w_make_resync_request(drbd_dev* mdev, struct drbd_work* w,int cancel)
 {
+	unsigned long bit;
 	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	int number,i,size;
 
 	PARANOIA_BUG_ON(w != &mdev->resync_work);
@@ -453,8 +457,6 @@
 
 	D_ASSERT(mdev->cstate == SyncTarget);
 
-#define SLEEP_TIME (HZ/10)
-
         number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
 
         if(number > 1000) number=1000;  // Remove later
@@ -467,21 +469,31 @@
 
 	next_sector:
 		size = BM_BLOCK_SIZE;
-		sector = bm_get_sector(mdev->mbds_id,&size);
+		bit  = drbd_bm_find_next(mdev);
 
-		if (sector == MBDS_DONE) {
+		if (bit == -1UL) {
+			/* FIXME either test_and_set some bit,
+			 * or make this the _only_ place that is allowed
+			 * to assign w_resync_inactive! */
 			mdev->resync_work.cb = w_resync_inactive;
 			return 1;
 		}
 
-		if(!drbd_rs_begin_io(mdev,sector)) return 0;
+		sector = BM_BIT_TO_SECT(bit);
 
-		if(unlikely(!bm_get_bit(mdev->mbds_id,sector,BM_BLOCK_SIZE))) {
+		if(!drbd_rs_begin_io(mdev,sector)) {
+			// we have been interrupted, probably connection lost!
+			D_ASSERT(signal_pending(current));
+			return 0;
+		}
+
+		if(unlikely( drbd_bm_test_bit(mdev,bit) == 0 )) {
 		      //INFO("Block got synced while in drbd_rs_begin_io()\n");
 			drbd_rs_complete_io(mdev,sector);
 			goto next_sector;
 		}
 
+		if (sector + (size>>9) > capacity) size = (capacity-sector)<<9;
 		inc_rs_pending(mdev);
 		if(!drbd_send_drequest(mdev,RSDataRequest,
 				       sector,size,ID_SYNCER)) {
@@ -492,13 +504,7 @@
 	}
 
  requeue:
-	if(bm_is_rs_done(mdev->mbds_id)) {
-		mdev->resync_work.cb = w_resync_inactive;
-		return 1;
-	}
-
-	mdev->resync_timer.expires = jiffies + SLEEP_TIME;
-	add_timer(&mdev->resync_timer);
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
 	return 1;
 }
 
@@ -507,10 +513,8 @@
 	unsigned long dt;
 	sector_t n;
 
-	D_ASSERT(mdev->rs_left == 0);
-
 	dt = (jiffies - mdev->rs_start) / HZ + 1;
-	n = mdev->rs_total>>1;
+	n = mdev->rs_total << (BM_BLOCK_SIZE_B-10);
 	sector_div(n,dt);
 	INFO("Resync done (total %lu sec; %lu K/sec)\n",
 	     dt,(unsigned long)n);
@@ -519,10 +523,11 @@
 		mdev->gen_cnt[Flags] |= MDF_Consistent;
 		drbd_md_write(mdev);
 	}
-	mdev->rs_total = 0;
 
 	// assert that all bit-map parts are cleared.
 	D_ASSERT(list_empty(&mdev->resync->lru));
+	D_ASSERT(drbd_bm_total_weight(mdev) == 0);
+	mdev->rs_total = 0;
 
 	set_cstate(mdev,Connected); // w_resume_next_sg() gets called here.
 	return 1;
@@ -657,7 +662,7 @@
 		ERR_IF(test_bit(STOP_SYNC_TIMER,&mdev->flags)) {
 			clear_bit(STOP_SYNC_TIMER,&mdev->flags);
 		}
-		D_ASSERT(mdev->rs_left > 0);
+		D_ASSERT(drbd_bm_total_weight(mdev) > 0);
 		mod_timer(&mdev->resync_timer,jiffies);
 	}
 }
@@ -798,37 +803,46 @@
 
 void drbd_start_resync(drbd_dev *mdev, Drbd_CState side)
 {
+	if(side == SyncTarget) {
+		mdev->gen_cnt[Flags] &= ~MDF_Consistent;
+		drbd_bm_reset_find(mdev);
+	} else {
+		/* If we are SyncSource we must be consistent.
+		 * FIXME this should be an assertion only,
+		 * otherwise it masks a logic bug somewhere else...
+		 */
+		mdev->gen_cnt[Flags] |= MDF_Consistent;
+	}
+	drbd_md_write(mdev);
+
 	set_cstate(mdev,side);
-	mdev->rs_left=mdev->rs_total;
-	mdev->rs_start=jiffies;
-	mdev->rs_mark_left=mdev->rs_left;
-	mdev->rs_mark_time=mdev->rs_start;
-
-	INFO("Resync started as %s (need to sync %lu KB).\n",
-	     side == SyncTarget ? "target" : "source", 
-	     (unsigned long) (mdev->rs_left+1)>>1);
+	mdev->rs_total     =
+	mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+	mdev->rs_start     =
+	mdev->rs_mark_time = jiffies;
+
+	INFO("Resync started as %s (need to sync %lu KB [%lu bits set]).\n",
+	     cstate_to_name(side),
+	     (unsigned long) mdev->rs_total << (BM_BLOCK_SIZE_B-10),
+	     (unsigned long) mdev->rs_total);
 
 	// FIXME: this was a PARANOIA_BUG_ON, but it triggered! ??
 	ERR_IF(mdev->resync_work.cb != w_resync_inactive)
 		return;
 
-	if ( mdev->rs_left == 0 ) {
+	if ( mdev->rs_total == 0 ) {
 		drbd_resync_finished(mdev);
 		return;
 	}
 
-	if(mdev->cstate == SyncTarget) {
-		mdev->gen_cnt[Flags] &= ~MDF_Consistent;
-		bm_reset(mdev->mbds_id);
+	/* FIXME THINK
+	 * use mdev->cstate (we may already be paused...) or side here ?? */
+	if (mdev->cstate == SyncTarget) {
+		drbd_bm_reset_find(mdev);
 		D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags));
 		mod_timer(&mdev->resync_timer,jiffies);
-	} else {
-		// If we are SyncSource we must be consistent :)
-		mdev->gen_cnt[Flags] |= MDF_Consistent;
 	}
 
-	drbd_md_write(mdev);
-
 	drbd_global_lock();
 	if (mdev->cstate == SyncTarget || mdev->cstate == SyncSource) {
 		_drbd_pause_higher_sg(mdev);
@@ -839,7 +853,6 @@
 	   * thread of other mdev already paused us,
 	   * or something very strange happend to our cstate!
 	   * I really hate it that we can't have a consistent view of cstate.
-	   * maybe we even need yet an other smp_mb() ?
 	   */
 	drbd_global_unlock();
 }
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_fs.c,v
retrieving revision 1.28.2.104
retrieving revision 1.28.2.105
diff -u -3 -r1.28.2.104 -r1.28.2.105
--- drbd_fs.c	9 Jun 2004 14:17:59 -0000	1.28.2.104
+++ drbd_fs.c	15 Jun 2004 10:07:32 -0000	1.28.2.105
@@ -80,7 +80,7 @@
 	if ( pmdss != drbd_md_ss(mdev) && mdev->md_index == -1 ) {
 		WARN("Moving meta-data.\n");
 		drbd_al_shrink(mdev); // All extents inactive.
-		drbd_write_bm(mdev);  // 
+		drbd_bm_write(mdev);  // write bitmap
 		drbd_md_write(mdev);  // Write mdev->la_size to disk.
 	}
 	lc_unlock(mdev->act_log);
@@ -94,6 +94,13 @@
  * note, *_capacity operates in 512 byte sectors!!
  *
  * currently *_size is in KB.
+ *
+ * FIXME
+ * since this is done by drbd receiver as well as from drbdsetup,
+ * this actually needs proper locking!
+ * drbd_bm_resize already protects itself with a mutex.
+ * but again, this is a state change, and thus should be serialized with other
+ * state changes on a more general level already.
  */
 STATIC int do_determin_dev_size(struct Drbd_Conf* mdev)
 {
@@ -140,11 +147,17 @@
 	}
 
 	if( (drbd_get_capacity(mdev->this_bdev)>>1) != size ) {
-		if(bm_resize(mdev->mbds_id,size)) {
+		int err;
+		err = drbd_bm_resize(mdev,size<<1); // wants sectors
+		if (unlikely(err)) {
+			ERR("BM resizing failed. "
+			    "Leaving size unchanged at size = %lu KB\n", size);
+		} else {
+			// racy, see comments above.
 			drbd_set_my_capacity(mdev,size<<1);
 			mdev->la_size = size;
 			INFO("size = %lu KB\n",size);
-		} else ERR("BM resizing failed. Leaving size unchanged\n");
+		}
 	}
 
 	return rv;
@@ -333,14 +346,14 @@
 
 /* FIXME if (md_gc_valid < 0) META DATA IO NOT POSSIBLE! */
 
+	drbd_bm_lock(mdev); // racy...
 	drbd_determin_dev_size(mdev);
 
-	if(md_gc_valid > 0) drbd_read_bm(mdev);
+	if(md_gc_valid > 0) drbd_bm_read(mdev);
 	else {
 		INFO("Assuming that all blocks are out of sync (aka FullSync)\n");
-		bm_fill_bm(mdev->mbds_id,-1);
-		mdev->rs_total = drbd_get_capacity(mdev->this_bdev);
-		drbd_write_bm(mdev);
+		drbd_bm_set_all(mdev);
+		drbd_bm_write(mdev);
 	}
 
 	D_ASSERT(mdev->sync_conf.al_extents >= 7);
@@ -379,7 +392,6 @@
 	}
 
 
-// FIXME why "else" ?? I think allways, and *before* send_param!
 	clear_bit(DISKLESS,&mdev->flags);
 	smp_wmb();
 // FIXME EXPLAIN:
@@ -388,6 +400,7 @@
 	if(mdev->cstate >= Connected ) {
 		drbd_send_param(mdev,1);
 	}
+	drbd_bm_unlock(mdev);
 
 	return 0;
 
@@ -612,7 +625,8 @@
 	 * */
 
 	mdev->state = (Drbd_State) newstate & 0x03;
-	INFO("switched to %s state\n", nodestate_to_name(mdev->state));
+	INFO( "switched to %s/%s state\n", nodestate_to_name(mdev->state),
+			nodestate_to_name(mdev->o_state) );
 	if(newstate & Primary) {
 		NOT_IN_26( set_device_ro(MKDEV(MAJOR_NR, minor), FALSE ); )
 
@@ -813,8 +827,10 @@
 		}
 		err=0;
 		mdev->lo_usize = (unsigned long)arg;
+		drbd_bm_lock(mdev);
 		drbd_determin_dev_size(mdev);
 		drbd_md_write(mdev); // Write mdev->la_size to disk.
+		drbd_bm_unlock(mdev);
 		if (mdev->cstate == Connected) drbd_send_param(mdev,0);
 		break;
 
@@ -964,11 +980,16 @@
 			break;
 		}
 
-		bm_fill_bm(mdev->mbds_id,-1);
-		mdev->rs_total = drbd_get_capacity(mdev->this_bdev);
-		drbd_write_bm(mdev);
+		drbd_bm_lock(mdev); // racy...
+
+		drbd_bm_set_all(mdev);
+		drbd_bm_write(mdev);
+
 		drbd_send_short_cmd(mdev,BecomeSyncSource);
 		drbd_start_resync(mdev,SyncTarget);
+
+		drbd_bm_unlock(mdev);
+
 		break;
 
 	case DRBD_IOCTL_INVALIDATE_REM:
@@ -979,11 +1000,16 @@
 			break;
 		}
 
-		bm_fill_bm(mdev->mbds_id,-1);
-		mdev->rs_total = drbd_get_capacity(mdev->this_bdev);
-		drbd_write_bm(mdev);
+		drbd_bm_lock(mdev); // racy...
+
+		drbd_bm_set_all(mdev);
+		drbd_bm_write(mdev);
+
 		drbd_send_short_cmd(mdev,BecomeSyncTarget);
 		drbd_start_resync(mdev,SyncSource);
+
+		drbd_bm_unlock(mdev);
+
 		break;
 
 	default:
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_int.h,v
retrieving revision 1.58.2.170
retrieving revision 1.58.2.171
diff -u -3 -r1.58.2.170 -r1.58.2.171
--- drbd_int.h	8 Jun 2004 12:46:33 -0000	1.58.2.170
+++ drbd_int.h	15 Jun 2004 10:07:32 -0000	1.58.2.171
@@ -144,6 +144,7 @@
 	printk(level DEVICE_NAME "%d: " fmt, \
 		(int)(mdev-drbd_conf) , ##args)
 
+#define ALERT(fmt,args...) PRINTK(KERN_ALERT, fmt , ##args)
 #define ERR(fmt,args...)  PRINTK(KERN_ERR, fmt , ##args)
 #define WARN(fmt,args...) PRINTK(KERN_WARNING, fmt , ##args)
 #define INFO(fmt,args...) PRINTK(KERN_INFO, fmt , ##args)
@@ -170,7 +171,8 @@
 		missed = 0;					\
 		toks -= ratelimit_jiffies;			\
 		if (lost)					\
-			printk(KERN_WARNING "drbd: %d messages suppressed.\n", lost);\
+			WARN("%d messages suppressed in %s:%d.\n",\
+				lost , __FILE__ , __LINE__ );	\
 		__ret=1;					\
 	} else {						\
 		missed++;					\
@@ -189,7 +191,7 @@
 	 ERR("ASSERT( " #exp " ) in %s:%d\n", __FILE__,__LINE__)
 #endif
 #define ERR_IF(exp) if (({ \
-	int _b = (exp); \
+	int _b = (exp)!=0; \
 	if (_b) ERR("%s: (" #exp ") in %s:%d\n", __func__, __FILE__,__LINE__); \
 	 _b; \
 	}))
@@ -387,6 +389,9 @@
  *      these are pointers to local structs
  *      and have no relevance for the partner,
  *      which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
  */
 typedef struct {
 	u32       magic;
@@ -606,34 +611,7 @@
 	SENT_DISK_FAILURE,	// sending it once is enough
 };
 
-struct BitMap {
-	sector_t dev_size;
-	unsigned long size;
-	unsigned long* bm;
-	unsigned long gs_bitnr;
-	spinlock_t bm_lock;
-};
-
-// activity log
-#define AL_EXTENTS_PT 61         // Extents per 512B sector (AKA transaction)
-#define AL_EXTENT_SIZE_B 22             // One extent represents 4M Storage
-#define AL_EXTENT_SIZE (1<<AL_EXTENT_SIZE_B)
-// resync bitmap
-#define BM_EXTENT_SIZE_B 24       // One extent represents 16M Storage
-#define BM_EXTENT_SIZE (1<<BM_EXTENT_SIZE_B)
-
-#define BM_WORDS_PER_EXTENT ( (AL_EXTENT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
-#define BM_BYTES_PER_EXTENT ( (AL_EXTENT_SIZE/BM_BLOCK_SIZE) / 8 )
-#define EXTENTS_PER_SECTOR  ( 512 / BM_BYTES_PER_EXTENT )
-
-struct bm_extent { // 16MB sized extents.
-	struct lc_element lce;
-	int rs_left; //number of sectors out of sync in this extent.
-	unsigned long flags;
-};
-
-#define BME_NO_WRITES    0
-#define BME_LOCKED       1
+struct drbd_bitmap; // opaque for Drbd_Conf
 
 // TODO sort members for performance
 // MAYBE group them further
@@ -719,15 +697,17 @@
 	unsigned long flags;
 	struct task_struct *send_task; /* about pid calling drbd_send */
 	spinlock_t send_task_lock;
-	sector_t rs_left;     // blocks not up-to-date [unit sectors]
-	sector_t rs_total;    // blocks to sync in this run [unit sectors]
+	// sector_t rs_left;	   // blocks not up-to-date [unit BM_BLOCK_SIZE]
+	// moved into bitmap->bm_set
+	unsigned long rs_total;    // blocks to sync in this run [unit BM_BLOCK_SIZE]
 	unsigned long rs_start;    // Syncer's start time [unit jiffies]
-	sector_t rs_mark_left;// block not up-to-date at mark [unit sect.]
+	unsigned long rs_paused;   // cumulated time in PausedSyncX state [unit jiffies]
+	unsigned long rs_mark_left;// block not up-to-date at mark [unit BM_BLOCK_SIZE]
 	unsigned long rs_mark_time;// marks's time [unit jiffies]
 	struct Drbd_thread receiver;
 	struct Drbd_thread worker;
 	struct Drbd_thread asender;
-	struct BitMap* mbds_id;
+	struct drbd_bitmap* bitmap;
 	struct lru_cache* resync; // Used to track operations of resync...
 	atomic_t resync_locked;   // Number of locked elements in resync LRU
 	int open_cnt;
@@ -778,7 +758,6 @@
 extern int drbd_send_cmd(drbd_dev *mdev, struct socket *sock,
 			  Drbd_Packet_Cmd cmd, Drbd_Header *h, size_t size);
 extern int drbd_send_sync_param(drbd_dev *mdev, struct syncer_config *sc);
-extern int drbd_send_cstate(drbd_dev *mdev);
 extern int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,
 			   u32 set_size);
 extern int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
@@ -791,37 +770,24 @@
 extern int _drbd_send_barrier(drbd_dev *mdev);
 extern int drbd_send_drequest(drbd_dev *mdev, int cmd,
 			      sector_t sector,int size, u64 block_id);
-extern int drbd_send_insync(drbd_dev *mdev,sector_t sector,
-			    u64 block_id);
 extern int drbd_send_bitmap(drbd_dev *mdev);
 extern int _drbd_send_bitmap(drbd_dev *mdev);
 extern void drbd_free_ll_dev(drbd_dev *mdev);
 extern int drbd_io_error(drbd_dev* mdev);
 extern void drbd_mdev_cleanup(drbd_dev *mdev);
 
-
-
 // drbd_meta-data.c (still in drbd_main.c)
 extern void drbd_md_write(drbd_dev *mdev);
 extern int drbd_md_read(drbd_dev *mdev);
-extern void drbd_md_inc(drbd_dev *mdev, enum MetaDataIndex order);
 extern int drbd_md_compare(drbd_dev *mdev,Drbd_Parameter_Packet *partner);
 extern void drbd_dump_md(drbd_dev *, Drbd_Parameter_Packet *, int );
-
-// drbd_bitmap.c (still in drbd_main.c)
-#define SS_OUT_OF_SYNC (1)
-#define SS_IN_SYNC     (0)
-#define MBDS_SYNC_ALL (-2)
-#define MBDS_DONE     (-3)
-// I want the packet to fit within one page
-#define MBDS_PACKET_SIZE (PAGE_SIZE-sizeof(Drbd_Header))
-
-#define BM_BLOCK_SIZE_B  12
-#define BM_BLOCK_SIZE    (1<<BM_BLOCK_SIZE_B)
-
-#define BM_IN_SYNC       0
-#define BM_OUT_OF_SYNC   1
-
+// maybe define them below as inline?
+extern void drbd_md_inc(drbd_dev *mdev, enum MetaDataIndex order);
+/* coming soon {
+extern void drbd_md_set_flag(drbd_dev *mdev, int flags);
+extern void drbd_md_clear_flag(drbd_dev *mdev, int flags);
+extern int drbd_md_test_flag(drbd_dev *mdev, int flag);
+} */
 
 /* Meta data layout
    We reserve a 128MB Block (4k aligned)
@@ -835,6 +801,11 @@
 #define MD_AL_MAX_SIZE 64   // = 32 kb LOG  ~ 3776 extents ~ 14 GB Storage
 #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) //Allows up to about 3.8TB
 
+// activity log
+#define AL_EXTENTS_PT    61      // Extents per 512B sector (AKA transaction)
+#define AL_EXTENT_SIZE_B 22      // One extent represents 4M Storage
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SIZE_B)
+
 #if BITS_PER_LONG == 32
 #define LN2_BPL 5
 #define cpu_to_lel(A) cpu_to_le32(A)
@@ -847,23 +818,103 @@
 #error "LN2 of BITS_PER_LONG unknown!"
 #endif
 
-struct BitMap;
+// resync bitmap
+// 16MB sized 'bitmap extent' to track syncer usage
+struct bm_extent {
+	struct lc_element lce;
+	int rs_left; //number of bits set (out of sync) in this extent.
+	unsigned long flags;
+};
+
+#define BME_NO_WRITES  0  // bm_extent.flags: no more requests on this one!
+#define BME_LOCKED     1  // bm_extent.flags: syncer active on this one.
+
+// drbd_bitmap.c
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define BM_BLOCK_SIZE_B  12			 //  4k per bit
+#define BM_BLOCK_SIZE    (1<<BM_BLOCK_SIZE_B)
+/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
+ * per sector of on disk bitmap */
+#define BM_EXT_SIZE_B    (BM_BLOCK_SIZE_B + 9+3)
+#define BM_EXT_SIZE      (1<<BM_EXT_SIZE_B)
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SIZE_B-9))
+#define BM_BIT_TO_SECT(x)   ((x)<<(BM_BLOCK_SIZE_B-9))
+#define BM_SECT_PER_BIT     BM_BIT_TO_SECT(1)
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SIZE_B-9))
+
+/* in one sector of the bitmap, we have 1<<12 bits,
+ * accounting for this many activity_log extents.
+ */
+#define AL_EXT_PER_BM_SECT  (1 << (12-(AL_EXTENT_SIZE_B - BM_BLOCK_SIZE_B)))
+
+
+/* I want the packet to fit within one page
+ * THINK maybe use a special bitmap header,
+ * including offset and compression scheme and whatnot
+ */
+#define BM_PACKET_WORDS     ((PAGE_SIZE-sizeof(Drbd_Header))/sizeof(long))
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ *                   of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit   0        bit 37   bit 38            bit (512*8)-1
+ *           ...|........|........|.. // ..|........|
+ * sect. 0       `296     `304                     ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT    ( (AL_EXTENT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT    ( (AL_EXTENT_SIZE/BM_BLOCK_SIZE) / 8 )  // 128
+#define BM_EXT_PER_SECT	    ( 512 / BM_BYTES_PER_EXT )        //   4
+ */
 
-// TODO I'd like to change these all to take the mdev as first argument
-extern struct BitMap* bm_init(unsigned long size_kb);
-extern int bm_resize(struct BitMap* sbm, unsigned long size_kb);
-extern void bm_cleanup(struct BitMap* sbm);
-extern int bm_set_bit(drbd_dev *mdev, sector_t sector, int size, int bit);
-extern sector_t bm_get_sector(struct BitMap* sbm,int* size);
-extern void bm_reset(struct BitMap* sbm);
-extern void bm_fill_bm(struct BitMap* sbm,int value);
-extern int bm_get_bit(struct BitMap* sbm, sector_t sector, int size);
-extern int bm_count_sectors(struct BitMap* sbm, unsigned long enr);
-extern int bm_end_of_dev_case(struct BitMap* sbm);
-extern int bm_is_rs_done(struct BitMap* sbm);
+extern int  drbd_bm_init      (drbd_dev *mdev);
+extern int  drbd_bm_resize    (drbd_dev *mdev, sector_t sectors);
+extern void drbd_bm_cleanup   (drbd_dev *mdev);
+extern void drbd_bm_set_all   (drbd_dev *mdev);
+extern void drbd_bm_clear_all (drbd_dev *mdev);
+extern void drbd_bm_reset_find(drbd_dev *mdev);
+extern int  drbd_bm_set_bit   (drbd_dev *mdev, unsigned long bitnr);
+extern int  drbd_bm_test_bit  (drbd_dev *mdev, unsigned long bitnr);
+extern int  drbd_bm_clear_bit (drbd_dev *mdev, unsigned long bitnr);
+extern int  drbd_bm_e_weight  (drbd_dev *mdev, unsigned long enr);
+extern int  drbd_bm_read_sect (drbd_dev *mdev, sector_t offset);
+extern int  drbd_bm_write_sect(drbd_dev *mdev, sector_t offset);
+extern void drbd_bm_read      (drbd_dev *mdev);
+extern void drbd_bm_write     (drbd_dev *mdev);
+extern unsigned long drbd_bm_e_set_all   (drbd_dev *mdev, unsigned long enr);
+extern size_t        drbd_bm_words       (drbd_dev *mdev);
+extern unsigned long drbd_bm_find_next   (drbd_dev *mdev);
+extern unsigned long drbd_bm_total_weight(drbd_dev *mdev);
+// for receive_bitmap
+extern void drbd_bm_merge_lel (drbd_dev *mdev, size_t offset, size_t number,
+				unsigned long* buffer);
+// for _drbd_send_bitmap and drbd_bm_write_sect
+extern void drbd_bm_get_lel   (drbd_dev *mdev, size_t offset, size_t number,
+				unsigned long* buffer);
+/*
+ * only used by drbd_bm_read_sect
+extern void drbd_bm_set_lel   (drbd_dev *mdev, size_t offset, size_t number,
+				unsigned long* buffer);
+*/
 
+extern void __drbd_bm_lock    (drbd_dev *mdev, char* file, int line);
+extern void drbd_bm_unlock    (drbd_dev *mdev);
+#define drbd_bm_lock(mdev)    __drbd_bm_lock(mdev, __FILE__, __LINE__ )
 
 
+// drbd_main.c
 extern drbd_dev *drbd_conf;
 extern int minor_count;
 extern kmem_cache_t *drbd_request_cache;
@@ -927,10 +978,9 @@
 extern void drbd_rs_cancel_all(drbd_dev* mdev);
 extern void drbd_al_read_log(struct Drbd_Conf *mdev);
 extern void drbd_set_in_sync(drbd_dev* mdev, sector_t sector,int blk_size);
-extern void drbd_read_bm(struct Drbd_Conf *mdev);
+extern void drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector,int blk_size);
 extern void drbd_al_apply_to_bm(struct Drbd_Conf *mdev);
 extern void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev);
-extern void drbd_write_bm(struct Drbd_Conf *mdev);
 extern void drbd_al_shrink(struct Drbd_Conf *mdev);
 
 /*
@@ -1245,14 +1295,6 @@
 	D_ASSERT(atomic_read(&mdev->ap_bio_cnt)>=0);
 }
 
-static inline void drbd_set_out_of_sync(drbd_dev* mdev,
-					sector_t sector, int blk_size)
-{
-	D_ASSERT(blk_size);
-	mdev->rs_total +=
-		bm_set_bit(mdev, sector, blk_size, SS_OUT_OF_SYNC);
-}
-
 #ifdef DUMP_EACH_PACKET
 /*
  * enable to dump information about every packet exchange.
@@ -1324,20 +1366,27 @@
 )
 #endif
 
-#ifndef hweight_long
-# if (BITS_PER_LONG > 32)
-static inline unsigned long hweight64(__u64 w)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
+// this is a direct copy from 2.6.6 include/linux/bitops.h
+
+static inline unsigned long generic_hweight64(__u64 w)
 {
+#if BITS_PER_LONG < 64
+	return generic_hweight32((unsigned int)(w >> 32)) +
+				generic_hweight32((unsigned int)w);
+#else
 	u64 res;
-        res = (w & 0x5555555555555555) + ((w >> 1) & 0x5555555555555555);
-        res = (res & 0x3333333333333333) + ((res >> 2) & 0x3333333333333333);
-        res = (res & 0x0F0F0F0F0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F0F0F0F0F);
-        res = (res & 0x00FF00FF00FF00FF) + ((res >> 8) & 0x00FF00FF00FF00FF);
-        res = (res & 0x0000FFFF0000FFFF) + ((res >> 16) & 0x0000FFFF0000FFFF);
-        return (res & 0x00000000FFFFFFFF) + ((res >> 32) & 0x00000000FFFFFFFF);
-}
-#  define hweight_long(x) hweight64(x)
-# else
-#  define hweight_long(x) hweight32(x)
-# endif
+	res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul);
+	res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
+	res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful);
+	res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul);
+	res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul);
+	return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul);
+#endif
+}
+
+static inline unsigned long hweight_long(unsigned long w)
+{
+	return sizeof(w) == 4 ? generic_hweight32(w) : generic_hweight64(w);
+}
 #endif
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_main.c,v
retrieving revision 1.73.2.186
retrieving revision 1.73.2.187
diff -u -3 -r1.73.2.186 -r1.73.2.187
--- drbd_main.c	9 Jun 2004 20:06:36 -0000	1.73.2.186
+++ drbd_main.c	15 Jun 2004 10:07:32 -0000	1.73.2.187
@@ -695,16 +695,15 @@
 /* See the comment at receive_bitmap() */
 int _drbd_send_bitmap(drbd_dev *mdev)
 {
-	int buf_i,want;
+	int want;
 	int ok=TRUE, bm_i=0;
-	size_t bm_words;
-	unsigned long *buffer,*bm;
+	size_t bm_words, num_words;
+	unsigned long *buffer;
 	Drbd_Header *p;
 
-	ERR_IF(!mdev->mbds_id) return FALSE;
+	ERR_IF(!mdev->bitmap) return FALSE;
 
-	bm_words = mdev->mbds_id->size/sizeof(long);
-	bm = mdev->mbds_id->bm;
+	bm_words = drbd_bm_words(mdev);
 	p  = vmalloc(PAGE_SIZE); // sleeps. cannot fail.
 	buffer = (unsigned long*)p->payload;
 
@@ -713,11 +712,14 @@
 	 * some such algorithms in the kernel anyways.
 	 */
 	do {
-		want=min_t(int,MBDS_PACKET_SIZE,(bm_words-bm_i)*sizeof(long));
-		for(buf_i=0;buf_i<want/sizeof(long);buf_i++)
-			buffer[buf_i] = cpu_to_lel(bm[bm_i++]);
+		num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i );
+		want = num_words * sizeof(long);
+		if (want) {
+			drbd_bm_get_lel(mdev, bm_i, num_words, buffer);
+		}
 		ok = _drbd_send_cmd(mdev,mdev->data.socket,ReportBitMap,
 				   p, sizeof(*p) + want, 0);
+		bm_i += num_words;
 	} while (ok && want);
 
 	vfree(p);
@@ -1036,11 +1038,17 @@
 		}
 		D_ASSERT(rv != 0);
 		if (rv == -EINTR ) {
+#if 0
+			/* FIXME this happens all the time.
+			 * we don't care for now!
+			 * eventually this should be sorted out by the proper
+			 * use of the SIGNAL_ASENDER bit... */
 			if (DRBD_ratelimit(5*HZ,5)) {
 				DBG("Got a signal in drbd_send(,%c,)!\n",
 				    sock == mdev->meta.socket ? 'm' : 's');
 				// dump_stack();
 			}
+#endif
 			drbd_flush_signals(current);
 			rv = 0;
 		}
@@ -1307,7 +1315,6 @@
 	mdev->p_size       =
 	mdev->rs_start     =
 	mdev->rs_total     =
-	mdev->rs_left      =
 	mdev->rs_mark_left =
 	mdev->rs_mark_time = 0;
 	mdev->send_task    = NULL;
@@ -1440,7 +1447,7 @@
 )
 
 			tl_cleanup(mdev);
-			if (mdev->mbds_id) bm_cleanup(mdev->mbds_id);
+			if (mdev->bitmap) drbd_bm_cleanup(mdev);
 			if (mdev->resync) lc_free(mdev->resync);
 
 			D_ASSERT(mdev->ee_in_use==0);
@@ -1639,8 +1646,7 @@
 		if(!page) goto Enomem;
 		mdev->md_io_page = page;
 
-		mdev->mbds_id = bm_init(0);
-		if (!mdev->mbds_id) goto Enomem;
+		if (drbd_bm_init(mdev)) goto Enomem;
 		// no need to lock access, we are still initializing the module.
 		mdev->resync = lc_alloc(17, sizeof(struct bm_extent),mdev);
 		if (!mdev->resync) goto Enomem;
@@ -1764,388 +1770,6 @@
 }
 
 /*********************************/
-
-/*** The bitmap stuff. ***/
-/*
-  We need to store one bit for a block.
-  Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
-  Bit 0 ==> Primary and secondary nodes are in sync.
-  Bit 1 ==> secondary node's block must be updated. (')
-*/
-
-
-// Shift right with round up. :)
-#define SR_RU(A,B) ( ((A)>>(B)) + ( ((A) & ((1<<(B))-1)) > 0 ? 1 : 0 ) )
-
-int bm_resize(struct BitMap* sbm, unsigned long size_kb)
-{
-	unsigned long *obm,*nbm;
-	unsigned long size;
-
-	if(!sbm) return 1; // Nothing to do
-
-	size = SR_RU(size_kb,(BM_BLOCK_SIZE_B - (10-LN2_BPL))) << (LN2_BPL-3);
-	/* 10 => blk_size is KB ; 3 -> 2^3=8 Bits per Byte */
-	// Calculate the number of long words needed, round it up, and
-	// finally convert it to bytes.
-
-	if(size == 0) {
-		sbm->size = size;
-		vfree(sbm->bm);
-		sbm->bm = 0;
-		return 1;
-	}
-
-	obm = sbm->bm;
-	nbm = vmalloc(size);
-	if(!nbm) {
-		printk(KERN_ERR DEVICE_NAME"X: Failed to allocate BitMap\n");
-		return 0;
-	}
-	memset(nbm,0,size);
-
-	spin_lock_irq(&sbm->bm_lock);
-	if(obm) {
-		memcpy(nbm,obm,min_t(unsigned long,sbm->size,size));
-	}
-	sbm->dev_size = size_kb;
-	sbm->size = size;
-	sbm->bm = nbm;
-	spin_unlock_irq(&sbm->bm_lock);
-
-	if(obm) vfree(obm);
-
-	return 1;
-}
-
-struct BitMap* bm_init(unsigned long size_kb)
-{
-	struct BitMap* sbm;
-
-	sbm = kmalloc(sizeof(struct BitMap),GFP_KERNEL);
-	if(!sbm) {
-		printk(KERN_ERR DEVICE_NAME"X: Failed to allocate BM desc\n");
-		return 0;
-	}
-
-	sbm->dev_size = size_kb;
-	sbm->gs_bitnr=0;
-	sbm->bm_lock = SPIN_LOCK_UNLOCKED;
-
-	sbm->size = 0;
-	sbm->bm = NULL;
-
-	if(!bm_resize(sbm,size_kb)) {
-		kfree(sbm);
-		return 0;
-	}
-
-	return sbm;
-}
-
-void bm_cleanup(struct BitMap* sbm)
-{
-	vfree(sbm->bm);
-	kfree(sbm);
-}
-
-#define BM_SS (BM_BLOCK_SIZE_B-9)     // 3
-#define BM_NS (1<<BM_SS)              // 8
-#define BM_MM ((1L<<BM_SS)-1)         // 7 = 111bin
-#define BPLM (BITS_PER_LONG-1)
-#define BM_BPS (BM_BLOCK_SIZE/1024)   // 4
-
-/* sector_t and size have a higher resolution (512 Byte) than
-   the bitmap (4K). In case we have to set a bit, we 'round up',
-   in case we have to clear a bit we do the opposit.
-   It returns the number of sectors that where marked dirty, or
-   marked clean.
-*/
-int bm_set_bit(drbd_dev *mdev, sector_t sector, int size, int bit)
-{
-	struct BitMap* sbm = mdev->mbds_id;
-	unsigned long* bm;
-	unsigned long sbnr,ebnr,bnr;
-	sector_t esector = ( sector + (size>>9) - 1 );
-	int ret=0;
-	unsigned long flags;
-
-	if (size <= 0 || (size & 0x1ff) != 0 || ( size > PAGE_SIZE && size != AL_EXTENT_SIZE)) {
-		DUMPI(size);
-		return 0;
-	}
-
-	if(sbm == NULL) {
-		printk(KERN_ERR DEVICE_NAME"X: No BitMap !?\n");
-		return 0;
-	}
-
-	if(sector >= sbm->dev_size<<1) return 0;
-	ERR_IF(esector >= sbm->dev_size<<1) esector = (sbm->dev_size<<1) - 1;
-
-	sbnr = sector >> BM_SS;
-	ebnr = esector >> BM_SS;
-
-	/*
-	INFO("bm_set_bit(,%lu,%d,%d) %lu %lu %lu ; %lu %lu\n",
-	     sector,size,bit, esector, sbnr,ebnr, sbm->size, sbm->dev_size);
-	*/
-
-	spin_lock_irqsave(&sbm->bm_lock,flags);
-	bm = sbm->bm;
-
-	if(bit) {
-		for(bnr=sbnr; bnr <= ebnr; bnr++) {
-			ERR_IF((bnr>>3) >= sbm->size) {
-				DUMPST(sector);
-				DUMPI(size);
-				DUMPLU(bnr);
-				DUMPLU(sbm->size);
-				DUMPST(sbm->dev_size);
-				break;
-			}
-			if(!test_bit(bnr&BPLM,bm+(bnr>>LN2_BPL))) ret+=BM_NS;
-			__set_bit(bnr & BPLM, bm + (bnr>>LN2_BPL));
-			ret += bm_end_of_dev_case(sbm);
-		}
-	} else { // bit == 0
-		sector_t dev_size;
-
-		dev_size=sbm->dev_size;
-
-		if(  (sector & BM_MM) != 0 )     sbnr++;
-		if( ebnr && (esector & BM_MM) != BM_MM ) {
-			ebnr--;
-
-			// There is this one special case at the
-			// end of the device...
-			if(unlikely(dev_size<<1 == esector+1)) {
-				ebnr++;
-				ERR_IF((ebnr>>3) >= sbm->size) {
-					DUMPST(sector);
-					DUMPI(size);
-					DUMPLU(ebnr);
-					DUMPLU(sbm->size);
-					DUMPST(sbm->dev_size);
-				} else if(test_bit(ebnr&BPLM,bm+(ebnr>>LN2_BPL))) {
-					ret = (esector-sector+1)-BM_NS;
-				}
-			}
-		}
-
-		for(bnr=sbnr; bnr <= ebnr; bnr++) {
-			ERR_IF((bnr>>3) >= sbm->size) {
-				DUMPST(sector);
-				DUMPI(size);
-				DUMPLU(bnr);
-				DUMPLU(sbnr);
-				DUMPLU(ebnr);
-				DUMPLU(sbm->size);
-				DUMPST(sbm->dev_size);
-				break;
-			}
-			if(test_bit(bnr&BPLM,bm+(bnr>>LN2_BPL))) ret+=BM_NS;
-			clear_bit(bnr & BPLM, bm + (bnr>>LN2_BPL));
-		}
-	}
-	spin_unlock_irqrestore(&sbm->bm_lock,flags);
-
-	return ret;
-}
-
-static inline unsigned long bitmask(int o)
-{
-	return o >= BITS_PER_LONG ? -1 : ((1<<o)-1);
-}
-
-/* In case the device's size is not divisible by 4, the last bit
-   does not count for 8 sectors but something less. This function
-   returns this 'something less' iff the last bit is set.
-   0               in case the device's size is divisible by 4
-   -2,-4 or -6     in the other cases
-   If the bits beyond the device's size are set, they are cleared
-   and their weight (-8 per bit) is added to the return value.
- */
-int bm_end_of_dev_case(struct BitMap* sbm)
-{
-	unsigned long bnr;
-	unsigned long* bm;
-	int rv=0;
-	int used_bits;      // number ob bits used in last word
-	unsigned long mask;
-
-	bm = sbm->bm;
-
-	if( sbm->dev_size % BM_BPS ) {
-		bnr = sbm->dev_size / BM_BPS;
-		if(test_bit(bnr&BPLM,bm+(bnr>>LN2_BPL))) {
-			rv = (sbm->dev_size*2) % BM_NS - BM_NS;
-		}
-	}
-	used_bits = BITS_PER_LONG -
-		( sbm->size*8 - div_ceil(sbm->dev_size,BM_BPS) );
-	mask = ~ bitmask(used_bits); // mask of bits to clear;
-	mask &= bm[sbm->size/sizeof(long)-1];
-	if( mask ) {
-		rv = -8 * hweight_long(mask);
-		bm[sbm->size/sizeof(long)-1] &= ~mask;
-	}
-
-	return rv;
-}
-
-#define WORDS ( ( BM_EXTENT_SIZE / BM_BLOCK_SIZE ) / BITS_PER_LONG )
-int bm_count_sectors(struct BitMap* sbm, unsigned long enr)
-{
-	unsigned long* bm;
-	unsigned long flags;
-	int i,max,bits=0;
-
-	spin_lock_irqsave(&sbm->bm_lock,flags);
-	bm = sbm->bm;
-
-	max = min_t(int, (enr+1)*WORDS, sbm->size/sizeof(long));
-
-	for(i = enr * WORDS ; i < max ; i++) {
-		bits += hweight_long(bm[i]);
-	}
-
-	bits = bits << (BM_BLOCK_SIZE_B - 9); // in sectors
-
-	// Special case at the end of the device
-	if( max == sbm->size/sizeof(long) ) {
-		bits += bm_end_of_dev_case(sbm);
-	}
-
-	spin_unlock_irqrestore(&sbm->bm_lock,flags);
-
-	return bits;
-}
-#undef WORDS
-
-int bm_get_bit(struct BitMap* sbm, sector_t sector, int size)
-{
-	unsigned long* bm;
-	unsigned long sbnr,ebnr,bnr;
-	unsigned long flags;
-	sector_t esector = ( sector + (size>>9) - 1 );
-	int ret=0;
-
-	if(sbm == NULL) {
-		printk(KERN_ERR DEVICE_NAME"X: No BitMap !?\n");
-		return 0;
-	}
-
-	sbnr = sector >> BM_SS;
-	ebnr = esector >> BM_SS;
-
-	spin_lock_irqsave(&sbm->bm_lock,flags);
-	bm = sbm->bm;
-
-	for (bnr=sbnr; bnr <= ebnr; bnr++) {
-		if (test_bit(bnr, bm)) {
-			ret=1;
-			break;
-		}
-	}
-
-	spin_unlock_irqrestore(&sbm->bm_lock,flags);
-
-	return ret;
-}
-
-sector_t bm_get_sector(struct BitMap* sbm,int* size)
-{
-	sector_t bnr;
-	unsigned long* bm;
-	unsigned long flags;
-	sector_t dev_size;
-	sector_t ret;
-
-	if(*size != BM_BLOCK_SIZE) BUG(); // Other cases are not needed
-
-	if(sbm->gs_bitnr == -1) {
-		return MBDS_DONE;
-	}
-
-	spin_lock_irqsave(&sbm->bm_lock,flags);
-	bm = sbm->bm;
-	bnr = sbm->gs_bitnr;
-
-	// optimization possible, search word != 0 first...
-	while( (bnr>>3) < sbm->size ) {
-		if(test_bit(bnr & BPLM, bm + (bnr>>LN2_BPL))) break;
-		bnr++;
-	}
-
-	ret=bnr<<BM_SS;
-
-	dev_size=sbm->dev_size;
-	if( ret+((1<<BM_SS)-1) > dev_size<<1 ) {
-		int ns = dev_size % (1<<(BM_BLOCK_SIZE_B-10));
-		sbm->gs_bitnr = -1;
-		if(ns) *size = ns<<10;
-		else ret=MBDS_DONE;
-	} else {
-		sbm->gs_bitnr = bnr+1;
-	}
-
-	spin_unlock_irqrestore(&sbm->bm_lock,flags);
-
-	return ret;
-}
-
-int bm_is_rs_done(struct BitMap* sbm)
-{
-	int rv=0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sbm->bm_lock,flags);
-
-	if( (sbm->gs_bitnr<<BM_SS) + ((1<<BM_SS)-1) > sbm->dev_size<<1) {
-		int ns = sbm->dev_size % (1<<(BM_BLOCK_SIZE_B-10));
-		if(!ns) {
-			sbm->gs_bitnr = -1;
-			rv=1;
-		}
-	}
-
-	spin_unlock_irqrestore(&sbm->bm_lock,flags);
-
-	return rv;
-}
-
-void bm_reset(struct BitMap* sbm)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&sbm->bm_lock,flags);
-	sbm->gs_bitnr=0;
-	spin_unlock_irqrestore(&sbm->bm_lock,flags);
-}
-
-
-void bm_fill_bm(struct BitMap* sbm,int value)
-{
-	unsigned long* bm;
-	unsigned long bnr,o;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sbm->bm_lock,flags);
-	bm = sbm->bm;
-
-	memset(bm,value,sbm->size);
-
-	// Special case at end of device...
-	bnr = sbm->dev_size / BM_BPS + ( sbm->dev_size % BM_BPS ? 1 : 0 );
-	o = bnr / BITS_PER_LONG;
-	if ( o < sbm->size/sizeof(long) ) { // e.g. is wrong if dev_size == 1G 
-		bm[ o ] &= ( ( 1 << (bnr % BITS_PER_LONG) ) - 1 );
-	}
-
-	spin_unlock_irqrestore(&sbm->bm_lock,flags);
-}
-
-/*********************************/
 /* meta data management */
 
 struct meta_data_on_disk {
@@ -2160,7 +1784,7 @@
 
 /*
 
-FIXME md_io might fail unnoticed
+FIXME md_io might fail unnoticed sometimes ...
 
 */
 void drbd_md_write(drbd_dev *mdev)
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_proc.c,v
retrieving revision 1.8.2.30
retrieving revision 1.8.2.31
diff -u -3 -r1.8.2.30 -r1.8.2.31
--- drbd_proc.c	1 Jun 2004 07:00:57 -0000	1.8.2.30
+++ drbd_proc.c	15 Jun 2004 10:07:32 -0000	1.8.2.31
@@ -48,14 +48,16 @@
  *	[=====>..............] 33.5% (23456/123456)
  *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
  */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SIZE_B-10))
 STATIC int drbd_syncer_progress(struct Drbd_Conf* mdev,char *buf)
 {
 	int sz = 0;
-	unsigned long res , db, dt, dbdt, rt;
+	unsigned long res , db, dt, dbdt, rt, rs_left;
 	sector_t n;
 
-	n = (mdev->rs_left>>11)*1000;
-	sector_div(n,((mdev->rs_total>>11) + 1));
+	rs_left = drbd_bm_total_weight(mdev);
+	n = rs_left*1000;
+	sector_div(n,mdev->rs_total + 1);
 	res = n;
 	{
 		int i, y = res/50, x = 20-y;
@@ -69,15 +71,16 @@
 	}
 	res = 1000L - res;
 	sz+=sprintf(buf+sz,"sync'ed:%3lu.%lu%% ", res / 10, res % 10);
-	if (mdev->rs_total > 0x100000L) /* if more than 1 GB display in MB */
+	/* if more than 1 GB display in MB */
+	if (mdev->rs_total > 0x100000L) {
 		sz+=sprintf(buf+sz,"(%lu/%lu)M\n\t",
-			    (unsigned long) mdev->rs_left>>11,
-			    (unsigned long) mdev->rs_total>>11);
-	else
-		sz+=sprintf(buf+sz,"(%lu/%lu)K\n\t", 
-			    (unsigned long) mdev->rs_left>>1 | 
-			    (mdev->rs_left == 1),
-			    (unsigned long) mdev->rs_total>>1);
+			    (unsigned long) Bit2KB(rs_left) >> 10,
+			    (unsigned long) Bit2KB(mdev->rs_total) >> 10 );
+	} else {
+		sz+=sprintf(buf+sz,"(%lu/%lu)K\n\t",
+			    (unsigned long) Bit2KB(rs_left),
+			    (unsigned long) Bit2KB(mdev->rs_total) );
+	}
 
 	/* see drivers/md/md.c
 	 * We do not want to overflow, so the order of operands and
@@ -90,8 +93,8 @@
 	 */
 	dt = (jiffies - mdev->rs_mark_time) / HZ;
 	if (!dt) dt++;
-	db = (mdev->rs_mark_left - mdev->rs_left)>>1;
-	n = mdev->rs_left>>1;
+	db = Bit2KB(mdev->rs_mark_left - rs_left);
+	n = Bit2KB(rs_left);
 	sector_div(n,(db/100+1));
 	rt = ( dt * (unsigned long) n ) / 100; /* seconds */
 
@@ -109,7 +112,7 @@
 	/* mean speed since syncer started */
 	dt = (jiffies - mdev->rs_start) / HZ;
 	if (!dt) dt++;
-	db = (mdev->rs_total - mdev->rs_left)>>1;
+	db = Bit2KB(mdev->rs_total - rs_left);
 	if ((dbdt=db/dt) > 1000)
 		sz += sprintf(buf + sz, " (%ld,%03ld)",
 			dbdt/1000,dbdt % 1000);
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_receiver.c,v
retrieving revision 1.97.2.170
retrieving revision 1.97.2.171
diff -u -3 -r1.97.2.170 -r1.97.2.171
--- drbd_receiver.c	14 Jun 2004 12:54:18 -0000	1.97.2.170
+++ drbd_receiver.c	15 Jun 2004 10:07:32 -0000	1.97.2.171
@@ -969,8 +969,7 @@
 	if(mdev->conf.wire_protocol == DRBD_PROT_C) {
 		if(likely(drbd_bio_uptodate(&e->private_bio))) {
 			ok=drbd_send_ack(mdev,WriteAck,e);
-			if(ok && mdev->rs_left)
-				drbd_set_in_sync(mdev,sector,drbd_ee_get_size(e));
+			if(ok && mdev->rs_total) drbd_set_in_sync(mdev,sector,drbd_ee_get_size(e));
 		} else {
 			ok = drbd_send_ack(mdev,NegAck,e);
 			ok&= drbd_io_error(mdev);
@@ -1053,8 +1052,9 @@
 STATIC int receive_DataRequest(drbd_dev *mdev,Drbd_Header *h)
 {
 	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	struct Tl_epoch_entry *e;
-	int data_size;
+	int size;
 	Drbd_BlockRequest_Packet *p = (Drbd_BlockRequest_Packet*)h;
 
 	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
@@ -1062,8 +1062,27 @@
 	if (drbd_recv(mdev, h->payload, h->length) != h->length)
 		return FALSE;
 
-	sector    = be64_to_cpu(p->sector);
-	data_size = be32_to_cpu(p->blksize);
+	sector = be64_to_cpu(p->sector);
+	size   = be32_to_cpu(p->blksize);
+
+	/*
+	 * handled by NegDReply below ...
+	ERR_IF (test_bit(DISKLESS,&mdev->flags)) {
+		return FALSE;
+	ERR_IF ( (mdev->gen_cnt[Flags] & MDF_Consistent) == 0 )
+		return FALSE;
+	*/
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) {
+		ERR("%s:%d: sector: %lu, size: %d\n", __FILE__, __LINE__,
+				(unsigned long)sector,size);
+		return FALSE;
+	}
+	if ( sector + (size>>9) > capacity) {
+		ERR("%s:%d: sector: %lu, size: %d\n", __FILE__, __LINE__,
+				(unsigned long)sector,size);
+		return FALSE;
+	}
 
 	spin_lock_irq(&mdev->ee_lock);
 	e=drbd_get_ee(mdev);
@@ -1075,9 +1094,9 @@
 	list_add(&e->w.list,&mdev->read_ee);
 	spin_unlock_irq(&mdev->ee_lock);
 
-	if(!inc_local(mdev)) {
+	if(!inc_local(mdev) || (mdev->gen_cnt[Flags] & MDF_Consistent) == 0) {
 		if (DRBD_ratelimit(5*HZ,5))
-			ERR("Can not satisfy peer's read request, no local disk.\n");
+			ERR("Can not satisfy peer's read request, no local data.\n");
 		drbd_send_ack(mdev,NegDReply,e);
 		spin_lock_irq(&mdev->ee_lock);
 		drbd_put_ee(mdev,e);
@@ -1085,7 +1104,7 @@
 		return TRUE;
 	}
 
-	drbd_ee_prepare_read(mdev,e,sector,data_size);
+	drbd_ee_prepare_read(mdev,e,sector,size);
 
 	switch (h->command) {
 	case DataRequest:
@@ -1104,7 +1123,7 @@
 		D_ASSERT(0);
 	}
 
-	mdev->read_cnt += data_size >> 9;
+	mdev->read_cnt += size >> 9;
 	inc_unacked(mdev);
 	drbd_generic_make_request(READ,&e->private_bio);
 
@@ -1178,6 +1197,8 @@
 		drbd_thread_stop_nowait(&mdev->receiver);
 		return FALSE;
 	}
+
+	drbd_bm_lock(mdev);
 	mdev->p_size=p_size;
 
 	consider_sync = (mdev->cstate == WFReportParams);
@@ -1264,6 +1285,7 @@
 					ERR("Current Primary shall become sync TARGET! Aborting to prevent data corruption.\n");
 					set_cstate(mdev,StandAlone);
 					drbd_thread_stop_nowait(&mdev->receiver);
+					drbd_bm_unlock(mdev);
 					return FALSE;
 				}
 				mdev->gen_cnt[Flags] &= ~MDF_Consistent;
@@ -1271,7 +1293,7 @@
 			}
 		} else {
 			set_cstate(mdev,Connected);
-			if(mdev->rs_total) {
+			if(drbd_bm_total_weight(mdev)) {
 				/* We are not going to do a resync but there
 				   are marks in the bitmap.
 				   (Could be from the AL, or someone used
@@ -1279,9 +1301,8 @@
 				   Clean the bitmap...
 				 */
 				INFO("No resync -> clearing bit map.\n");
-				bm_fill_bm(mdev->mbds_id,0);
-				mdev->rs_total = 0;
-				drbd_write_bm(mdev);
+				drbd_bm_clear_all(mdev);
+				drbd_bm_write(mdev);
 			}
 		}
 
@@ -1305,10 +1326,12 @@
 		drbd_md_inc(mdev,ConnectedCnt);
 	}
 	if (oo_state != mdev->o_state) {
-		INFO("Peer switched to %s state\n", nodestate_to_name(mdev->o_state));
+		INFO( "now %s/%s\n", nodestate_to_name(mdev->state),
+				nodestate_to_name(mdev->o_state) );
 	}
 
 	drbd_md_write(mdev); // update connected indicator, la_size, ...
+	drbd_bm_unlock(mdev);
 
 	return TRUE;
 }
@@ -1318,44 +1341,42 @@
    chunks as long as it is little endian. (Understand it as byte stream,
    beginning with the lowest byte...) If we would use big endian
    we would need to process it from the highest address to the lowest,
-   in order to be agnostic to the 32 vs 64 bits issue. */
+   in order to be agnostic to the 32 vs 64 bits issue.
+
+   returns 0 on failure, 1 if we successfully received it. */
 STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h)
 {
-	size_t bm_words;
-	unsigned long *buffer, *bm, word;
-	int buf_i,want;
-	int ok=FALSE, bm_i=0;
-	unsigned long bits=0;
-
-	bm_words=mdev->mbds_id->size/sizeof(long);
-	bm=mdev->mbds_id->bm;
-	buffer=vmalloc(MBDS_PACKET_SIZE);
+	size_t bm_words, bm_i, want, num_words;
+	unsigned long *buffer;
+	int ok=FALSE;
+
+	drbd_bm_lock(mdev);
+
+	bm_words = drbd_bm_words(mdev);
+	bm_i     = 0;
+	buffer   = vmalloc(BM_PACKET_WORDS*sizeof(long));
 
 	while (1) {
-		want=min_t(int,MBDS_PACKET_SIZE,(bm_words-bm_i)*sizeof(word));
+		num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i );
+		want = num_words * sizeof(long);
 		ERR_IF(want != h->length) goto out;
 		if (want==0) break;
 		if (drbd_recv(mdev, buffer, want) != want)
 			goto out;
-		for(buf_i=0;buf_i<want/sizeof(long);buf_i++) {
-			word = lel_to_cpu(buffer[buf_i]) | bm[bm_i];
-			bits += hweight_long(word);
-			bm[bm_i++] = word;
-		}
+
+		drbd_bm_merge_lel(mdev, bm_i, num_words, buffer);
+		bm_i += num_words;
+
 		if (!drbd_recv_header(mdev,h))
 			goto out;
 		D_ASSERT(h->command == ReportBitMap);
 	}
 
-	bits = bits << (BM_BLOCK_SIZE_B - 9); // in sectors
-
-	mdev->rs_total = bits + bm_end_of_dev_case(mdev->mbds_id);
-
 	if (mdev->cstate == WFBitMapS) {
 		drbd_start_resync(mdev,SyncSource);
 	} else if (mdev->cstate == WFBitMapT) {
-		if (!drbd_send_bitmap(mdev))
-			goto out;
+		ok = drbd_send_bitmap(mdev);
+		if (!ok) goto out;
 		drbd_start_resync(mdev,SyncTarget); // XXX cannot fail ???
 	} else {
 		D_ASSERT(0);
@@ -1377,6 +1398,7 @@
 
 	ok=TRUE;
  out:
+	drbd_bm_unlock(mdev);
 	vfree(buffer);
 	return ok;
 }
@@ -1432,21 +1454,30 @@
 
 STATIC int receive_BecomeSyncTarget(drbd_dev *mdev, Drbd_Header *h)
 {
-	ERR_IF(!mdev->mbds_id)
-		return FALSE;
-	bm_fill_bm(mdev->mbds_id,-1);
-	mdev->rs_total = drbd_get_capacity(mdev->this_bdev);
-	drbd_write_bm(mdev);
+	ERR_IF(!mdev->bitmap) return FALSE;
+
+	/* THINK
+	 * otherwise this does not make much sense, no?
+	 * and some other assertion maybe about cstate...
+	 */
+	ERR_IF(mdev->cstate == Secondary) return FALSE;
+
+	drbd_bm_lock(mdev);
+	drbd_bm_set_all(mdev);
+	drbd_bm_write(mdev);
 	drbd_start_resync(mdev,SyncTarget);
+	drbd_bm_unlock(mdev);
 	return TRUE; // cannot fail ?
 }
 
 STATIC int receive_BecomeSyncSource(drbd_dev *mdev, Drbd_Header *h)
 {
-	bm_fill_bm(mdev->mbds_id,-1);
-	mdev->rs_total = drbd_get_capacity(mdev->this_bdev);
-	drbd_write_bm(mdev);
+	// FIXME asserts ?
+	drbd_bm_lock(mdev);
+	drbd_bm_set_all(mdev);
+	drbd_bm_write(mdev);
 	drbd_start_resync(mdev,SyncSource);
+	drbd_bm_unlock(mdev);
 	return TRUE; // cannot fail ?
 }
 
@@ -1699,8 +1730,9 @@
 
 			drbd_end_req(req, RQ_DRBD_SENT, 1, sector);
 
-			if(mdev->conf.wire_protocol == DRBD_PROT_C && 
-			   mdev->rs_left)
+			/* TODO maybe optimize: don't do the set_in_sync
+			 * if not necessary */
+			if(mdev->conf.wire_protocol == DRBD_PROT_C)
 				drbd_set_in_sync(mdev,sector,blksize);
 		}
 	}
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_req-2.4.c,v
retrieving revision 1.33.2.84
retrieving revision 1.33.2.85
diff -u -3 -r1.33.2.84 -r1.33.2.85
--- drbd_req-2.4.c	1 Jun 2004 14:29:07 -0000	1.33.2.84
+++ drbd_req-2.4.c	15 Jun 2004 10:07:32 -0000	1.33.2.85
@@ -81,7 +81,7 @@
 	uptodate = req->rq_status & 0x0001;
 	if( !uptodate && mdev->on_io_error == Detach) {
 		drbd_set_out_of_sync(mdev,rsector, drbd_req_get_size(req));
-		// It should also be as out of sync on 
+		// It should also be as out of sync on
 		// the other side!  See w_io_error()
 
 		drbd_bio_endio(req->master_bio,1);
@@ -133,6 +133,36 @@
 	return rv;
 }
 
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ *   BUT we are still/already IN SYNC for this area.
+ *   since size may be up to PAGE_SIZE, but BM_BLOCK_SIZE may be smaller
+ *   than PAGE_SIZE, we may need to check several bits.
+ */
+STATIC int drbd_may_do_local_read(drbd_dev *mdev, sector_t sector, int size)
+{
+	unsigned long sbnr,ebnr,bnr;
+	sector_t esector, nr_sectors;
+
+	if (mdev->gen_cnt[Flags] & MDF_Consistent) return 1;
+
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size>>9) -1;
+
+	D_ASSERT(sector  < nr_sectors);
+	D_ASSERT(esector < nr_sectors);
+
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	for (bnr = sbnr; bnr <= ebnr; bnr++) {
+		if (drbd_bm_test_bit(mdev,bnr)) return 0;
+	}
+	return 1;
+}
+
 STATIC int
 drbd_make_request_common(drbd_dev *mdev, int rw, int size,
 			 sector_t sector, drbd_bio_t *bio)
@@ -205,22 +235,27 @@
 	// FIXME special case handling of READA ??
 	if (rw == READ || rw == READA) {
 		if (local) {
-			target_area_out_of_sync =
-				(mdev->cstate == SyncTarget) &&
-				bm_get_bit(mdev->mbds_id,sector,size);
-			if (target_area_out_of_sync) {
+			if (!drbd_may_do_local_read(mdev,sector,size)) {
 				/* whe could kick the syncer to
 				 * sync this extent asap, wait for
 				 * it, then continue locally.
 				 * Or just issue the request remotely.
 				 */
-/* FIXME I think we have a RACE here
- * we request it remotely, then later some write starts ...
- * and finished *before* the answer to the read comes in,
- * because the ACK for the WRITE goes over meta-socket ...
- * I think we need to properly lock reads against the syncer, too.
- */
-
+				/* FIXME
+				 * I think we have a RACE here. We request
+				 * something from the peer, then later some
+				 * write starts ...  and finished *before*
+				 * the answer to the read comes in, because
+				 * the ACK for the WRITE goes over
+				 * meta-socket ...
+				 * Maybe we need to properly lock reads
+				 * against the syncer, too. But if we have
+				 * some user issuing writes on an area that
+				 * he has pending reads on, _he_ is really
+				 * broke anyways, and would get "undefined
+				 * results" on _any_ io stack, even just the
+				 * local io stack.
+				 */
 				local = 0;
 				dec_local(mdev);
 			}