[DRBD-cvs] svn commit by phil - r2526 - in branches/drbd-0.7: drbd testing user - First part of the "freeze-io" feature for drbd-0.7. See

drbd-cvs at lists.linbit.com drbd-cvs at lists.linbit.com
Wed Oct 11 22:01:36 CEST 2006


Author: phil
Date: 2006-10-11 22:01:33 +0200 (Wed, 11 Oct 2006)
New Revision: 2526

Added:
   branches/drbd-0.7/testing/io-latency-test.c
Modified:
   branches/drbd-0.7/drbd/drbd_compat_wrappers.h
   branches/drbd-0.7/drbd/drbd_fs.c
   branches/drbd-0.7/drbd/drbd_int.h
   branches/drbd-0.7/drbd/drbd_main.c
   branches/drbd-0.7/drbd/drbd_proc.c
   branches/drbd-0.7/drbd/drbd_receiver.c
   branches/drbd-0.7/drbd/drbd_req.c
   branches/drbd-0.7/testing/Makefile
   branches/drbd-0.7/user/drbdsetup.c
Log:
First part of the "freeze-io" feature for drbd-0.7.
Seems to work already, but more testing is needed.


Modified: branches/drbd-0.7/drbd/drbd_compat_wrappers.h
===================================================================
--- branches/drbd-0.7/drbd/drbd_compat_wrappers.h	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_compat_wrappers.h	2006-10-11 20:01:33 UTC (rev 2526)
@@ -625,7 +625,7 @@
 
 static inline int _drbd_send_bio(drbd_dev *mdev, struct bio *bio)
 {
-	struct bio_vec *bvec = bio_iovec(bio);
+	struct bio_vec *bvec = bio_iovec_idx(bio,0);
 	struct page *page = bvec->bv_page;
 	size_t size = bvec->bv_len;
 	int offset = bvec->bv_offset;

Modified: branches/drbd-0.7/drbd/drbd_fs.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_fs.c	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_fs.c	2006-10-11 20:01:33 UTC (rev 2526)
@@ -1293,8 +1293,7 @@
 			err=-ENODATA;
 			break;
 		}
-		/* FIXME what if fsync returns error */
-		drbd_sync_me(mdev);
+
 		set_bit(DO_NOT_INC_CONCNT,&mdev->flags);
 		set_cstate(mdev,Unconnected);
 		drbd_thread_stop(&mdev->receiver);

Modified: branches/drbd-0.7/drbd/drbd_int.h
===================================================================
--- branches/drbd-0.7/drbd/drbd_int.h	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_int.h	2006-10-11 20:01:33 UTC (rev 2526)
@@ -690,7 +690,8 @@
 	MD_DIRTY,		// current gen counts and flags not yet on disk
 	SYNC_STARTED,		// Needed to agree on the exact point in time..
 	USE_DEGR_WFC_T,		// Use degr-wfc-timeout instad of wfc-timeout.
-	CRASHED_PRIMARY         // This node was a crashed primary
+	CRASHED_PRIMARY,        // This node was a crashed primary
+	IO_FROZEN               // IO Frozen.
 };
 
 struct drbd_bitmap; // opaque for Drbd_Conf
@@ -832,6 +833,7 @@
 extern void tl_release(drbd_dev *mdev,unsigned int barrier_nr,
 		       unsigned int set_size);
 extern void tl_clear(drbd_dev *mdev);
+extern void tl_resend(drbd_dev *mdev);
 extern int tl_dependence(drbd_dev *mdev, drbd_request_t * item, int free_it);
 extern void drbd_free_sock(drbd_dev *mdev);
 extern int drbd_send(drbd_dev *mdev, struct socket *sock,
@@ -863,6 +865,9 @@
 extern int drbd_io_error(drbd_dev* mdev);
 extern void drbd_mdev_cleanup(drbd_dev *mdev);
 
+extern int drbd_resend_barrier(drbd_dev *mdev,struct drbd_barrier *b);
+extern int drbd_resend_dblock(drbd_dev *mdev, drbd_request_t *req);
+
 // drbd_meta-data.c (still in drbd_main.c)
 extern void drbd_md_write(drbd_dev *mdev);
 extern int drbd_md_read(drbd_dev *mdev);

Modified: branches/drbd-0.7/drbd/drbd_main.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_main.c	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_main.c	2006-10-11 20:01:33 UTC (rev 2526)
@@ -214,7 +214,7 @@
 
 	new_item->barrier = b;
 	new_item->rq_status |= RQ_DRBD_IN_TL;
-	list_add(&new_item->w.list,&b->requests);
+	list_add_tail(&new_item->w.list,&b->requests);
 
 	if( b->n_req++ > mdev->conf.max_epoch_size ) {
 		set_bit(ISSUE_BARRIER,&mdev->flags);
@@ -282,10 +282,45 @@
 	D_ASSERT(b->br_number == barrier_nr);
 	D_ASSERT(b->n_req == set_size);
 
+#if 1
+        if(b->br_number != barrier_nr) {
+                DUMPI(b->br_number);
+                DUMPI(barrier_nr);
+        }
+        if(b->n_req != set_size) {
+                DUMPI(b->n_req);
+                DUMPI(set_size);
+        }
+#endif
+
 	list_del(&b->requests);
 	kfree(b);
 }
 
+/* Resend the transfer log: each barrier's requests, then that barrier itself
+ * (except the newest). IO must be frozen, so nobody modifies the log now. */
+void tl_resend(drbd_dev *mdev)
+{
+	struct drbd_request *req;
+	struct list_head *le;
+	struct drbd_barrier *b;
+
+	D_ASSERT(test_bit(IO_FROZEN,&mdev->flags));
+
+	b = mdev->oldest_barrier; // walk from the oldest barrier to the newest
+	while(1) {
+		list_for_each(le, &b->requests) {
+			req = list_entry(le, struct drbd_request,w.list);
+			drbd_resend_dblock(mdev,req); // NOTE(review): result is ignored
+		}
+		if( b == mdev->newest_barrier ) break; // no Barrier packet for the newest
+		drbd_resend_barrier(mdev,b); // NOTE(review): result is ignored
+		b = b->next;
+	}
+	D_ASSERT(test_bit(IO_FROZEN,&mdev->flags));
+}
+
+
 void tl_clear(drbd_dev *mdev)
 {
 	struct list_head *le,*tle;
@@ -455,6 +490,19 @@
 // FIXME EXPLAIN
 		clear_bit(MD_IO_ALLOWED,&mdev->flags);
 	}
+
+	if(mdev->conf.on_disconnect == FreezeIO && mdev->state == Primary) {
+		if(os >= Connected && ns < Connected) {
+			set_bit(IO_FROZEN, &mdev->flags);
+		}
+	}
+
+	if( ns <= StandAlone && test_bit(IO_FROZEN, &mdev->flags)) {
+		WARN("Going to thaw IO, setting out of sync %d requests.\n",
+		     atomic_read(&mdev->ap_pending_cnt));
+		tl_clear(mdev);
+		clear_bit(IO_FROZEN, &mdev->flags);
+	}
 }
 
 STATIC int drbd_thread_setup(void* arg)
@@ -803,6 +851,17 @@
 	return ok;
 }
 
+int drbd_resend_barrier(drbd_dev *mdev,struct drbd_barrier *b)
+{
+	int ok;
+	Drbd_Barrier_Packet p;
+	/* Send a Barrier packet carrying b's existing number on the data socket. */
+	p.barrier=b->br_number;
+	ok=drbd_send_cmd(mdev,USE_DATA_SOCKET, Barrier,(Drbd_Header*)&p,sizeof(p));
+
+	return ok;
+}
+
 int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,u32 set_size)
 {
 	int ok;
@@ -1098,6 +1157,39 @@
 	return ok;
 }
 
+int drbd_resend_dblock(drbd_dev *mdev, drbd_request_t *req)
+{
+	int ok;
+	Drbd_Data_Packet p;
+	/* Send a Data packet for req again: the header, then req->master_bio. */
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(Data);
+	p.head.length  = cpu_to_be16(sizeof(p)-sizeof(Drbd_Header)+req->size);
+
+	p.sector   = cpu_to_be64(req->sector);
+	p.block_id = (unsigned long)req; // the request's address is the block id
+
+	down(&mdev->data.mutex); // held until both header and payload are sent
+	/* Register the current task as send_task for the duration of the send. */
+	spin_lock(&mdev->send_task_lock);
+	mdev->send_task=current;
+	spin_unlock(&mdev->send_task_lock);
+
+	dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__);
+	ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE);
+	if(ok) { // payload only follows a completely sent header
+		ok = _drbd_send_bio(mdev,req->master_bio);
+	}
+
+	spin_lock(&mdev->send_task_lock);
+	mdev->send_task=NULL;
+	spin_unlock(&mdev->send_task_lock);
+
+	up(&mdev->data.mutex);
+	return ok; // 0 if the header was cut short, else _drbd_send_bio()'s result
+}
+
+
 int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
 		    struct Tl_epoch_entry *e)
 {

Modified: branches/drbd-0.7/drbd/drbd_proc.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_proc.c	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_proc.c	2006-10-11 20:01:33 UTC (rev 2526)
@@ -254,7 +254,7 @@
 			seq_printf( seq, "%2d: cs:Unconfigured\n", i);
 		else
 			seq_printf( seq,
-			   "%2d: cs:%s st:%s/%s ld:%s\n"
+			   "%2d: cs:%s st:%s/%s ld:%s %c\n"
 			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
 			   "lo:%d pe:%d ua:%d ap:%d\n",
 			   i, sn,
@@ -262,7 +262,7 @@
 			   nodestate_to_name(drbd_conf[i].o_state),
 			   (drbd_conf[i].gen_cnt[Flags]
 			    & MDF_Consistent) ? "Consistent" : "Inconsistent",
-			// FIXME partner consistent?
+			   test_bit(IO_FROZEN, &drbd_conf[i].flags)? 'F' : ' ',
 			   drbd_conf[i].send_cnt/2,
 			   drbd_conf[i].recv_cnt/2,
 			   drbd_conf[i].writ_cnt/2,

Modified: branches/drbd-0.7/drbd/drbd_receiver.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_receiver.c	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_receiver.c	2006-10-11 20:01:33 UTC (rev 2526)
@@ -1583,6 +1583,16 @@
 
 	if (mdev->cstate == WFReportParams) {
 		INFO("Connection established.\n");
+
+		if(test_bit(IO_FROZEN, &mdev->flags)) {
+			WARN("Going to thaw IO, resuming %d requests.\n",
+			     atomic_read(&mdev->ap_pending_cnt));
+			tl_resend(mdev);
+			if (mdev->cstate == WFReportParams) {
+				clear_bit(IO_FROZEN, &mdev->flags);
+				consider_sync = 0; 
+			} else return FALSE;
+		}
 	}
 
 	if (consider_sync) {
@@ -1869,12 +1879,21 @@
 	drbd_wait_ee(mdev,&mdev->sync_ee);
 	drbd_clear_done_ee(mdev);
 
-	// primary
-	tl_clear(mdev);
-	clear_bit(ISSUE_BARRIER,&mdev->flags);
-	wait_event( mdev->cstate_wait, atomic_read(&mdev->ap_pending_cnt)==0 );
-	D_ASSERT(mdev->oldest_barrier->n_req == 0);
+	if(test_bit(IO_FROZEN, &mdev->flags)) {
+		WARN("IO frozen with ap_pending_cnt = %d\n",
+		     atomic_read(&mdev->ap_pending_cnt));
+	} else {
+		tl_clear(mdev);
+		clear_bit(ISSUE_BARRIER,&mdev->flags);
+		wait_event( mdev->cstate_wait, atomic_read(&mdev->ap_pending_cnt)==0 );
+		D_ASSERT(mdev->oldest_barrier->n_req == 0);
 
+		if(atomic_read(&mdev->ap_pending_cnt)) {
+			ERR("ap_pending_cnt = %d\n",atomic_read(&mdev->ap_pending_cnt));
+			atomic_set(&mdev->ap_pending_cnt,0);
+		}
+	}
+
 	// both
 	clear_bit(PARTNER_CONSISTENT, &mdev->flags);
 	clear_bit(PARTNER_DISKLESS,&mdev->flags);
@@ -1905,11 +1924,6 @@
 	   on the fly. */
 	atomic_set(&mdev->rs_pending_cnt,0);
 
-	if(atomic_read(&mdev->ap_pending_cnt)) {
-		ERR("ap_pending_cnt = %d\n",atomic_read(&mdev->ap_pending_cnt));
-		atomic_set(&mdev->ap_pending_cnt,0);
-	}
-
 	wake_up(&mdev->cstate_wait);
 
 	if ( mdev->state == Primary &&

Modified: branches/drbd-0.7/drbd/drbd_req.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_req.c	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_req.c	2006-10-11 20:01:33 UTC (rev 2526)
@@ -243,8 +243,9 @@
 	// down_read(mdev->device_lock);
 
 	wait_event( mdev->cstate_wait,
-		    (volatile int)(mdev->cstate < WFBitMapS || 
-				   mdev->cstate > WFBitMapT) );
+		    ((volatile int)mdev->cstate < WFBitMapS || 
+		     (volatile int) mdev->cstate > WFBitMapT) &&
+		    !(rw == WRITE && test_bit(IO_FROZEN, &mdev->flags)));
 
 	local = inc_local(mdev);
 	NOT_IN_26( if (rw == READA) rw=READ );

Modified: branches/drbd-0.7/testing/Makefile
===================================================================
--- branches/drbd-0.7/testing/Makefile	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/testing/Makefile	2006-10-11 20:01:33 UTC (rev 2526)
@@ -1,5 +1,5 @@
-PROGRAMS=show_size  access_and_verify ioctl_structs_sizes
-CFLAGS=-Wall -I../drbd
+PROGRAMS=show_size  access_and_verify ioctl_structs_sizes io-latency-test
+CFLAGS=-pthread -Wall -I../drbd
 
 all: $(PROGRAMS)
 

Added: branches/drbd-0.7/testing/io-latency-test.c
===================================================================
--- branches/drbd-0.7/testing/io-latency-test.c	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/testing/io-latency-test.c	2006-10-11 20:01:33 UTC (rev 2526)
@@ -0,0 +1,176 @@
+/*
+   io-latency-test.c
+
+   By Philipp Reisner.
+
+   Copyright (C) 2006, Philipp Reisner <philipp.reisner at linbit.com>.
+        Initial author.
+
+   io-latency-test is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   io-latency-test is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with io-latency-test; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+/* In case this crashes (in your UML)
+   touch /etc/ld.so.nohwcap
+ */
+
+// compile with gcc -pthread -o io-latency-test io-latency-test.c
+
+#include <sys/poll.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+#include <stdio.h>
+
+#define MONITOR_TIME 300000
+// Check every 300 milliseconds. (3.33 times per second)
+
+#define RECORD_TIME 20000
+// Try to write a record every 20 milliseconds (50 per second)
+
+struct shared_data {
+	pthread_mutex_t mutex;               // taken around every access to the fields below
+	unsigned long record_nr;             // number of the last completely written record
+	unsigned int write_duration_us;      // summed write durations; zeroed by the watchdog
+	unsigned int write_duration_records; // writes counted in the sum; zeroed by the watchdog
+};
+
+/* Watchdog thread. arg points to the struct shared_data filled in by main().
+ * Every MONITOR_TIME microseconds it takes a snapshot of the writer's
+ * progress, resets the latency statistics and reports on stdout whether IO
+ * stalled, resumed, or what the average write duration was. Never returns. */
+void* wd_thread(void *arg)
+{
+	struct shared_data *data = (struct shared_data*) arg;
+	unsigned long last_record_nr=-1, current_record_nr;
+	unsigned int avg_write,wd,wr;
+	enum { IO_RUNNING, IO_BLOCKED } io_state = IO_RUNNING;
+
+	while(1) {
+		usleep(MONITOR_TIME); // sleep some milliseconds
+
+		pthread_mutex_lock(&data->mutex);
+		current_record_nr = data->record_nr;
+		wd = data->write_duration_us;
+		wr = data->write_duration_records;
+		data->write_duration_us = 0;
+		data->write_duration_records = 0;
+		pthread_mutex_unlock(&data->mutex);
+
+		if(io_state == IO_RUNNING) {
+			if(current_record_nr == last_record_nr) {
+				printf("IO got frozen. Last completely "
+				       "written record: %lu\n",
+				       last_record_nr);
+				io_state = IO_BLOCKED;
+			} else {
+				avg_write = wr ? wd/wr : 0; // never divide by zero
+				printf("Current record: %lu "
+				       "( AVG write duration %d.%02dms )\r",
+				       current_record_nr,
+				       avg_write/1000,(avg_write%1000)/10);
+				fflush(stdout);
+			}
+		} else if(current_record_nr != last_record_nr) {
+			printf("IO just resumed.\n");
+			io_state = IO_RUNNING;
+		}
+		last_record_nr = current_record_nr; // in every state, or a stall right after a resume is missed
+	}
+}
+
+/* Writes a timestamped record to the file named by argv[1] about every
+ * RECORD_TIME us, syncs it to disk and hands the timing to the watchdog. */
+int main(int argc, char** argv)
+{
+	pthread_t watch_dog;
+	unsigned long record_nr=0;
+	FILE* record_f;
+
+	struct timeval now_tv, then_tv;
+	struct tm now_tm;
+	int write_duration_us=0;
+	struct shared_data data;
+
+	if(argc != 2) {
+		fprintf(stderr,"USAGE: %s recordfile\n",argv[0]);
+		return 10;
+	}
+
+	if(!(record_f = fopen(argv[1],"w"))) {
+		perror("fopen"); // perror() appends ": <error text>" itself
+		fprintf(stderr,"Failed to open '%s' for writing\n",argv[1]);
+		return 10;
+	}
+
+	printf("\n"
+	       "This programm writes records to a file, shows the write latency\n"
+	       "of the file system and block device combination and informs\n"
+	       "you in case IO completely stalls.\n\n"
+	       "  Due to the nature of the 'D' process state on Linux\n"
+	       "  (and other Unix operating systems) you can not kill this\n"
+	       "  test programm while IO is frozen. You have to kill it with\n"
+	       "  Ctrl-C (SIGINT) while IO is running.\n\n"
+	       "In case the record file's block device freezes, this program will\n"
+	       "inform you here which record was completely written before it "
+	       "freezed.\n\n");
+
+	pthread_mutex_init(&data.mutex,NULL);
+	data.record_nr = record_nr; // defined start value; the watchdog reads it
+	data.write_duration_us = 0;
+	data.write_duration_records = 1;
+
+	pthread_create(&watch_dog,NULL,wd_thread,&data);
+
+	for(;;record_nr++) {
+		gettimeofday(&now_tv, NULL);
+		localtime_r(&now_tv.tv_sec,&now_tm);
+
+		fprintf(record_f,
+			"%04d-%02d-%02d %02d:%02d:%02d.%06ld: "
+			"Record number: %-6lu "
+			"(L.r.w.t.: %d.%02dms)\n",
+			1900+ now_tm.tm_year,
+			1+ now_tm.tm_mon,
+			now_tm.tm_mday,
+			now_tm.tm_hour,
+			now_tm.tm_min,
+			now_tm.tm_sec,
+			now_tv.tv_usec,
+			record_nr,
+			write_duration_us/1000,
+			(write_duration_us%1000)/10);
+		
+		fflush(record_f); // flush it from glibc to the kernel.
+		fdatasync(fileno(record_f)); // from buffer cache to disk.
+
+		// eventually wait for full RECORD_TIME
+		gettimeofday(&then_tv, NULL);
+		write_duration_us =
+			( (then_tv.tv_sec  - now_tv.tv_sec ) * 1000000 +
+			  (then_tv.tv_usec - now_tv.tv_usec) );
+
+		pthread_mutex_lock(&data.mutex);
+		data.record_nr = record_nr;
+		data.write_duration_us += write_duration_us;
+		data.write_duration_records++;
+		pthread_mutex_unlock(&data.mutex);
+
+		if(write_duration_us < RECORD_TIME ) {
+			usleep(RECORD_TIME - write_duration_us);
+		}
+	}
+}

Modified: branches/drbd-0.7/user/drbdsetup.c
===================================================================
--- branches/drbd-0.7/user/drbdsetup.c	2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/user/drbdsetup.c	2006-10-11 20:01:33 UTC (rev 2526)
@@ -188,7 +188,7 @@
 const char *dh_names[] = {
   [Reconnect]   = "reconnect",
   [DropNetConf] = "stand_alone",
-  // [FreezeIO]    = "freeze_io" // TODO on the kernel side...
+  [FreezeIO]    = "freeze_io"
 };
 
 unsigned long resolv(const char* name)



More information about the drbd-cvs mailing list