[DRBD-cvs] svn commit by phil - r2526 - in branches/drbd-0.7: drbd
testing user - First part of the "freeze-io" feature for
drbd-0.7. See
drbd-cvs at lists.linbit.com
drbd-cvs at lists.linbit.com
Wed Oct 11 22:01:36 CEST 2006
Author: phil
Date: 2006-10-11 22:01:33 +0200 (Wed, 11 Oct 2006)
New Revision: 2526
Added:
branches/drbd-0.7/testing/io-latency-test.c
Modified:
branches/drbd-0.7/drbd/drbd_compat_wrappers.h
branches/drbd-0.7/drbd/drbd_fs.c
branches/drbd-0.7/drbd/drbd_int.h
branches/drbd-0.7/drbd/drbd_main.c
branches/drbd-0.7/drbd/drbd_proc.c
branches/drbd-0.7/drbd/drbd_receiver.c
branches/drbd-0.7/drbd/drbd_req.c
branches/drbd-0.7/testing/Makefile
branches/drbd-0.7/user/drbdsetup.c
Log:
First part of the "freeze-io" feature for drbd-0.7.
Seems to work already, but more testing is needed.
Modified: branches/drbd-0.7/drbd/drbd_compat_wrappers.h
===================================================================
--- branches/drbd-0.7/drbd/drbd_compat_wrappers.h 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_compat_wrappers.h 2006-10-11 20:01:33 UTC (rev 2526)
@@ -625,7 +625,7 @@
static inline int _drbd_send_bio(drbd_dev *mdev, struct bio *bio)
{
- struct bio_vec *bvec = bio_iovec(bio);
+ struct bio_vec *bvec = bio_iovec_idx(bio,0);
struct page *page = bvec->bv_page;
size_t size = bvec->bv_len;
int offset = bvec->bv_offset;
Modified: branches/drbd-0.7/drbd/drbd_fs.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_fs.c 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_fs.c 2006-10-11 20:01:33 UTC (rev 2526)
@@ -1293,8 +1293,7 @@
err=-ENODATA;
break;
}
- /* FIXME what if fsync returns error */
- drbd_sync_me(mdev);
+
set_bit(DO_NOT_INC_CONCNT,&mdev->flags);
set_cstate(mdev,Unconnected);
drbd_thread_stop(&mdev->receiver);
Modified: branches/drbd-0.7/drbd/drbd_int.h
===================================================================
--- branches/drbd-0.7/drbd/drbd_int.h 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_int.h 2006-10-11 20:01:33 UTC (rev 2526)
@@ -690,7 +690,8 @@
MD_DIRTY, // current gen counts and flags not yet on disk
SYNC_STARTED, // Needed to agree on the exact point in time..
USE_DEGR_WFC_T, // Use degr-wfc-timeout instad of wfc-timeout.
- CRASHED_PRIMARY // This node was a crashed primary
+ CRASHED_PRIMARY, // This node was a crashed primary
+ IO_FROZEN // IO Frozen.
};
struct drbd_bitmap; // opaque for Drbd_Conf
@@ -832,6 +833,7 @@
extern void tl_release(drbd_dev *mdev,unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(drbd_dev *mdev);
+extern void tl_resend(drbd_dev *mdev);
extern int tl_dependence(drbd_dev *mdev, drbd_request_t * item, int free_it);
extern void drbd_free_sock(drbd_dev *mdev);
extern int drbd_send(drbd_dev *mdev, struct socket *sock,
@@ -863,6 +865,9 @@
extern int drbd_io_error(drbd_dev* mdev);
extern void drbd_mdev_cleanup(drbd_dev *mdev);
+extern int drbd_resend_barrier(drbd_dev *mdev,struct drbd_barrier *b);
+extern int drbd_resend_dblock(drbd_dev *mdev, drbd_request_t *req);
+
// drbd_meta-data.c (still in drbd_main.c)
extern void drbd_md_write(drbd_dev *mdev);
extern int drbd_md_read(drbd_dev *mdev);
Modified: branches/drbd-0.7/drbd/drbd_main.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_main.c 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_main.c 2006-10-11 20:01:33 UTC (rev 2526)
@@ -214,7 +214,7 @@
new_item->barrier = b;
new_item->rq_status |= RQ_DRBD_IN_TL;
- list_add(&new_item->w.list,&b->requests);
+ list_add_tail(&new_item->w.list,&b->requests);
if( b->n_req++ > mdev->conf.max_epoch_size ) {
set_bit(ISSUE_BARRIER,&mdev->flags);
@@ -282,10 +282,45 @@
D_ASSERT(b->br_number == barrier_nr);
D_ASSERT(b->n_req == set_size);
+#if 1
+ if(b->br_number != barrier_nr) {
+ DUMPI(b->br_number);
+ DUMPI(barrier_nr);
+ }
+ if(b->n_req != set_size) {
+ DUMPI(b->n_req);
+ DUMPI(set_size);
+ }
+#endif
+
list_del(&b->requests);
kfree(b);
}
+/* Re-send the contents of the transfer log (TL) to the peer.
+ * Starting at mdev->oldest_barrier, every request queued on a barrier's
+ * request list is sent again with drbd_resend_dblock(), followed by the
+ * barrier itself with drbd_resend_barrier(). The walk stops at
+ * mdev->newest_barrier, whose requests are re-sent but whose barrier is not.
+ * Since IO is frozen, nobody may modify the transfer log now...
+ * The return values of the resend helpers are not checked here.
+ */
+void tl_resend(drbd_dev *mdev)
+{
+ struct drbd_request *req;
+ struct list_head *le;
+ struct drbd_barrier *b;
+
+ D_ASSERT(test_bit(IO_FROZEN,&mdev->flags));
+
+ b = mdev->oldest_barrier;
+ while(1) {
+ list_for_each(le, &b->requests) { // requests in list order
+ req = list_entry(le, struct drbd_request,w.list);
+ drbd_resend_dblock(mdev,req);
+ }
+ if( b == mdev->newest_barrier ) break; // no barrier packet for the newest one
+ drbd_resend_barrier(mdev,b);
+ b = b->next;
+ }
+ D_ASSERT(test_bit(IO_FROZEN,&mdev->flags)); // must still be frozen when done
+}
+
+
void tl_clear(drbd_dev *mdev)
{
struct list_head *le,*tle;
@@ -455,6 +490,19 @@
// FIXME EXPLAIN
clear_bit(MD_IO_ALLOWED,&mdev->flags);
}
+
+ if(mdev->conf.on_disconnect == FreezeIO && mdev->state == Primary) {
+ if(os >= Connected && ns < Connected) {
+ set_bit(IO_FROZEN, &mdev->flags);
+ }
+ }
+
+ if( ns <= StandAlone && test_bit(IO_FROZEN, &mdev->flags)) {
+ WARN("Going to thaw IO, setting out of sync %d requests.\n",
+ atomic_read(&mdev->ap_pending_cnt));
+ tl_clear(mdev);
+ clear_bit(IO_FROZEN, &mdev->flags);
+ }
}
STATIC int drbd_thread_setup(void* arg)
@@ -803,6 +851,17 @@
return ok;
}
+/* Send a Barrier packet carrying the number of an already existing
+ * barrier (b->br_number) over the data socket.
+ * Returns the result of drbd_send_cmd().
+ */
+int drbd_resend_barrier(drbd_dev *mdev,struct drbd_barrier *b)
+{
+	Drbd_Barrier_Packet packet;
+
+	packet.barrier = b->br_number;
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, Barrier,
+			     (Drbd_Header*)&packet, sizeof(packet));
+}
+
int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,u32 set_size)
{
int ok;
@@ -1098,6 +1157,39 @@
return ok;
}
+/* Send the data of one request to the peer again.
+ * Builds a Data packet header from req (sector, size, and the request's
+ * address as block_id), then, while holding mdev->data.mutex, sends the
+ * header followed by the payload of req->master_bio.
+ * Returns 0 if the header could not be sent completely, otherwise the
+ * result of _drbd_send_bio().
+ */
+int drbd_resend_dblock(drbd_dev *mdev, drbd_request_t *req)
+{
+ int ok;
+ Drbd_Data_Packet p;
+
+ p.head.magic = BE_DRBD_MAGIC;
+ p.head.command = cpu_to_be16(Data);
+ p.head.length = cpu_to_be16(sizeof(p)-sizeof(Drbd_Header)+req->size); // payload length: rest of p plus the data
+
+ p.sector = cpu_to_be64(req->sector);
+ p.block_id = (unsigned long)req; // the request's address identifies the block
+
+ down(&mdev->data.mutex); // held across header and payload
+
+ spin_lock(&mdev->send_task_lock);
+ mdev->send_task=current; // record this task as the sender while sending
+ spin_unlock(&mdev->send_task_lock);
+
+ dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__);
+ ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); // MSG_MORE: payload follows
+ if(ok) {
+ ok = _drbd_send_bio(mdev,req->master_bio);
+ }
+
+ spin_lock(&mdev->send_task_lock);
+ mdev->send_task=NULL;
+ spin_unlock(&mdev->send_task_lock);
+
+ up(&mdev->data.mutex);
+ return ok;
+}
+
+
int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd,
struct Tl_epoch_entry *e)
{
Modified: branches/drbd-0.7/drbd/drbd_proc.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_proc.c 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_proc.c 2006-10-11 20:01:33 UTC (rev 2526)
@@ -254,7 +254,7 @@
seq_printf( seq, "%2d: cs:Unconfigured\n", i);
else
seq_printf( seq,
- "%2d: cs:%s st:%s/%s ld:%s\n"
+ "%2d: cs:%s st:%s/%s ld:%s %c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d\n",
i, sn,
@@ -262,7 +262,7 @@
nodestate_to_name(drbd_conf[i].o_state),
(drbd_conf[i].gen_cnt[Flags]
& MDF_Consistent) ? "Consistent" : "Inconsistent",
- // FIXME partner consistent?
+ test_bit(IO_FROZEN, &drbd_conf[i].flags)? 'F' : ' ',
drbd_conf[i].send_cnt/2,
drbd_conf[i].recv_cnt/2,
drbd_conf[i].writ_cnt/2,
Modified: branches/drbd-0.7/drbd/drbd_receiver.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_receiver.c 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_receiver.c 2006-10-11 20:01:33 UTC (rev 2526)
@@ -1583,6 +1583,16 @@
if (mdev->cstate == WFReportParams) {
INFO("Connection established.\n");
+
+ if(test_bit(IO_FROZEN, &mdev->flags)) {
+ WARN("Going to thaw IO, resuming %d requests.\n",
+ atomic_read(&mdev->ap_pending_cnt));
+ tl_resend(mdev);
+ if (mdev->cstate == WFReportParams) {
+ clear_bit(IO_FROZEN, &mdev->flags);
+ consider_sync = 0;
+ } else return FALSE;
+ }
}
if (consider_sync) {
@@ -1869,12 +1879,21 @@
drbd_wait_ee(mdev,&mdev->sync_ee);
drbd_clear_done_ee(mdev);
- // primary
- tl_clear(mdev);
- clear_bit(ISSUE_BARRIER,&mdev->flags);
- wait_event( mdev->cstate_wait, atomic_read(&mdev->ap_pending_cnt)==0 );
- D_ASSERT(mdev->oldest_barrier->n_req == 0);
+ if(test_bit(IO_FROZEN, &mdev->flags)) {
+ WARN("IO frozen with ap_pending_cnt = %d\n",
+ atomic_read(&mdev->ap_pending_cnt));
+ } else {
+ tl_clear(mdev);
+ clear_bit(ISSUE_BARRIER,&mdev->flags);
+ wait_event( mdev->cstate_wait, atomic_read(&mdev->ap_pending_cnt)==0 );
+ D_ASSERT(mdev->oldest_barrier->n_req == 0);
+ if(atomic_read(&mdev->ap_pending_cnt)) {
+ ERR("ap_pending_cnt = %d\n",atomic_read(&mdev->ap_pending_cnt));
+ atomic_set(&mdev->ap_pending_cnt,0);
+ }
+ }
+
// both
clear_bit(PARTNER_CONSISTENT, &mdev->flags);
clear_bit(PARTNER_DISKLESS,&mdev->flags);
@@ -1905,11 +1924,6 @@
on the fly. */
atomic_set(&mdev->rs_pending_cnt,0);
- if(atomic_read(&mdev->ap_pending_cnt)) {
- ERR("ap_pending_cnt = %d\n",atomic_read(&mdev->ap_pending_cnt));
- atomic_set(&mdev->ap_pending_cnt,0);
- }
-
wake_up(&mdev->cstate_wait);
if ( mdev->state == Primary &&
Modified: branches/drbd-0.7/drbd/drbd_req.c
===================================================================
--- branches/drbd-0.7/drbd/drbd_req.c 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/drbd/drbd_req.c 2006-10-11 20:01:33 UTC (rev 2526)
@@ -243,8 +243,9 @@
// down_read(mdev->device_lock);
wait_event( mdev->cstate_wait,
- (volatile int)(mdev->cstate < WFBitMapS ||
- mdev->cstate > WFBitMapT) );
+ ((volatile int)mdev->cstate < WFBitMapS ||
+ (volatile int) mdev->cstate > WFBitMapT) &&
+ !(rw == WRITE && test_bit(IO_FROZEN, &mdev->flags)));
local = inc_local(mdev);
NOT_IN_26( if (rw == READA) rw=READ );
Modified: branches/drbd-0.7/testing/Makefile
===================================================================
--- branches/drbd-0.7/testing/Makefile 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/testing/Makefile 2006-10-11 20:01:33 UTC (rev 2526)
@@ -1,5 +1,5 @@
-PROGRAMS=show_size access_and_verify ioctl_structs_sizes
-CFLAGS=-Wall -I../drbd
+PROGRAMS=show_size access_and_verify ioctl_structs_sizes io-latency-test
+CFLAGS=-pthread -Wall -I../drbd
all: $(PROGRAMS)
Added: branches/drbd-0.7/testing/io-latency-test.c
===================================================================
--- branches/drbd-0.7/testing/io-latency-test.c 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/testing/io-latency-test.c 2006-10-11 20:01:33 UTC (rev 2526)
@@ -0,0 +1,176 @@
+/*
+ io-latency-test.c
+
+ By Philipp Reisner.
+
+ Copyright (C) 2006, Philipp Reisner <philipp.reisner at linbit.com>.
+ Initial author.
+
+ io-latency-test is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ io-latency-test is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with io-latency-test; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+/* In case this crashes (in your UML)
+ touch /etc/ld.so.nohwcap
+ */
+
+// compile with gcc -pthread -o io-latency-test io-latency-test.c
+
+#include <sys/poll.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+#include <stdio.h>
+
+#define MONITOR_TIME 300000
+// Check every 300 milliseconds. (3.33 times per second)
+
+#define RECORD_TIME 20000
+// Try to write a record every 20 milliseconds (50 per second)
+
+/* Progress information handed from the record writer in main() to the
+ * watchdog thread. Once the watchdog runs, both sides take mutex before
+ * touching the other fields.
+ */
+struct shared_data {
+ pthread_mutex_t mutex;
+ unsigned long record_nr; // number of the record main() finished last
+ unsigned int write_duration_us; // write durations summed up by main(); the watchdog zeroes it at every check
+ unsigned int write_duration_records; // how many durations were summed up; zeroed together with the sum
+};
+
+/*
+ * Watchdog thread.
+ *
+ * Every MONITOR_TIME microseconds it takes a snapshot of the writer's
+ * progress (and resets the duration accumulators), then reports on stdout:
+ *  - while IO is running: the current record number and the average write
+ *    duration of the last interval,
+ *  - once no new record showed up during an interval: that IO got frozen,
+ *  - once records show up again: that IO resumed.
+ *
+ * arg points to the struct shared_data shared with main().
+ * This function never returns.
+ */
+void* wd_thread(void *arg)
+{
+	struct shared_data *data = (struct shared_data*) arg;
+	/* -1 converts to ULONG_MAX and serves as "nothing seen yet" */
+	unsigned long last_record_nr=-1, current_record_nr;
+	unsigned int avg_write,wd,wr;
+	enum { IO_RUNNING, IO_BLOCKED } io_state = IO_RUNNING;
+
+	while(1) {
+		usleep(MONITOR_TIME); // sleep some milliseconds
+
+		/* Snapshot and reset under the lock, report without it. */
+		pthread_mutex_lock(&data->mutex);
+		current_record_nr = data->record_nr;
+		wd = data->write_duration_us;
+		wr = data->write_duration_records;
+		data->write_duration_us = 0;
+		data->write_duration_records = 0;
+		pthread_mutex_unlock(&data->mutex);
+
+		switch(io_state) {
+		case IO_RUNNING:
+			if(current_record_nr == last_record_nr) {
+				printf("IO got frozen. Last completely "
+				       "written record: %lu\n",
+				       last_record_nr);
+				io_state = IO_BLOCKED;
+			} else {
+				/* wr should be >= 1 whenever the record
+				   number moved; never divide by zero. */
+				avg_write = wr ? wd/wr : 0;
+
+				printf("Current record: %lu "
+				       "( AVG write duration %u.%02ums )\r",
+				       current_record_nr,
+				       avg_write/1000,(avg_write%1000)/10);
+				fflush(stdout);
+			}
+			last_record_nr = current_record_nr;
+			break;
+		case IO_BLOCKED:
+			if(current_record_nr != last_record_nr) {
+				printf("IO just resumed.\n");
+				io_state = IO_RUNNING;
+				/* Remember the progress seen at resume time.
+				   Otherwise a second freeze within the next
+				   interval is compared against the stale
+				   pre-freeze number and goes unnoticed. */
+				last_record_nr = current_record_nr;
+			}
+			break;
+		}
+	}
+}
+
+/*
+ * Record writer.
+ *
+ * Opens the file named by argv[1] for writing, starts the watchdog thread
+ * and then writes one timestamped record per RECORD_TIME microseconds (or
+ * as fast as the storage allows when a write takes longer). Every record
+ * is flushed and fdatasync()ed before its duration is measured and handed
+ * to the watchdog through struct shared_data.
+ *
+ * Returns 10 on a usage error or when setup fails; otherwise it runs
+ * until the process is killed.
+ */
+int main(int argc, char** argv)
+{
+	pthread_t watch_dog;
+	unsigned long record_nr=0;
+	FILE* record_f;
+	int rc;
+
+	struct timeval now_tv, then_tv;
+	struct tm now_tm;
+	int write_duration_us=0;
+
+	struct shared_data data;
+
+	if(argc != 2) {
+		fprintf(stderr,"USAGE: %s recordfile\n",argv[0]);
+		return 10;
+	}
+
+	if(!(record_f = fopen(argv[1],"w"))) {
+		// perror() appends ": <error text>" on its own
+		perror("fopen");
+		fprintf(stderr,"Failed to open '%s' for writing\n",argv[1]);
+		return 10;
+	}
+
+	printf("\n"
+	       "This programm writes records to a file, shows the write latency\n"
+	       "of the file system and block device combination and informs\n"
+	       "you in case IO completely stalls.\n\n"
+	       " Due to the nature of the 'D' process state on Linux\n"
+	       " (and other Unix operating systems) you can not kill this\n"
+	       " test programm while IO is frozen. You have to kill it with\n"
+	       " Ctrl-C (SIGINT) while IO is running.\n\n"
+	       "In case the record file's block device freezes, this "
+	       "program will\n"
+	       "inform you here which record was completely written before it "
+	       "freezed.\n\n"
+	       );
+
+	pthread_mutex_init(&data.mutex,NULL);
+	// The watchdog reads record_nr, it must not start out indeterminate.
+	data.record_nr = 0;
+	data.write_duration_us = 0;
+	data.write_duration_records = 1;
+
+	rc = pthread_create(&watch_dog,NULL,wd_thread,&data);
+	if(rc) {
+		// Without the watchdog a stall would never be reported.
+		fprintf(stderr,"Failed to start the watchdog thread "
+			"(error %d)\n",rc);
+		return 10;
+	}
+
+	for(;;record_nr++) {
+		gettimeofday(&now_tv, NULL);
+		localtime_r(&now_tv.tv_sec,&now_tm);
+
+		// L.r.w.t. is the duration measured for the previous record.
+		fprintf(record_f,
+			"%04d-%02d-%02d %02d:%02d:%02d.%06ld: "
+			"Record number: %-6lu "
+			"(L.r.w.t.: %d.%02dms)\n",
+			1900+ now_tm.tm_year,
+			1+ now_tm.tm_mon,
+			now_tm.tm_mday,
+			now_tm.tm_hour,
+			now_tm.tm_min,
+			now_tm.tm_sec,
+			(long)now_tv.tv_usec, // match %ld whatever suseconds_t is
+			record_nr,
+			write_duration_us/1000,
+			(write_duration_us%1000)/10);
+
+		fflush(record_f); // flush it from glibc to the kernel.
+		fdatasync(fileno(record_f)); // from buffer cache to disk.
+
+		// eventually wait for full RECORD_TIME
+		gettimeofday(&then_tv, NULL);
+		write_duration_us =
+			( (then_tv.tv_sec - now_tv.tv_sec ) * 1000000 +
+			  (then_tv.tv_usec - now_tv.tv_usec) );
+
+		// Publish progress and duration to the watchdog.
+		pthread_mutex_lock(&data.mutex);
+		data.record_nr = record_nr;
+		data.write_duration_us += write_duration_us;
+		data.write_duration_records++;
+		pthread_mutex_unlock(&data.mutex);
+
+		if(write_duration_us < RECORD_TIME ) {
+			usleep(RECORD_TIME - write_duration_us);
+		}
+	}
+}
Modified: branches/drbd-0.7/user/drbdsetup.c
===================================================================
--- branches/drbd-0.7/user/drbdsetup.c 2006-10-11 14:23:39 UTC (rev 2525)
+++ branches/drbd-0.7/user/drbdsetup.c 2006-10-11 20:01:33 UTC (rev 2526)
@@ -188,7 +188,7 @@
const char *dh_names[] = {
[Reconnect] = "reconnect",
[DropNetConf] = "stand_alone",
- // [FreezeIO] = "freeze_io" // TODO on the kernel side...
+ [FreezeIO] = "freeze_io"
};
unsigned long resolv(const char* name)
More information about the drbd-cvs
mailing list