[DRBD-cvs] drbd by phil; Reenabled zero copy send via network. Pr...

drbd-user@lists.linbit.com
Thu, 24 Jun 2004 17:17:38 +0200 (CEST)


DRBD CVS committal

Author  : phil
Module  : drbd

Dir     : drbd/drbd


Modified Files:
      Tag: rel-0_7-branch
	drbd_main.c drbd_receiver.c 


Log Message:
Reenabled zero copy send via network.
  Protocol A is still broken; I will disable zero_copy for protocol A...
Needs testing and verification.

===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_main.c,v
retrieving revision 1.73.2.189
retrieving revision 1.73.2.190
diff -u -3 -r1.73.2.189 -r1.73.2.190
--- drbd_main.c	24 Jun 2004 11:43:39 -0000	1.73.2.189
+++ drbd_main.c	24 Jun 2004 15:17:33 -0000	1.73.2.190
@@ -864,7 +864,7 @@
 	return drop_it; /* && (mdev->state == Primary) */;
 }
 
-#if 0
+#if 1
 /* We have the following problem with zero copy network IO:
    
    The idea of sendpage seems to be to put some kind of reference 
@@ -874,23 +874,16 @@
    As soon as the page was really sent over the network put_page()
    gets called by some part of the network layer. [ NIC driver? ]
 
-   [ get_page() / put_page() are functions of the buffer cache, they
-     increment/decrement the count. If count reaches 0 it goes deeply
-     into the page cache... ]
+   [ get_page() / put_page() increment/decrement the count. If the
+     count reaches 0 the page will be freed. ]
 
-   This works nicely as long as the FSs only use pages that are 
-   unter the control of the page cache. [ XFS is one of the
-   exceptions, it also uses pages allocated by other means ]
-
-   The other problematic case are our own private buffer pages (EEs). 
-   We initialize the count to 1, so they do not get handed back to the 
-   page cache, this is good. But we do not wait until the data is really 
-   sent, so somethimes we reuse our EE pages before the data was actually 
-   sent. (happens during resync.)
-
-   => I think it is possible to fix the EE case, but what should be done 
-      to the XFS issue ? 
+   This works nicely with pages from FSs. 
+   But this means that in protocol A we might signal IO completion too early!
 
+   In order not to corrupt data during a full sync we must make sure
+   that we do not reuse our own buffer pages (EEs) too early.
+   Have a look at drbd_get_ee(), where we check whether the count of
+   the page has already dropped back to 1.
 */
 int _drbd_send_page(drbd_dev *mdev, struct page *page,
 		    int offset, size_t size)
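
For reference, a minimal sketch of what zero copy send via sendpage looks
like on a 2.6-era kernel (this is illustrative only, not the DRBD code;
the function name and transmit loop are made up). The key point from the
comment above: sock->ops->sendpage() only takes a reference on the page,
and the matching put_page() happens later inside the network layer, so
the caller must not reuse the page until its count has dropped back down.

#include <linux/mm.h>
#include <linux/net.h>
#include <linux/socket.h>

/* Illustrative only: push one page out without copying it into
 * socket buffers. sendpage() just references the page; the network
 * layer calls put_page() once the data has really been sent. */
static int send_page_zero_copy(struct socket *sock, struct page *page,
			       int offset, size_t size)
{
	int sent;

	while (size > 0) {
		sent = sock->ops->sendpage(sock, page, offset, size,
					   MSG_NOSIGNAL);
		if (sent <= 0)
			return sent;	/* error or connection lost */
		offset += sent;		/* partial sends are possible */
		size   -= sent;
	}
	return 1;	/* page may still be referenced by the stack! */
}
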
===================================================================
RCS file: /var/lib/cvs/drbd/drbd/drbd/drbd_receiver.c,v
retrieving revision 1.97.2.173
retrieving revision 1.97.2.174
diff -u -3 -r1.97.2.173 -r1.97.2.174
--- drbd_receiver.c	18 Jun 2004 12:45:05 -0000	1.97.2.173
+++ drbd_receiver.c	24 Jun 2004 15:17:33 -0000	1.97.2.174
@@ -280,6 +280,7 @@
 	struct list_head *le;
 	struct Tl_epoch_entry* e;
 	DEFINE_WAIT(wait);
+	LIST_HEAD(active);
 
 	MUST_HOLD(&mdev->ee_lock);
 
@@ -288,7 +289,7 @@
 		drbd_kick_lo(mdev);
 		spin_lock_irq(&mdev->ee_lock);
 	}
-
+ retry:
 	if(list_empty(&mdev->free_ee)) _drbd_process_ee(mdev,&mdev->done_ee);
 
 	if(list_empty(&mdev->free_ee)) {
@@ -310,20 +311,39 @@
 			finish_wait(&mdev->ee_wait, &wait);
 			if (signal_pending(current)) {
 				WARN("drbd_get_ee interrupted!\n");
+				list_splice(&active,mdev->free_ee.prev);
 				return 0;
 			}
 			// finish wait is inside, so that we are TASK_RUNNING 
 			// in _drbd_process_ee (which might sleep by itself.)
 			_drbd_process_ee(mdev,&mdev->done_ee);
+
+			list_for_each(le,&active) {
+				e=list_entry(le, struct Tl_epoch_entry,w.list);
+				if( page_count(drbd_bio_get_page(&e->private_bio)) == 1 ) {
+					list_move(le,&mdev->free_ee);
+					break;
+				}
+			}
 		}
 		finish_wait(&mdev->ee_wait, &wait); 
 	}
 
 	le=mdev->free_ee.next;
 	list_del(le);
+
+	e=list_entry(le, struct Tl_epoch_entry, w.list);
+	if( page_count(drbd_bio_get_page(&e->private_bio)) > 1 ) {
+		/* This might happen if the sendpage() has not finished */
+		list_add(le,&active);
+		goto retry;
+	}
+
+	list_splice(&active,mdev->free_ee.prev);
+
 	mdev->ee_vacant--;
 	mdev->ee_in_use++;
-	e=list_entry(le, struct Tl_epoch_entry, w.list);
+
 ONLY_IN_26(
 	D_ASSERT(e->private_bio.bi_idx == 0);
 	drbd_ee_init(e,e->ee_bvec.bv_page); // reinitialize
@@ -343,7 +363,7 @@
 	mdev->ee_vacant++;
 	e->block_id = ID_VACANT;
 	INVALIDATE_MAGIC(e);
-	list_add(&e->w.list,&mdev->free_ee);
+	list_add_tail(&e->w.list,&mdev->free_ee);
 
 	if((mdev->ee_vacant * 2 > mdev->ee_in_use ) &&
 	   ( mdev->ee_vacant + mdev->ee_in_use > EE_MININUM) ) {
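
To make the reuse rule concrete, here is a condensed, self-contained
sketch of the page_count() gate that the drbd_get_ee() hunk above
implements (struct and function names here are hypothetical, not the
DRBD source): entries whose page is still referenced by the network
layer (count > 1) are parked on a local "active" list, and everything
parked is spliced back to the tail of the free list before returning.

#include <linux/list.h>
#include <linux/mm.h>

struct buf_entry {
	struct list_head list;
	struct page *page;	/* private buffer page, count starts at 1 */
};

static struct buf_entry *get_free_entry(struct list_head *free_list)
{
	LIST_HEAD(active);	/* entries whose pages are still in flight */
	struct buf_entry *e;

 retry:
	if (list_empty(free_list)) {
		/* put back what we parked and let the caller wait */
		list_splice(&active, free_list->prev);
		return NULL;
	}
	e = list_entry(free_list->next, struct buf_entry, list);
	list_del(&e->list);

	if (page_count(e->page) > 1) {
		/* sendpage() has not finished with this page yet */
		list_add(&e->list, &active);
		goto retry;
	}
	list_splice(&active, free_list->prev);	/* return the rest */
	return e;
}

The switch from list_add() to list_add_tail() in the last hunk serves
the same end: freed entries go to the back of free_ee, so the pages
most likely to still be referenced by the network layer are the last
ones handed out again.
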