[DRBD-cvs] svn commit by lars - r2398 - trunk/drbd - moved
_req_may_be_done and _req_mod into drbd_req.c no
drbd-cvs at lists.linbit.com
drbd-cvs at lists.linbit.com
Mon Sep 11 12:41:31 CEST 2006
Author: lars
Date: 2006-09-11 12:41:30 +0200 (Mon, 11 Sep 2006)
New Revision: 2398
Modified:
trunk/drbd/drbd_req.c
trunk/drbd/drbd_req.h
Log:
moved _req_may_be_done and _req_mod into drbd_req.c
no longer "static inline": apparently that got too large.
Modified: trunk/drbd/drbd_req.c
===================================================================
--- trunk/drbd/drbd_req.c 2006-09-11 10:23:03 UTC (rev 2397)
+++ trunk/drbd/drbd_req.c 2006-09-11 10:41:30 UTC (rev 2398)
@@ -33,6 +33,560 @@
#include "drbd_int.h"
#include "drbd_req.h"
+void _req_may_be_done(drbd_request_t *req)
+{
+ const unsigned long s = req->rq_state;
+ drbd_dev *mdev = req->mdev;
+ int rw;
+
+ MUST_HOLD(&mdev->req_lock)
+
+ if (s & RQ_NET_PENDING) return;
+ if (s & RQ_LOCAL_PENDING) return;
+
+ if (req->master_bio) {
+ /* this is data_received (remote read)
+ * or protocol C WriteAck
+ * or protocol B RecvAck
+ * or protocol A "handed_over_to_network" (SendAck)
+ * or canceled or failed,
+ * or killed from the transfer log due to connection loss.
+ */
+
+ /*
+ * figure out whether to report success or failure.
+ *
+ * report success when at least one of the oprations suceeded.
+ * or, to put the other way,
+ * only report failure, when both operations failed.
+ *
+ * what to do about the failures is handled elsewhere.
+ * what we need to do here is just: complete the master_bio.
+ */
+ int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+ rw = bio_data_dir(req->master_bio);
+ if (rw == WRITE) {
+ drbd_request_t *i;
+ struct Tl_epoch_entry *e;
+ struct hlist_node *n;
+ struct hlist_head *slot;
+
+ /* before we can signal completion to the upper layers,
+ * we may need to close the current epoch */
+ if (req->epoch == mdev->newest_barrier->br_number)
+ set_bit(ISSUE_BARRIER,&mdev->flags);
+
+ /* and maybe "wake" those conflicting requests that
+ * wait for this request to finish.
+ * we just have to walk starting from req->next,
+ * see _req_add_hash_check_colision(); */
+#define OVERLAPS overlaps(req->sector, req->size, i->sector, i->size)
+ n = req->colision.next;
+ /* hlist_del ... done below */
+ hlist_for_each_entry_from(i, n, colision) {
+ if (OVERLAPS)
+ drbd_queue_work(&mdev->data.work,&i->w);
+ }
+
+ /* and maybe "wake" those conflicting epoch entries
+ * that wait for this request to finish */
+ /* FIXME looks alot like we could consolidate some code
+ * and maybe even hash tables? */
+#undef OVERLAPS
+#define OVERLAPS overlaps(req->sector, req->size, e->sector, e->size)
+ slot = ee_hash_slot(mdev,req->sector);
+ hlist_for_each_entry(e, n, slot, colision) {
+ if (OVERLAPS)
+ drbd_queue_work(&mdev->data.work,&e->w);
+ }
+#undef OVERLAPS
+ }
+ /* else: READ, READA: nothing more to do */
+
+ /* remove the request from the conflict detection
+ * respective block_id verification hash */
+ hlist_del(&req->colision);
+
+ /* FIXME not yet implemented...
+ * in case we got "suspended" (on_disconnect: freeze io)
+ * we may not yet complete the request...
+ * though, this is probably best handled elsewhere by not
+ * walking the transfer log until "unfreeze", so we won't end
+ * up here anyways during the freeze ...
+ * then again, if it is a READ, it is not in the TL at all.
+ * is it still leagal to complete a READ during freeze? */
+ bio_endio(req->master_bio, req->master_bio->bi_size, ok ? 0 : -EIO);
+ req->master_bio = NULL;
+ } else {
+ /* only WRITE requests can end up here without a master_bio */
+ rw = WRITE;
+ }
+
+ if ((s == RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
+ /* this is disconnected (local only) operation,
+ * or protocol C WriteAck,
+ * or protocol A or B BarrierAck,
+ * or killed from the transfer log due to connection loss. */
+
+ /* if it was a write, we may have to set the corresponding
+ * bit(s) out-of-sync first. If it had a local part, we need to
+ * release the reference to the activity log. */
+ if (rw == WRITE) {
+ /* remove it from the transfer log */
+ list_del(&req->tl_requests);
+ /* Set out-of-sync unless both OK flags are set
+ * (local only or remote failed).
+ * Other places where we set out-of-sync:
+ * READ with local io-error */
+ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+ drbd_set_out_of_sync(mdev,req->sector,req->size);
+ if (s & RQ_LOCAL_MASK) {
+ drbd_al_complete_io(mdev, req->sector);
+ }
+ }
+
+ /* if it was an io error, we want to notify our
+ * peer about that, and see if we need to
+ * detach the disk and stuff.
+ * to avoid allocating some special work
+ * struct, reuse the request. */
+ if (rw == WRITE &&
+ (( s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
+ if (!(req->w.list.next == LIST_POISON1 ||
+ list_empty(&req->w.list))) {
+ /* DEBUG ASSERT only; if this triggers, we
+ * probably corrupt the worker list here */
+ DUMPP(req->w.list.next);
+ DUMPP(req->w.list.prev);
+ }
+ req->w.cb = w_io_error;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ /* drbd_req_free() is done in w_io_error */
+ } else {
+ drbd_req_free(req);
+ }
+ }
+ /* else: network part and not DONE yet. that is
+ * protocol A or B, barrier ack still pending... */
+}
+
+/*
+ * checks whether there was an overlapping request already registered.
+ * if so, add the request to the colision hash
+ * _after_ the (first) overlapping request,
+ * and return 1
+ * if no overlap was found, add this request to the front of the chain,
+ * and return 0
+ *
+ * corresponding hlist_del is in _req_may_be_done()
+ *
+ * NOTE:
+ * paranoia: assume something above us is broken, and issues different write
+ * requests for the same block simultaneously...
+ *
+ * To ensure these won't be reordered differently on both nodes, resulting in
+ * diverging data sets, we discard the later one(s). Not that this is supposed
+ * to happen, but this is the rationale why we also have to check for
+ * conflicting requests with local origin, and why we have to do so regardless
+ * of whether we allowed multiple primaries.
+ *
+ * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
+ * second hlist_for_each_entry becomes a noop. This is even simpler than to
+ * grab a reference on the net_conf, and check for the two_primaries flag...
+ */
+STATIC int _req_add_hash_check_colision(drbd_request_t *req)
+{
+ drbd_dev *mdev = req->mdev;
+ const sector_t sector = req->sector;
+ const int size = req->size;
+ drbd_request_t *i;
+ struct Tl_epoch_entry *e;
+ struct hlist_node *n;
+ struct hlist_head *slot;
+
+ MUST_HOLD(&mdev->req_lock);
+ D_ASSERT(hlist_unhashed(&req->colision));
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+ slot = tl_hash_slot(mdev,sector);
+ hlist_for_each_entry(i, n, slot, colision) {
+ if (OVERLAPS) {
+ ALERT("%s[%u] Concurrent local write detected!"
+ " [DISCARD L] new: %llu +%d; pending: %llu +%d\n",
+ current->comm, current->pid,
+ (unsigned long long)sector, size,
+ (unsigned long long)i->sector, i->size);
+ hlist_add_after(n,&req->colision);
+ return 1;
+ }
+ }
+ /* no overlapping request with local origin found,
+ * register in front */
+ hlist_add_head(&req->colision,slot);
+
+ /* now, check for overlapping requests with remote origin */
+#undef OVERLAPS
+#define OVERLAPS overlaps(e->sector, e->size, sector, size)
+ slot = ee_hash_slot(mdev,sector);
+ hlist_for_each_entry(e, n, slot, colision) {
+ if (OVERLAPS) {
+ ALERT("%s[%u] Concurrent remote write detected!"
+ " [DISCARD L] new: %llu +%d; pending: %llu +%d\n",
+ current->comm, current->pid,
+ (unsigned long long)sector, size,
+ e->sector, e->size);
+ return 1;
+ }
+ }
+#undef OVERLAPS
+
+ /* this is like it should be, and what we expected.
+ * out users do behave after all... */
+ return 0;
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ * enforces that it is all in this one place, where it is easier to audit,
+ * it makes it obvious that whatever "event" "happens" to a request should
+ * happen "atomically" within the req_lock,
+ * and it enforces that we have to think in a very structured manner
+ * about the "events" that may happen to a request during its life time ...
+ *
+ * Though I think it is likely that we break this again into many
+ * static inline void _req_mod_ ## what (req) ...
+ */
+void _req_mod(drbd_request_t *req, drbd_req_event_t what)
+{
+ drbd_dev *mdev = req->mdev;
+ MUST_HOLD(&mdev->req_lock);
+
+ switch(what) {
+ default:
+ ERR("LOGIC BUG in %s:%u\n", __FILE__ , __LINE__ );
+ return;
+
+ /* does not happen...
+ * initialization done in drbd_req_new
+ case created:
+ break;
+ */
+
+ case to_be_send: /* via network */
+ /* reached via drbd_make_request_common
+ * and from FIXME w_read_retry_remote */
+ D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+ req->rq_state |= RQ_NET_PENDING;
+ inc_ap_pending(mdev);
+ break;
+
+ case to_be_submitted: /* locally */
+ /* reached via drbd_make_request_common */
+ D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
+ req->rq_state |= RQ_LOCAL_PENDING;
+ break;
+
+#if 0
+ /* done inline below */
+ case suspend_because_of_conflict:
+ /* assert something? */
+ /* reached via drbd_make_request_common */
+ /* update state flag? why? which one? */
+ req->w.cb = w_req_cancel_conflict;
+ /* no queue here, see below! */
+ break;
+#endif
+
+ /* FIXME these *_completed_* are basically the same.
+ * can probably be merged with some if (what == xy) */
+
+ case completed_ok:
+ if (bio_data_dir(req->private_bio) == WRITE)
+ mdev->writ_cnt += req->size>>9;
+ else
+ mdev->read_cnt += req->size>>9;
+
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ dec_local(mdev);
+
+ req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ _req_may_be_done(req);
+ break;
+
+ case write_completed_with_error:
+ req->rq_state |= RQ_LOCAL_COMPLETED;
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ dec_local(mdev);
+ ALERT("Local WRITE failed sec=%llu size=%u\n",
+ req->sector, req->size);
+ /* and now: check how to handle local io error.
+ * FIXME see comment below in read_completed_with_error */
+ __drbd_chk_io_error(mdev);
+ _req_may_be_done(req);
+ break;
+
+ case read_completed_with_error:
+ drbd_set_out_of_sync(mdev,req->sector,req->size);
+ req->rq_state |= RQ_LOCAL_COMPLETED;
+ req->rq_state &= ~RQ_LOCAL_PENDING;
+
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ dec_local(mdev);
+ if (bio_rw(req->master_bio) == READA)
+ /* it is legal to fail READA */
+ break;
+ /* else */
+ ALERT("Local READ failed sec=%llu size=%u\n",
+ req->sector, req->size);
+ /* _req_mod(req,to_be_send); oops, recursion in static inline */
+ D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+ req->rq_state |= RQ_NET_PENDING;
+ inc_ap_pending(mdev);
+
+ /* and now: check how to handle local io error.
+ *
+ * FIXME we should not handle WRITE and READ io errors
+ * the same. When we retry the READ, and then write
+ * the answer, that might suceed because modern drives
+ * would relocate the sectors. We'd need to keep our
+ * private bio then, and round the offset and size so
+ * we get back enough data to be able to clear the bits again.
+ */
+ __drbd_chk_io_error(mdev);
+ /* fall through: _req_mod(req,queue_for_net_read); */
+
+ case queue_for_net_read:
+ /* READ or READA, and
+ * no local disk,
+ * or target area marked as invalid,
+ * or just got an io-error. */
+ /* from drbd_make_request_common
+ * or from bio_endio during read io-error recovery */
+
+ /* so we can verify the handle in the answer packet
+ * corresponding hlist_del is in _req_may_be_done() */
+ hlist_add_head(&req->colision, ar_hash_slot(mdev,req->sector));
+
+ set_bit(UNPLUG_REMOTE,&mdev->flags); /* why? */
+
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_QUEUED;
+ req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
+ ? w_read_retry_remote
+ : w_send_read_req;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ break;
+
+ case queue_for_net_write:
+ /* assert something? */
+ /* from drbd_make_request_common only */
+
+ /* NOTE
+ * In case the req ended up on the transfer log before being
+ * queued on the worker, it could lead to this request being
+ * missed during cleanup after connection loss.
+ * So we have to do both operations here,
+ * within the same lock that protects the transfer log.
+ */
+
+ /* register this request on the colison detection hash
+ * tables. if we have a conflict, just leave here.
+ * the request will be "queued" for faked "completion"
+ * once the conflicting request is done.
+ */
+ if (_req_add_hash_check_colision(req)) {
+ /* this is a conflicting request.
+ * even though it may have been only _partially_
+ * overlapping with one of the currently pending requests,
+ * without even submitting or sending it,
+ * we will pretend that it was successfully served
+ * once the pending conflicting request is done.
+ */
+ /* _req_mod(req, suspend_because_of_conflict); */
+ /* this callback is just for ASSERT purposes */
+ req->w.cb = w_req_cancel_conflict;
+
+ /* we don't add this to any epoch (barrier) object.
+ * assign the "invalid" barrier_number 0.
+ * it should be 0 anyways, still,
+ * but being explicit won't harm. */
+ req->epoch = 0;
+
+ /*
+ * EARLY break here!
+ */
+ break;
+ }
+
+ /* _req_add_to_epoch(req); this has to be after the
+ * _maybe_start_new_epoch(req); which happened in
+ * drbd_make_request_common, because we now may set the bit
+ * again ourselves to close the current epoch.
+ *
+ * Add req to the (now) current epoch (barrier). */
+
+ /* see drbd_make_request_common just after it grabs the req_lock */
+ D_ASSERT(test_bit(ISSUE_BARRIER, &mdev->flags) == 0);
+
+ req->epoch = mdev->newest_barrier->br_number;
+ list_add(&req->tl_requests,&mdev->newest_barrier->requests);
+
+ /* mark the current epoch as closed,
+ * in case it outgrew the limit */
+ if( ++mdev->newest_barrier->n_req >= mdev->net_conf->max_epoch_size )
+ set_bit(ISSUE_BARRIER,&mdev->flags);
+
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_QUEUED;
+ req->w.cb = w_send_dblock;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ break;
+
+ case conflicting_req_done:
+ case conflicting_ee_done:
+ /* reached via bio_endio of the
+ * conflicting request or epoch entry.
+ * we now just "fake" completion of this request.
+ * THINK: I'm going to _FAIL_ this request.
+ */
+ D_ASSERT(req->w.cb == w_req_cancel_conflict);
+ D_ASSERT(req->epoch == 0);
+ {
+ const unsigned long s = req->rq_state;
+ if (s & RQ_LOCAL_MASK) {
+ D_ASSERT(s & RQ_LOCAL_PENDING);
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ dec_local(mdev);
+ }
+ D_ASSERT((s & RQ_NET_MASK) == RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ }
+ /* no _OK ... this is going to be an io-error */
+ req->rq_state = RQ_LOCAL_COMPLETED|RQ_NET_DONE;
+ _req_may_be_done(req);
+ break;
+
+ /* FIXME
+ * to implement freeze-io,
+ * we may not finish the request just yet.
+ */
+ case send_canceled:
+ /* for the request, this is the same thing */
+ case send_failed:
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ req->rq_state &= ~(RQ_NET_PENDING|RQ_NET_QUEUED|RQ_NET_OK);
+ req->rq_state |= RQ_NET_DONE;
+ _req_may_be_done(req);
+ break;
+
+ case handed_over_to_network:
+ /* assert something? */
+ if ( bio_data_dir(req->master_bio) == WRITE &&
+ mdev->net_conf->wire_protocol == DRBD_PROT_A ) {
+ /* this is what is dangerous about protocol A:
+ * pretend it was sucessfully written on the peer.
+ * FIXME in case we get a local io-error in
+ * protocol != C, we might want to defer comletion
+ * until we get the barrier ack, and send a NegAck
+ * in case the other node had an io-error, too...
+ * That way we would at least not report "success"
+ * if it was not written at all. */
+ if (req->rq_state & RQ_NET_PENDING) {
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ req->rq_state |= RQ_NET_OK;
+ } /* else: neg-ack was faster... */
+ /* it is still not yet RQ_NET_DONE until the
+ * corresponding epoch barrier got acked as well,
+ * so we know what to dirty on connection loss */
+ }
+ req->rq_state &= ~RQ_NET_QUEUED;
+ req->rq_state |= RQ_NET_SENT;
+ /* because _drbd_send_zc_bio could sleep, and may want to
+ * dereference the bio even after the "write_acked_by_peer" and
+ * "completed_ok" events came in, once we return from
+ * _drbd_send_zc_bio (drbd_send_dblock), we have to check
+ * whether it is done already, and end it. */
+ _req_may_be_done(req);
+ break;
+
+ case connection_lost_while_pending:
+ /* transfer log cleanup after connection loss */
+ /* assert something? */
+ if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev);
+ req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+ req->rq_state |= RQ_NET_DONE;
+ /* if it is still queued, we may not complete it here.
+ * it will be canceled soon.
+ * FIXME we should change the code so this can not happen. */
+ if (!(req->rq_state & RQ_NET_QUEUED))
+ _req_may_be_done(req);
+ break;
+
+ case write_acked_by_peer:
+ /* assert something? */
+ /* protocol C; successfully written on peer */
+ req->rq_state |= RQ_NET_DONE;
+ /* rest is the same as for: */
+ case recv_acked_by_peer:
+ /* protocol B; pretends to be sucessfully written on peer.
+ * see also notes above in handed_over_to_network about
+ * protocol != C */
+ req->rq_state |= RQ_NET_OK;
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ if (req->rq_state & RQ_NET_SENT)
+ _req_may_be_done(req);
+ /* else: done by handed_over_to_network */
+ break;
+
+ case neg_acked:
+ /* assert something? */
+ if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev);
+ req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+ /* FIXME THINK! is it DONE now, or is it not? */
+ req->rq_state |= RQ_NET_DONE;
+ if (req->rq_state & RQ_NET_SENT)
+ _req_may_be_done(req);
+ /* else: done by handed_over_to_network */
+ break;
+
+ case barrier_acked:
+ /* can even happen for protocol C,
+ * when local io is stil pending.
+ * in which case it does nothing. */
+ D_ASSERT(req->rq_state & RQ_NET_SENT);
+ req->rq_state |= RQ_NET_DONE;
+ _req_may_be_done(req);
+ break;
+
+ case data_received:
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ dec_ap_pending(mdev);
+ req->rq_state &= ~RQ_NET_PENDING;
+ req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
+ /* can it happen that we receive the DataReply
+ * before the send DataRequest function returns? */
+ if (req->rq_state & RQ_NET_SENT)
+ _req_may_be_done(req);
+ /* else: done by handed_over_to_network */
+ break;
+ };
+}
+
/* we may do a local read if:
* - we are consistent (of course),
* - or we are generally inconsistent,
Modified: trunk/drbd/drbd_req.h
===================================================================
--- trunk/drbd/drbd_req.h 2006-09-11 10:23:03 UTC (rev 2397)
+++ trunk/drbd/drbd_req.h 2006-09-11 10:41:30 UTC (rev 2398)
@@ -291,560 +291,11 @@
return !( ( s1 + (l1>>9) <= s2 ) || ( s1 >= s2 + (l2>>9) ) );
}
-static inline void _req_may_be_done(drbd_request_t *req)
-{
- const unsigned long s = req->rq_state;
- drbd_dev *mdev = req->mdev;
- int rw;
+/* aparently too large to be inlined...
+ * moved to drbd_req.c */
+extern void _req_may_be_done(drbd_request_t *req);
+extern void _req_mod(drbd_request_t *req, drbd_req_event_t what);
- MUST_HOLD(&mdev->req_lock)
-
- if (s & RQ_NET_PENDING) return;
- if (s & RQ_LOCAL_PENDING) return;
-
- if (req->master_bio) {
- /* this is data_received (remote read)
- * or protocol C WriteAck
- * or protocol B RecvAck
- * or protocol A "handed_over_to_network" (SendAck)
- * or canceled or failed,
- * or killed from the transfer log due to connection loss.
- */
-
- /*
- * figure out whether to report success or failure.
- *
- * report success when at least one of the oprations suceeded.
- * or, to put the other way,
- * only report failure, when both operations failed.
- *
- * what to do about the failures is handled elsewhere.
- * what we need to do here is just: complete the master_bio.
- */
- int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
- rw = bio_data_dir(req->master_bio);
- if (rw == WRITE) {
- drbd_request_t *i;
- struct Tl_epoch_entry *e;
- struct hlist_node *n;
- struct hlist_head *slot;
-
- /* before we can signal completion to the upper layers,
- * we may need to close the current epoch */
- if (req->epoch == mdev->newest_barrier->br_number)
- set_bit(ISSUE_BARRIER,&mdev->flags);
-
- /* and maybe "wake" those conflicting requests that
- * wait for this request to finish.
- * we just have to walk starting from req->next,
- * see _req_add_hash_check_colision(); */
-#define OVERLAPS overlaps(req->sector, req->size, i->sector, i->size)
- n = req->colision.next;
- /* hlist_del ... done below */
- hlist_for_each_entry_from(i, n, colision) {
- if (OVERLAPS)
- drbd_queue_work(&mdev->data.work,&i->w);
- }
-
- /* and maybe "wake" those conflicting epoch entries
- * that wait for this request to finish */
- /* FIXME looks alot like we could consolidate some code
- * and maybe even hash tables? */
-#undef OVERLAPS
-#define OVERLAPS overlaps(req->sector, req->size, e->sector, e->size)
- slot = ee_hash_slot(mdev,req->sector);
- hlist_for_each_entry(e, n, slot, colision) {
- if (OVERLAPS)
- drbd_queue_work(&mdev->data.work,&e->w);
- }
-#undef OVERLAPS
- }
- /* else: READ, READA: nothing more to do */
-
- /* remove the request from the conflict detection
- * respective block_id verification hash */
- hlist_del(&req->colision);
-
- /* FIXME not yet implemented...
- * in case we got "suspended" (on_disconnect: freeze io)
- * we may not yet complete the request...
- * though, this is probably best handled elsewhere by not
- * walking the transfer log until "unfreeze", so we won't end
- * up here anyways during the freeze ...
- * then again, if it is a READ, it is not in the TL at all.
- * is it still leagal to complete a READ during freeze? */
- bio_endio(req->master_bio, req->master_bio->bi_size, ok ? 0 : -EIO);
- req->master_bio = NULL;
- } else {
- /* only WRITE requests can end up here without a master_bio */
- rw = WRITE;
- }
-
- if ((s == RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
- /* this is disconnected (local only) operation,
- * or protocol C WriteAck,
- * or protocol A or B BarrierAck,
- * or killed from the transfer log due to connection loss. */
-
- /* if it was a write, we may have to set the corresponding
- * bit(s) out-of-sync first. If it had a local part, we need to
- * release the reference to the activity log. */
- if (rw == WRITE) {
- /* remove it from the transfer log */
- list_del(&req->tl_requests);
- /* Set out-of-sync unless both OK flags are set
- * (local only or remote failed).
- * Other places where we set out-of-sync:
- * READ with local io-error */
- if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
- drbd_set_out_of_sync(mdev,req->sector,req->size);
- if (s & RQ_LOCAL_MASK) {
- drbd_al_complete_io(mdev, req->sector);
- }
- }
-
- /* if it was an io error, we want to notify our
- * peer about that, and see if we need to
- * detach the disk and stuff.
- * to avoid allocating some special work
- * struct, reuse the request. */
- if (rw == WRITE &&
- (( s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
- if (!(req->w.list.next == LIST_POISON1 ||
- list_empty(&req->w.list))) {
- /* DEBUG ASSERT only; if this triggers, we
- * probably corrupt the worker list here */
- DUMPP(req->w.list.next);
- DUMPP(req->w.list.prev);
- }
- req->w.cb = w_io_error;
- drbd_queue_work(&mdev->data.work, &req->w);
- /* drbd_req_free() is done in w_io_error */
- } else {
- drbd_req_free(req);
- }
- }
- /* else: network part and not DONE yet. that is
- * protocol A or B, barrier ack still pending... */
-}
-
-/*
- * checks whether there was an overlapping request already registered.
- * if so, add the request to the colision hash
- * _after_ the (first) overlapping request,
- * and return 1
- * if no overlap was found, add this request to the front of the chain,
- * and return 0
- *
- * corresponding hlist_del is in _req_may_be_done()
- *
- * NOTE:
- * paranoia: assume something above us is broken, and issues different write
- * requests for the same block simultaneously...
- *
- * To ensure these won't be reordered differently on both nodes, resulting in
- * diverging data sets, we discard the later one(s). Not that this is supposed
- * to happen, but this is the rationale why we also have to check for
- * conflicting requests with local origin, and why we have to do so regardless
- * of whether we allowed multiple primaries.
- *
- * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
- * second hlist_for_each_entry becomes a noop. This is even simpler than to
- * grab a reference on the net_conf, and check for the two_primaries flag...
- */
-static int _req_add_hash_check_colision(drbd_request_t *req)
-{
- drbd_dev *mdev = req->mdev;
- const sector_t sector = req->sector;
- const int size = req->size;
- drbd_request_t *i;
- struct Tl_epoch_entry *e;
- struct hlist_node *n;
- struct hlist_head *slot;
-
- MUST_HOLD(&mdev->req_lock);
- D_ASSERT(hlist_unhashed(&req->colision));
-#define OVERLAPS overlaps(i->sector, i->size, sector, size)
- slot = tl_hash_slot(mdev,sector);
- hlist_for_each_entry(i, n, slot, colision) {
- if (OVERLAPS) {
- ALERT("%s[%u] Concurrent local write detected!"
- " [DISCARD L] new: %llu +%d; pending: %llu +%d\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)i->sector, i->size);
- hlist_add_after(n,&req->colision);
- return 1;
- }
- }
- /* no overlapping request with local origin found,
- * register in front */
- hlist_add_head(&req->colision,slot);
-
- /* now, check for overlapping requests with remote origin */
-#undef OVERLAPS
-#define OVERLAPS overlaps(e->sector, e->size, sector, size)
- slot = ee_hash_slot(mdev,sector);
- hlist_for_each_entry(e, n, slot, colision) {
- if (OVERLAPS) {
- ALERT("%s[%u] Concurrent remote write detected!"
- " [DISCARD L] new: %llu +%d; pending: %llu +%d\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- e->sector, e->size);
- return 1;
- }
- }
-#undef OVERLAPS
-
- /* this is like it should be, and what we expected.
- * out users do behave after all... */
- return 0;
-}
-
-/* obviously this could be coded as many single functions
- * instead of one huge switch,
- * or by putting the code directly in the respective locations
- * (as it has been before).
- *
- * but having it this way
- * enforces that it is all in this one place, where it is easier to audit,
- * it makes it obvious that whatever "event" "happens" to a request should
- * happen "atomically" within the req_lock,
- * and it enforces that we have to think in a very structured manner
- * about the "events" that may happen to a request during its life time ...
- *
- * Though I think it is likely that we break this again into many
- * static inline void _req_mod_ ## what (req) ...
- */
-static inline void _req_mod(drbd_request_t *req, drbd_req_event_t what)
-{
- drbd_dev *mdev = req->mdev;
- MUST_HOLD(&mdev->req_lock);
-
- switch(what) {
- default:
- ERR("LOGIC BUG in %s:%u\n", __FILE__ , __LINE__ );
- return;
-
- /* does not happen...
- * initialization done in drbd_req_new
- case created:
- break;
- */
-
- case to_be_send: /* via network */
- /* reached via drbd_make_request_common
- * and from FIXME w_read_retry_remote */
- D_ASSERT(!(req->rq_state & RQ_NET_MASK));
- req->rq_state |= RQ_NET_PENDING;
- inc_ap_pending(mdev);
- break;
-
- case to_be_submitted: /* locally */
- /* reached via drbd_make_request_common */
- D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
- req->rq_state |= RQ_LOCAL_PENDING;
- break;
-
-#if 0
- /* done inline below */
- case suspend_because_of_conflict:
- /* assert something? */
- /* reached via drbd_make_request_common */
- /* update state flag? why? which one? */
- req->w.cb = w_req_cancel_conflict;
- /* no queue here, see below! */
- break;
-#endif
-
- /* FIXME these *_completed_* are basically the same.
- * can probably be merged with some if (what == xy) */
-
- case completed_ok:
- if (bio_data_dir(req->private_bio) == WRITE)
- mdev->writ_cnt += req->size>>9;
- else
- mdev->read_cnt += req->size>>9;
-
- bio_put(req->private_bio);
- req->private_bio = NULL;
- dec_local(mdev);
-
- req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
- _req_may_be_done(req);
- break;
-
- case write_completed_with_error:
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
- bio_put(req->private_bio);
- req->private_bio = NULL;
- dec_local(mdev);
- ALERT("Local WRITE failed sec=%llu size=%u\n",
- req->sector, req->size);
- /* and now: check how to handle local io error.
- * FIXME see comment below in read_completed_with_error */
- __drbd_chk_io_error(mdev);
- _req_may_be_done(req);
- break;
-
- case read_completed_with_error:
- drbd_set_out_of_sync(mdev,req->sector,req->size);
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
- bio_put(req->private_bio);
- req->private_bio = NULL;
- dec_local(mdev);
- if (bio_rw(req->master_bio) == READA)
- /* it is legal to fail READA */
- break;
- /* else */
- ALERT("Local READ failed sec=%llu size=%u\n",
- req->sector, req->size);
- /* _req_mod(req,to_be_send); oops, recursion in static inline */
- D_ASSERT(!(req->rq_state & RQ_NET_MASK));
- req->rq_state |= RQ_NET_PENDING;
- inc_ap_pending(mdev);
-
- /* and now: check how to handle local io error.
- *
- * FIXME we should not handle WRITE and READ io errors
- * the same. When we retry the READ, and then write
- * the answer, that might suceed because modern drives
- * would relocate the sectors. We'd need to keep our
- * private bio then, and round the offset and size so
- * we get back enough data to be able to clear the bits again.
- */
- __drbd_chk_io_error(mdev);
- /* fall through: _req_mod(req,queue_for_net_read); */
-
- case queue_for_net_read:
- /* READ or READA, and
- * no local disk,
- * or target area marked as invalid,
- * or just got an io-error. */
- /* from drbd_make_request_common
- * or from bio_endio during read io-error recovery */
-
- /* so we can verify the handle in the answer packet
- * corresponding hlist_del is in _req_may_be_done() */
- hlist_add_head(&req->colision, ar_hash_slot(mdev,req->sector));
-
- set_bit(UNPLUG_REMOTE,&mdev->flags); /* why? */
-
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- req->rq_state |= RQ_NET_QUEUED;
- req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
- ? w_read_retry_remote
- : w_send_read_req;
- drbd_queue_work(&mdev->data.work, &req->w);
- break;
-
- case queue_for_net_write:
- /* assert something? */
- /* from drbd_make_request_common only */
-
- /* NOTE
- * In case the req ended up on the transfer log before being
- * queued on the worker, it could lead to this request being
- * missed during cleanup after connection loss.
- * So we have to do both operations here,
- * within the same lock that protects the transfer log.
- */
-
- /* register this request on the colison detection hash
- * tables. if we have a conflict, just leave here.
- * the request will be "queued" for faked "completion"
- * once the conflicting request is done.
- */
- if (_req_add_hash_check_colision(req)) {
- /* this is a conflicting request.
- * even though it may have been only _partially_
- * overlapping with one of the currently pending requests,
- * without even submitting or sending it,
- * we will pretend that it was successfully served
- * once the pending conflicting request is done.
- */
- /* _req_mod(req, suspend_because_of_conflict); */
- /* this callback is just for ASSERT purposes */
- req->w.cb = w_req_cancel_conflict;
-
- /* we don't add this to any epoch (barrier) object.
- * assign the "invalid" barrier_number 0.
- * it should be 0 anyways, still,
- * but being explicit won't harm. */
- req->epoch = 0;
-
- /*
- * EARLY break here!
- */
- break;
- }
-
- /* _req_add_to_epoch(req); this has to be after the
- * _maybe_start_new_epoch(req); which happened in
- * drbd_make_request_common, because we now may set the bit
- * again ourselves to close the current epoch.
- *
- * Add req to the (now) current epoch (barrier). */
-
- /* see drbd_make_request_common just after it grabs the req_lock */
- D_ASSERT(test_bit(ISSUE_BARRIER, &mdev->flags) == 0);
-
- req->epoch = mdev->newest_barrier->br_number;
- list_add(&req->tl_requests,&mdev->newest_barrier->requests);
-
- /* mark the current epoch as closed,
- * in case it outgrew the limit */
- if( ++mdev->newest_barrier->n_req >= mdev->net_conf->max_epoch_size )
- set_bit(ISSUE_BARRIER,&mdev->flags);
-
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- req->rq_state |= RQ_NET_QUEUED;
- req->w.cb = w_send_dblock;
- drbd_queue_work(&mdev->data.work, &req->w);
- break;
-
- case conflicting_req_done:
- case conflicting_ee_done:
- /* reached via bio_endio of the
- * conflicting request or epoch entry.
- * we now just "fake" completion of this request.
- * THINK: I'm going to _FAIL_ this request.
- */
- D_ASSERT(req->w.cb == w_req_cancel_conflict);
- D_ASSERT(req->epoch == 0);
- {
- const unsigned long s = req->rq_state;
- if (s & RQ_LOCAL_MASK) {
- D_ASSERT(s & RQ_LOCAL_PENDING);
- bio_put(req->private_bio);
- req->private_bio = NULL;
- dec_local(mdev);
- }
- D_ASSERT((s & RQ_NET_MASK) == RQ_NET_PENDING);
- dec_ap_pending(mdev);
- }
- /* no _OK ... this is going to be an io-error */
- req->rq_state = RQ_LOCAL_COMPLETED|RQ_NET_DONE;
- _req_may_be_done(req);
- break;
-
- /* FIXME
- * to implement freeze-io,
- * we may not finish the request just yet.
- */
- case send_canceled:
- /* for the request, this is the same thing */
- case send_failed:
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- dec_ap_pending(mdev);
- req->rq_state &= ~(RQ_NET_PENDING|RQ_NET_QUEUED|RQ_NET_OK);
- req->rq_state |= RQ_NET_DONE;
- _req_may_be_done(req);
- break;
-
- case handed_over_to_network:
- /* assert something? */
- if ( bio_data_dir(req->master_bio) == WRITE &&
- mdev->net_conf->wire_protocol == DRBD_PROT_A ) {
- /* this is what is dangerous about protocol A:
- * pretend it was sucessfully written on the peer.
- * FIXME in case we get a local io-error in
- * protocol != C, we might want to defer comletion
- * until we get the barrier ack, and send a NegAck
- * in case the other node had an io-error, too...
- * That way we would at least not report "success"
- * if it was not written at all. */
- if (req->rq_state & RQ_NET_PENDING) {
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- req->rq_state |= RQ_NET_OK;
- } /* else: neg-ack was faster... */
- /* it is still not yet RQ_NET_DONE until the
- * corresponding epoch barrier got acked as well,
- * so we know what to dirty on connection loss */
- }
- req->rq_state &= ~RQ_NET_QUEUED;
- req->rq_state |= RQ_NET_SENT;
- /* because _drbd_send_zc_bio could sleep, and may want to
- * dereference the bio even after the "write_acked_by_peer" and
- * "completed_ok" events came in, once we return from
- * _drbd_send_zc_bio (drbd_send_dblock), we have to check
- * whether it is done already, and end it. */
- _req_may_be_done(req);
- break;
-
- case connection_lost_while_pending:
- /* transfer log cleanup after connection loss */
- /* assert something? */
- if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev);
- req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
- req->rq_state |= RQ_NET_DONE;
- /* if it is still queued, we may not complete it here.
- * it will be canceled soon.
- * FIXME we should change the code so this can not happen. */
- if (!(req->rq_state & RQ_NET_QUEUED))
- _req_may_be_done(req);
- break;
-
- case write_acked_by_peer:
- /* assert something? */
- /* protocol C; successfully written on peer */
- req->rq_state |= RQ_NET_DONE;
- /* rest is the same as for: */
- case recv_acked_by_peer:
- /* protocol B; pretends to be sucessfully written on peer.
- * see also notes above in handed_over_to_network about
- * protocol != C */
- req->rq_state |= RQ_NET_OK;
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- if (req->rq_state & RQ_NET_SENT)
- _req_may_be_done(req);
- /* else: done by handed_over_to_network */
- break;
-
- case neg_acked:
- /* assert something? */
- if (req->rq_state & RQ_NET_PENDING) dec_ap_pending(mdev);
- req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
- /* FIXME THINK! is it DONE now, or is it not? */
- req->rq_state |= RQ_NET_DONE;
- if (req->rq_state & RQ_NET_SENT)
- _req_may_be_done(req);
- /* else: done by handed_over_to_network */
- break;
-
- case barrier_acked:
- /* can even happen for protocol C,
- * when local io is stil pending.
- * in which case it does nothing. */
- D_ASSERT(req->rq_state & RQ_NET_SENT);
- req->rq_state |= RQ_NET_DONE;
- _req_may_be_done(req);
- break;
-
- case data_received:
- D_ASSERT(req->rq_state & RQ_NET_PENDING);
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
- /* can it happen that we receive the DataReply
- * before the send DataRequest function returns? */
- if (req->rq_state & RQ_NET_SENT)
- _req_may_be_done(req);
- /* else: done by handed_over_to_network */
- break;
- };
-}
-
/* If you need it irqsave, do it your self! */
static inline void req_mod(drbd_request_t *req, drbd_req_event_t what)
{
More information about the drbd-cvs
mailing list