Index: drbd/drbd_proc.c =================================================================== --- drbd/drbd_proc.c (.../trunk) (revision 5016) +++ drbd/drbd_proc.c (.../branches/panic-2) (revision 5016) @@ -74,15 +74,15 @@ * units of BM_BLOCK_SIZE. * for the percentage, we don't care. */ - rs_left = drbd_bm_total_weight(mdev); + rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; /* >> 10 to prevent overflow, * +1 to prevent division by zero */ if (rs_left > mdev->rs_total) { /* doh. logic bug somewhere. * for now, just try to prevent in-kernel buffer overflow. */ - ERR("logic bug? rs_left=%lu > rs_total=%lu\n", - rs_left, mdev->rs_total); + ERR("logic bug? rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", + rs_left, mdev->rs_total, mdev->rs_failed); res = 1000; } else { res = (rs_left >> 10)*1000/((mdev->rs_total >> 10) + 1); Index: drbd/drbd_receiver.c =================================================================== --- drbd/drbd_receiver.c (.../trunk) (revision 5016) +++ drbd/drbd_receiver.c (.../branches/panic-2) (revision 5016) @@ -1005,6 +1005,9 @@ * and why do we set it only in the "up-to-date" branch? */ set_bit(SYNC_STARTED,&mdev->flags); } else { + // Record failure to sync + drbd_rs_failed_io(mdev, sector, e->size); + ok = drbd_send_ack(mdev,NegAck,e); ok&= drbd_io_error(mdev, FALSE); } @@ -2472,6 +2475,7 @@ on the fly. */ drbd_rs_cancel_all(mdev); mdev->rs_total=0; + mdev->rs_failed=0; atomic_set(&mdev->rs_pending_cnt,0); wake_up(&mdev->cstate_wait); @@ -2949,6 +2953,9 @@ { Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + if (DRBD_ratelimit(5*HZ,5)) + WARN("Got NegAck packet. Peer is in troubles?\n"); + update_peer_seq(mdev,be32_to_cpu(p->seq_num)); /* do nothing here. @@ -2956,10 +2963,13 @@ * and will do the cleanup then and there. */ if(is_syncer_block_id(p->block_id)) { + sector_t sector = be64_to_cpu(p->sector); + unsigned long size = be32_to_cpu(p->blksize); + dec_rs_pending(mdev); + + drbd_rs_failed_io(mdev, sector, size); } - if (DRBD_ratelimit(5*HZ,5)) - WARN("Got NegAck packet. Peer is in troubles?\n"); return TRUE; } @@ -2993,13 +3003,19 @@ STATIC int got_NegRSDReply(drbd_dev *mdev, Drbd_Header* h) { sector_t sector; + int size; Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); D_ASSERT(p->block_id == ID_SYNCER); + dec_rs_pending(mdev); + drbd_rs_complete_io(mdev,sector); + drbd_rs_failed_io(mdev, sector, size); + return TRUE; } Index: drbd/drbd_actlog.c =================================================================== --- drbd/drbd_actlog.c (.../trunk) (revision 5016) +++ drbd/drbd_actlog.c (.../branches/panic-2) (revision 5016) @@ -605,7 +605,7 @@ kfree(udw); - if(drbd_bm_total_weight(mdev) == 0 && + if(drbd_bm_total_weight(mdev) <= mdev->rs_failed && ( mdev->state.conn == SyncSource || mdev->state.conn == SyncTarget || mdev->state.conn == PausedSyncS || mdev->state.conn == PausedSyncT ) ) { D_ASSERT( mdev->resync_work.cb == w_resync_inactive ); @@ -624,7 +624,7 @@ * TODO will be obsoleted once we have a caching lru of the on disk bitmap */ STATIC void drbd_try_clear_on_disk_bm(struct Drbd_Conf *mdev,sector_t sector, - int cleared) + int cleared, int success) { struct bm_extent* ext; struct update_odbm_work * udw; @@ -640,11 +640,14 @@ ext = (struct bm_extent *) lc_get(mdev->resync,enr); if (ext) { if( ext->lce.lc_number == enr) { - ext->rs_left -= cleared; - if (ext->rs_left < 0) { - ERR("BAD! sector=%lu enr=%u rs_left=%d cleared=%d\n", + if (success) + ext->rs_left -= cleared; + else + ext->rs_failed += cleared; + if (ext->rs_left < ext->rs_failed) { + ERR("BAD! sector=%lu enr=%u rs_left=%d rs_failed=%d cleared=%d\n", (unsigned long)sector, - ext->lce.lc_number, ext->rs_left, cleared); + ext->lce.lc_number, ext->rs_left, ext->rs_failed, cleared); // FIXME brrrgs. should never happen! drbd_force_state(mdev,NS(conn,Disconnecting)); return; @@ -665,12 +668,16 @@ ext->flags = 0; } ext->rs_left = rs_left; + if (!success) + ext->rs_failed = cleared; lc_changed(mdev->resync,&ext->lce); } lc_put(mdev->resync,&ext->lce); // no race, we are within the al_lock! - if (ext->rs_left == 0) { + if (ext->rs_left <= ext->rs_failed) { + ext->rs_left = ext->rs_failed = 0; + udw=kmalloc(sizeof(*udw),GFP_ATOMIC); if(udw) { udw->enr = ext->lce.lc_number; @@ -729,7 +736,7 @@ } sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); - MTRACE(TraceTypePacket, TraceLvlAll, + MTRACE(TraceTypeResync, TraceLvlMetrics, INFO("drbd_set_in_sync: sector=%llx size=%x sbnr=%lx ebnr=%lx\n", (long long)sector, size, sbnr, ebnr); ); @@ -753,7 +760,7 @@ mdev->rs_mark_left =drbd_bm_total_weight(mdev); } } - drbd_try_clear_on_disk_bm(mdev,sector,count); + drbd_try_clear_on_disk_bm(mdev,sector,count,TRUE); /* just wake_up unconditional now, * various lc_chaged(), lc_put() in drbd_try_clear_on_disk_bm(). */ wake_up=1; @@ -792,7 +799,7 @@ sbnr = BM_SECT_TO_BIT(sector); ebnr = BM_SECT_TO_BIT(esector); - MTRACE(TraceTypePacket, TraceLvlAll, + MTRACE(TraceTypeResync, TraceLvlMetrics, INFO("drbd_set_out_of_sync: sector=%llx size=%x sbnr=%lx ebnr=%lx\n", (long long)sector, size, sbnr, ebnr); ); @@ -877,6 +884,11 @@ struct bm_extent* bm_ext; int i, sig; + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("drbd_rs_begin_io: sector=%llx\n", + (long long)sector); + ); + sig = wait_event_interruptible( mdev->al_wait, (bm_ext = _bme_get(mdev,enr)) ); if (sig) return 0; @@ -909,6 +921,11 @@ struct bm_extent* bm_ext; unsigned long flags; + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("drbd_rs_complete_io: sector=%llx\n", + (long long)sector); + ); + spin_lock_irqsave(&mdev->al_lock,flags); bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr); if(!bm_ext) { @@ -936,6 +953,10 @@ struct bm_extent* bm_ext; int i; + MTRACE(TraceTypeResync, TraceLvlMetrics, + INFO("drbd_rs_cancel_all\n"); + ); + spin_lock_irq(&mdev->al_lock); if(inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there. @@ -964,6 +985,10 @@ struct bm_extent* bm_ext; int i; + MTRACE(TraceTypeResync, TraceLvlMetrics, + INFO("drbd_rs_del_all\n"); + ); + spin_lock_irq(&mdev->al_lock); if(inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there. @@ -981,3 +1006,69 @@ } spin_unlock_irq(&mdev->al_lock); } + +/* Record information on a failure to resync the specified blocks + * + * called on SyncTarget when resync write fails or NegRSDReply received + * + */ +void drbd_rs_failed_io(drbd_dev* mdev, sector_t sector, int size) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr,ebnr,lbnr,bnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + int wake_up=0; + + MTRACE(TraceTypeResync, TraceLvlSummary, + INFO("drbd_rs_failed_io: sector=%llx, size=%x\n", + (long long)sector,size); + ); + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + ERR("drbd_rs_failed_io: sector=%lu size=%d nonsense!\n", + (unsigned long)sector,size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* + * round up start sector, round down end sector. we make sure we only + * handle full, alligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) { + return; + } else if (unlikely(esector == (nr_sectors-1))) { + ebnr = lbnr; + } else { + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + } + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + + if (sbnr > ebnr) return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irq(&mdev->al_lock); + for(bnr=sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_test_bit(mdev,bnr)>0) count++; + } + if (count) { + mdev->rs_failed += count; + + drbd_try_clear_on_disk_bm(mdev,sector,count,FALSE); + + /* just wake_up unconditional now, + * various lc_chaged(), lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up=1; + } + spin_unlock_irq(&mdev->al_lock); + if(wake_up) wake_up(&mdev->al_wait); +} Index: drbd/drbd_worker.c =================================================================== --- drbd/drbd_worker.c (.../trunk) (revision 5016) +++ drbd/drbd_worker.c (.../branches/panic-2) (revision 5016) @@ -384,6 +384,7 @@ int drbd_resync_finished(drbd_dev* mdev) { unsigned long db,dt,dbdt; + int dstate, pdstate; dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; if (dt <= 0) dt=1; @@ -393,43 +394,61 @@ INFO("Resync done (total %lu sec; paused %lu sec; %lu K/sec)\n", dt + mdev->rs_paused, mdev->rs_paused, dbdt); - D_ASSERT(drbd_bm_total_weight(mdev) == 0); - mdev->rs_total = 0; - mdev->rs_paused = 0; + D_ASSERT((drbd_bm_total_weight(mdev)-mdev->rs_failed) == 0); - if ( mdev->state.conn == SyncTarget ) { - if( mdev->p_uuid ) { - int i; - for ( i=Bitmap ; i<=History_end ; i++ ) { - _drbd_uuid_set(mdev,i,mdev->p_uuid[i]); + if (mdev->rs_failed) { + INFO(" %lu failed blocks\n",mdev->rs_failed); + + if (mdev->state.conn == SyncTarget) { + dstate = Inconsistent; + pdstate = UpToDate; + } + else { + dstate = UpToDate; + pdstate = Inconsistent; + } + } + else { + dstate = pdstate = UpToDate; + + if ( mdev->state.conn == SyncTarget) { + if( mdev->p_uuid ) { + int i; + for ( i=Bitmap ; i<=History_end ; i++ ) { + _drbd_uuid_set(mdev,i,mdev->p_uuid[i]); + } + drbd_uuid_set_bm(mdev,0UL); //Rotate peer's bitmap-UUID + drbd_uuid_set(mdev,Current,mdev->p_uuid[Current]); + } else { + ERR("mdev->p_uuid is NULL! BUG\n"); } - drbd_uuid_set_bm(mdev,0UL); //Rotate peer's bitmap-UUID - drbd_uuid_set(mdev,Current,mdev->p_uuid[Current]); - } else { - ERR("mdev->p_uuid is NULL! BUG\n"); } + + drbd_uuid_set_bm(mdev,0UL); + + if ( mdev->p_uuid ) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + } } - drbd_uuid_set_bm(mdev,0UL); + mdev->rs_total = 0; + mdev->rs_failed = 0; + mdev->rs_paused = 0; - // Remove all elements form the resync LRU. Since future actions + // Remove all elements from the resync LRU. Since future actions // might set bits in the (main) bitmap, then the entries in the // resync LRU would be wrong. drbd_rs_del_all(mdev); - if ( mdev->p_uuid ) { - kfree(mdev->p_uuid); - mdev->p_uuid = NULL; - } - if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags)) { WARN("Writing the whole bitmap, due to failed kmalloc\n"); drbd_bm_write(mdev); } drbd_request_state(mdev,NS3(conn,Connected, - disk,UpToDate, - pdsk,UpToDate)); + disk,dstate, + pdsk,pdstate)); drbd_md_sync(mdev); @@ -453,9 +472,12 @@ if(likely(drbd_bio_uptodate(e->private_bio))) { ok=drbd_send_block(mdev, DataReply, e); } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegDReply. sector=%llx.\n", + (unsigned long long)e->sector); + ok=drbd_send_ack(mdev,NegDReply,e); - if (DRBD_ratelimit(5*HZ,5)) - ERR("Sending NegDReply. I guess it gets messy.\n"); + /* FIXME we should not detach for read io-errors, in particular * not now: when the peer asked us for our data, we are likely * the only remaining disk... */ @@ -478,7 +500,7 @@ } /** - * w_e_end_data_req: Send the answer (RSDataReply) to a RSDataRequest. + * w_e_end_rsdata_req: Send the answer (RSDataReply) to a RSDataRequest. */ int w_e_end_rsdata_req(drbd_dev *mdev, struct drbd_work *w, int cancel) { @@ -503,10 +525,16 @@ ok=1; } } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegRSDReply. sector %llx.\n", + (unsigned long long)e->sector); + ok=drbd_send_ack(mdev,NegRSDReply,e); - if (DRBD_ratelimit(5*HZ,5)) - ERR("Sending NegRSDReply. I guess it gets messy.\n"); + drbd_io_error(mdev, FALSE); + + // update resync data with failure + drbd_rs_failed_io(mdev, e->sector, e->size); } dec_unacked(mdev); @@ -745,6 +773,11 @@ drbd_state_t os,ns; int r=0; + MTRACE(TraceTypeResync, TraceLvlSummary, + INFO("Resync starting: side=%s\n", + side==SyncTarget?"SyncTarget":"SyncSource"); + ); + if(side == SyncTarget) { drbd_bm_reset_find(mdev); } else /* side == SyncSource */ { @@ -776,6 +809,7 @@ if ( r == SS_Success ) { mdev->rs_total = mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_failed = 0; mdev->rs_paused = 0; mdev->rs_start = mdev->rs_mark_time = jiffies; Index: drbd/linux/drbd_config.h =================================================================== --- drbd/linux/drbd_config.h (.../trunk) (revision 5016) +++ drbd/linux/drbd_config.h (.../branches/panic-2) (revision 5016) @@ -50,6 +50,6 @@ // #define DRBD_DISABLE_SENDPAGE // Enable fault insertion code -//#define DRBD_ENABLE_FAULTS +#define DRBD_ENABLE_FAULTS #endif Index: drbd/drbd_main.c =================================================================== --- drbd/drbd_main.c (.../trunk) (revision 5016) +++ drbd/drbd_main.c (.../branches/panic-2) (revision 5016) @@ -2065,6 +2065,7 @@ mdev->p_size = mdev->rs_start = mdev->rs_total = + mdev->rs_failed = mdev->rs_mark_left = mdev->rs_mark_time = 0; D_ASSERT(mdev->net_conf == NULL); @@ -2874,8 +2875,8 @@ unsigned int rnd = ((_drbd_fault_random(&rrs) % 100) + 1); unsigned int ret = (rnd <= fault_rate); - if (ret) - printk(KERN_ALERT "Simulating %s failure\n", _drbd_fault_str(type)); + if (ret && printk_ratelimit()) + printk(KERN_ALERT "Simulating %s failure\n", _drbd_fault_str(type)); return ret; } @@ -3081,6 +3082,15 @@ } \ } while (0) +char *_dump_block_id(u64 block_id, char *buff) { + if (is_syncer_block_id(block_id)) + strcpy(buff,"SyncerId"); + else + sprintf(buff,"%llx",block_id); + + return buff; +} + void _dump_packet(drbd_dev *mdev, struct socket *sock, int recv, Drbd_Polymorph_Packet *p, char* file, int line) @@ -3099,40 +3109,41 @@ break; case Data: - INFOP("%s (sector %llx, id %llx, seq %x, f %x)\n", cmdname(cmd), - (long long)be64_to_cpu(p->Data.sector), - (long long)be64_to_cpu(p->Data.block_id), - be32_to_cpu(p->Data.seq_num), - be32_to_cpu(p->Data.dp_flags) - ); + INFOP("%s (sector %llx, id %s, seq %x, f %x)\n", cmdname(cmd), + (long long)be64_to_cpu(p->Data.sector), + _dump_block_id(p->Data.block_id,tmp), + be32_to_cpu(p->Data.seq_num), + be32_to_cpu(p->Data.dp_flags) + ); break; case DataReply: case RSDataReply: - INFOP("%s (sector %llx, id %llx)\n", cmdname(cmd), - (long long)be64_to_cpu(p->Data.sector), - (long long)be64_to_cpu(p->Data.block_id) - ); + INFOP("%s (sector %llx, id %s)\n", cmdname(cmd), + (long long)be64_to_cpu(p->Data.sector), + _dump_block_id(p->Data.block_id,tmp) + ); break; case RecvAck: case WriteAck: case NegAck: - INFOP("%s (sector %llx, size %x, id %llx, seq %x)\n", cmdname(cmd), - (long long)be64_to_cpu(p->BlockAck.sector), - be32_to_cpu(p->BlockAck.blksize), - (long long)be64_to_cpu(p->BlockAck.block_id), - be32_to_cpu(p->BlockAck.seq_num) - ); + case NegRSDReply: + INFOP("%s (sector %llx, size %x, id %s, seq %x)\n", cmdname(cmd), + (long long)be64_to_cpu(p->BlockAck.sector), + be32_to_cpu(p->BlockAck.blksize), + _dump_block_id(p->BlockAck.block_id,tmp), + be32_to_cpu(p->BlockAck.seq_num) + ); break; case DataRequest: case RSDataRequest: - INFOP("%s (sector %llx, size %x, id %llx)\n", cmdname(cmd), - (long long)be64_to_cpu(p->BlockRequest.sector), - be32_to_cpu(p->BlockRequest.blksize), - (long long)be64_to_cpu(p->BlockRequest.block_id) - ); + INFOP("%s (sector %llx, size %x, id %s)\n", cmdname(cmd), + (long long)be64_to_cpu(p->BlockRequest.sector), + be32_to_cpu(p->BlockRequest.blksize), + _dump_block_id(p->BlockRequest.block_id,tmp) + ); break; case Barrier: @@ -3150,11 +3161,11 @@ case ReportSizes: INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, max bio %x, q order %x)\n", cmdname(cmd), - (long long)(be64_to_cpu(p->Sizes.d_size)>>(20-9)), - (long long)(be64_to_cpu(p->Sizes.u_size)>>(20-9)), - (long long)(be64_to_cpu(p->Sizes.c_size)>>(20-9)), - be32_to_cpu(p->Sizes.max_segment_size), - be32_to_cpu(p->Sizes.queue_order_type)); + (long long)(be64_to_cpu(p->Sizes.d_size)>>(20-9)), + (long long)(be64_to_cpu(p->Sizes.u_size)>>(20-9)), + (long long)(be64_to_cpu(p->Sizes.c_size)>>(20-9)), + be32_to_cpu(p->Sizes.max_segment_size), + be32_to_cpu(p->Sizes.queue_order_type)); break; case ReportState: @@ -3173,13 +3184,13 @@ case StateChgReply: INFOP("%s (ret %x)\n", cmdname(cmd), - be32_to_cpu(p->RqSReply.retcode)); + be32_to_cpu(p->RqSReply.retcode)); break; case DiscardNote: INFOP("%s (id %llx, seq %x)\n", cmdname(cmd), - (long long)be64_to_cpu(p->Discard.block_id), - be32_to_cpu(p->Discard.seq_num)); + (long long)p->Discard.block_id, + be32_to_cpu(p->Discard.seq_num)); break; case Ping: Index: drbd/drbd_int.h =================================================================== --- drbd/drbd_int.h (.../trunk) (revision 5016) +++ drbd/drbd_int.h (.../branches/panic-2) (revision 5016) @@ -801,6 +801,7 @@ // sector_t rs_left; // blocks not up-to-date [unit BM_BLOCK_SIZE] // moved into bitmap->bm_set unsigned long rs_total; // blocks to sync in this run [unit BM_BLOCK_SIZE] + unsigned long rs_failed; // number of sync IOs that failed in this run unsigned long rs_start; // Syncer's start time [unit jiffies] unsigned long rs_paused; // cumulated time in PausedSyncX state [unit jiffies] unsigned long rs_mark_left;// block not up-to-date at mark [unit BM_BLOCK_SIZE] @@ -1008,6 +1009,7 @@ struct bm_extent { struct lc_element lce; int rs_left; //number of bits set (out of sync) in this extent. + int rs_failed; // number of failed resync requests in this extent. unsigned long flags; }; @@ -1174,6 +1176,7 @@ TraceTypePacket = 0x00000001, TraceTypeRq = 0x00000002, TraceTypeUuid = 0x00000004, + TraceTypeResync = 0x00000008, }; static inline int @@ -1319,6 +1322,7 @@ extern int drbd_rs_begin_io(struct Drbd_Conf *mdev, sector_t sector); extern void drbd_rs_cancel_all(drbd_dev* mdev); extern void drbd_rs_del_all(drbd_dev* mdev); +extern void drbd_rs_failed_io(drbd_dev* mdev, sector_t sector, int size); extern int drbd_al_read_log(struct Drbd_Conf *mdev,struct drbd_backing_dev *); extern void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); #define drbd_set_in_sync(mdev,sector,size) \ @@ -1397,14 +1401,16 @@ switch(mdev->bc->dc.on_io_error) { case PassOn: /* FIXME would this be better named "Ignore"? */ if (!forcedetach) { - ERR("Local IO failed. Passing error on...\n"); + if (printk_ratelimit()) + ERR("Local IO failed. Passing error on...\n"); break; } /* NOTE fall through to detach case if forcedetach set */ case Detach: if (_drbd_set_state(_NS(mdev,disk,Failed),ChgStateHard) == SS_Success) { - ERR("Local IO failed. Detaching...\n"); + if (printk_ratelimit()) + ERR("Local IO failed. Detaching...\n"); } break; case CallIOEHelper: