Actions
Bug #9809
closed — common/perf_counters.cc: 105: FAILED assert(idx < m_upper_bound)
Status:
Rejected
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:
0%
Source:
other
Tags:
Backport:
Regression:
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
Steps to reproduce
- modify vstart.sh with
@@ -337,6 +337,9 @@ if [ "$start_mon" -eq 1 ]; then osd pg bits = 3 osd pgp bits = 5 ; (invalid, but ceph should cope!) osd crush chooseleaf type = 0 + osd pool default size = 2 + osd_max_backfills = 2 + osd min pg log entries = 5 osd pool default min size = 1 osd pool default erasure code directory = .libs osd pool default erasure code profile = plugin=jerasure technique=reed_sol_van k=2 m=1 ruleset-failure-domain=osd
- rm -fr dev out ; mkdir -p dev ; MON=1 OSD=3 ./vstart.sh -d -n -l mon osd
- rados -p rbd bench 1 write -b 4096 --no-cleanup
- ceph tell osd.* injectargs -- --osd_backfill_full_ratio 0.95 # because my disk is 90% full already
- ceph tell osd.2 injectargs -- --osd_backfill_full_ratio 0
- pkill -f 'ceph-osd -i 0'
- ceph osd out 0
- sleep 120
- osd.2 crashes with
-29> 2014-10-17 16:45:04.101261 7f2f6a9e5700 10 osd.2 15 do_waiters -- start -28> 2014-10-17 16:45:04.101263 7f2f6a9e5700 10 osd.2 15 do_waiters -- finish -27> 2014-10-17 16:45:04.101264 7f2f624d5700 10 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] handle_peering_event: epoch_sent: 15 epoch_requested: 15 RequestBackfillPrio: priority 201 -26> 2014-10-17 16:45:04.101301 7f2f624d5700 10 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] state<Started/ReplicaActive/RepNotRecovering>: backfill reservation rejected: full ratio is 0.896747, which is greater than max allowed ratio 0 -25> 2014-10-17 16:45:04.101342 7f2f624d5700 5 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] exit Started/ReplicaActive/RepNotRecovering 0.394974 4 0.000311 -24> 2014-10-17 16:45:04.101384 7f2f624d5700 5 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] enter Started/ReplicaActive/RepWaitBackfillReserved -23> 2014-10-17 16:45:04.101440 7f2f624d5700 20 osd.2 15 share_map_peer 0x5c334a0 already has epoch 15 -22> 2014-10-17 16:45:04.101453 7f2f624d5700 1 -- 127.0.0.1:6809/8024 --> 127.0.0.1:6805/7805 -- MBackfillReserve REJECT pgid: 0.6, query_epoch: 15 v3 -- ?+0 0x5576940 con 0x5c334a0 -21> 2014-10-17 16:45:04.101490 7f2f624d5700 5 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000106 0 0.000000 -20> 2014-10-17 16:45:04.101536 7f2f624d5700 5 osd.2 
pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] enter Started/ReplicaActive/RepNotRecovering -19> 2014-10-17 16:45:04.101579 7f2f624d5700 10 log is not dirty -18> 2014-10-17 16:45:04.101988 7f2f6a9e5700 1 -- 127.0.0.1:6809/8024 <== osd.1 127.0.0.1:6805/7805 27 ==== MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 v3 ==== 30+0+0 (3475927235 0 0) 0x5576940 con 0x5c334a0 -17> 2014-10-17 16:45:04.102016 7f2f6a9e5700 10 osd.2 15 do_waiters -- start -16> 2014-10-17 16:45:04.102019 7f2f6a9e5700 10 osd.2 15 do_waiters -- finish -15> 2014-10-17 16:45:04.102021 7f2f6a9e5700 20 osd.2 15 _dispatch 0x5576940 MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 v3 -14> 2014-10-17 16:45:04.102028 7f2f6a9e5700 5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.101868, event: header_read, op: MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 -13> 2014-10-17 16:45:04.102034 7f2f6a9e5700 5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.101870, event: throttled, op: MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 -12> 2014-10-17 16:45:04.102039 7f2f6a9e5700 5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.101925, event: all_read, op: MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 -11> 2014-10-17 16:45:04.102043 7f2f6a9e5700 5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.102015, event: dispatched, op: MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 -10> 2014-10-17 16:45:04.102049 7f2f6a9e5700 5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.102048, event: waiting_for_osdmap, op: MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 -9> 2014-10-17 16:45:04.102053 7f2f6a9e5700 15 osd.2 15 require_same_or_newer_map 15 (i am 15) 0x5576940 -8> 2014-10-17 16:45:04.102078 7f2f6a9e5700 5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.102078, event: done, op: MBackfillReserve GRANT pgid: 0.1, query_epoch: 15 -7> 2014-10-17 16:45:04.102102 
7f2f6a9e5700 10 osd.2 15 do_waiters -- start -6> 2014-10-17 16:45:04.102108 7f2f6a9e5700 10 osd.2 15 do_waiters -- finish -5> 2014-10-17 16:45:04.102107 7f2f62d26700 10 osd.2 pg_epoch: 15 pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill] handle_peering_event: epoch_sent: 15 epoch_requested: 15 RemoteBackfillReserved -4> 2014-10-17 16:45:04.102168 7f2f62d26700 5 osd.2 pg_epoch: 15 pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill] exit Started/Primary/Active/WaitRemoteBackfillReserved 0.001372 1 0.000136 -3> 2014-10-17 16:45:04.102213 7f2f62d26700 5 osd.2 pg_epoch: 15 pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill] enter Started/Primary/Active/Backfilling -2> 2014-10-17 16:45:04.102253 7f2f62d26700 10 osd.2 15 queue_for_recovery queued pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill] -1> 2014-10-17 16:45:04.102267 7f2f5c95a700 10 osd.2 15 do_recovery can start 5 (0/15 rops) 0> 2014-10-17 16:45:04.134123 7f2f62d26700 -1 common/perf_counters.cc: In function 'void PerfCounters::inc(int, uint64_t)' thread 7f2f62d26700 time 2014-10-17 16:45:04.102285 common/perf_counters.cc: 105: FAILED assert(idx < m_upper_bound) ceph version 0.86-415-g66cffd7 (66cffd72f521dc9140c910a98335ee009616b41b) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x95) [0x191ed7b] 2: (PerfCounters::inc(int, unsigned long)+0x87) [0x1a07bff] ... very long boost prototypes trimmed ... 
26: (PG::RecoveryState::handle_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x4d) [0x151988d] 27: (PG::handle_peering_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x37f) [0x14fc325] 28: (OSD::process_peering_events(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x26b) [0x130cf43] 29: (OSD::PeeringWQ::_process(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x30) [0x13279e4] 30: (ThreadPool::BatchWorkQueue<PG>::_void_process(void*, ThreadPool::TPHandle&)+0x33) [0x13bd063] 31: (ThreadPool::worker(ThreadPool::WorkThread*)+0x734) [0x190ec04] 32: (ThreadPool::WorkThread::entry()+0x23) [0x1912d9b] 33: (Thread::entry_wrapper()+0x84) [0x1905148] 34: (Thread::_entry_func(void*)+0x18) [0x19050ba] 35: (()+0x8182) [0x7f2f7cdac182] 36: (clone()+0x6d) [0x7f2f7b39a30d]
Files
Updated by Loïc Dachary over 9 years ago
- Status changed from New to Rejected
I had changed the code myself, which introduced the problem, and then forgot about the change. Reverting it fixes the problem. (Root cause, visible in the diff below: the new `rs_backfill` / `rs_wait_local_backfill` / `rs_wait_remote_backfill` counters were registered on the `recoverystate_perf` PerfCounters instance in OSD.cc, but PG.cc increments them through `pg->osd->logger` — a different PerfCounters object whose index range does not include them, so `PerfCounters::inc()` trips `assert(idx < m_upper_bound)`.)
3 files changed, 8 insertions(+) src/osd/OSD.cc | 3 +++ src/osd/OSD.h | 3 +++ src/osd/PG.cc | 2 ++ Modified src/osd/OSD.cc diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 322cf5c..3644f9c 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2306,6 +2306,9 @@ void OSD::create_recoverystate_perf() rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency"); rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency"); rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency"); + rs_perf.add_u64(rs_backfill, "backfill"); + rs_perf.add_u64(rs_wait_local_backfill, "wait_local_backfill"); + rs_perf.add_u64(rs_wait_remote_backfill, "wait_remote_backfill"); recoverystate_perf = rs_perf.create_perf_counters(); cct->get_perfcounters_collection()->add(recoverystate_perf); Modified src/osd/OSD.h diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 426a2b1..948883b 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -176,6 +176,9 @@ enum { rs_incomplete_latency, rs_getmissing_latency, rs_waitupthru_latency, + rs_backfill, + rs_wait_local_backfill, + rs_wait_remote_backfill, rs_last, }; Modified src/osd/PG.cc diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 7ef000e..52f9751 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5640,6 +5640,7 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx) pg->state_clear(PG_STATE_BACKFILL_TOOFULL); pg->state_clear(PG_STATE_BACKFILL_WAIT); pg->state_set(PG_STATE_BACKFILL); + pg->osd->logger->inc(rs_backfill); } boost::statechart::result @@ -5685,6 +5686,7 @@ void PG::RecoveryState::Backfilling::exit() pg->state_clear(PG_STATE_BACKFILL); utime_t dur = ceph_clock_now(pg->cct) - enter_time; pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur); + pg->osd->logger->inc(rs_backfill); } /*--WaitRemoteBackfillReserved--*/
Actions