Project

General

Profile

Actions

Bug #9809

closed

common/perf_counters.cc: 105: FAILED assert(idx < m_upper_bound)

Added by Loïc Dachary over 9 years ago. Updated over 9 years ago.

Status:
Rejected
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
other
Tags:
Backport:
Regression:
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

Steps to reproduce

  • modify vstart.sh with
    @@ -337,6 +337,9 @@ if [ "$start_mon" -eq 1 ]; then
             osd pg bits = 3
             osd pgp bits = 5  ; (invalid, but ceph should cope!)
             osd crush chooseleaf type = 0
    +        osd pool default size = 2
    +        osd_max_backfills = 2
    +        osd min pg log entries = 5
             osd pool default min size = 1
             osd pool default erasure code directory = .libs
             osd pool default erasure code profile = plugin=jerasure technique=reed_sol_van k=2 m=1 ruleset-failure-domain=osd
    
  • rm -fr dev out ; mkdir -p dev ; MON=1 OSD=3 ./vstart.sh -d -n -l mon osd
  • rados -p rbd bench 1 write -b 4096 --no-cleanup
  • ceph tell osd.* injectargs -- --osd_backfill_full_ratio 0.95 # because my disk is 90% full already
  • ceph tell osd.2 injectargs -- --osd_backfill_full_ratio 0
  • pkill -f 'ceph-osd -i 0'
  • ceph osd out 0
  • sleep 120
  • osd.2 crashes with
       -29> 2014-10-17 16:45:04.101261 7f2f6a9e5700 10 osd.2 15 do_waiters -- start
       -28> 2014-10-17 16:45:04.101263 7f2f6a9e5700 10 osd.2 15 do_waiters -- finish
       -27> 2014-10-17 16:45:04.101264 7f2f624d5700 10 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] handle_peering_event: epoch_sent: 15 epoch_requested: 15 RequestBackfillPrio: priority 201
       -26> 2014-10-17 16:45:04.101301 7f2f624d5700 10 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] state<Started/ReplicaActive/RepNotRecovering>: backfill reservation rejected: full ratio is 0.896747, which is greater than max allowed ratio 0
       -25> 2014-10-17 16:45:04.101342 7f2f624d5700  5 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] exit Started/ReplicaActive/RepNotRecovering 0.394974 4 0.000311
       -24> 2014-10-17 16:45:04.101384 7f2f624d5700  5 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] enter Started/ReplicaActive/RepWaitBackfillReserved
       -23> 2014-10-17 16:45:04.101440 7f2f624d5700 20 osd.2 15 share_map_peer 0x5c334a0 already has epoch 15
       -22> 2014-10-17 16:45:04.101453 7f2f624d5700  1 -- 127.0.0.1:6809/8024 --> 127.0.0.1:6805/7805 -- MBackfillReserve REJECT  pgid: 0.6, query_epoch: 15 v3 -- ?+0 0x5576940 con 0x5c334a0
       -21> 2014-10-17 16:45:04.101490 7f2f624d5700  5 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000106 0 0.000000
       -20> 2014-10-17 16:45:04.101536 7f2f624d5700  5 osd.2 pg_epoch: 15 pg[0.6( v 10'8 (10'3,10'8] lb 0//0//-1 local-les=0 n=0 ec=1 les/c 15/12 13/14/11) [1,2]/[1] r=-1 lpr=15 pi=11-13/2 luod=0'0 crt=0'0 lcod 0'0 active] enter Started/ReplicaActive/RepNotRecovering
       -19> 2014-10-17 16:45:04.101579 7f2f624d5700 10 log is not dirty
       -18> 2014-10-17 16:45:04.101988 7f2f6a9e5700  1 -- 127.0.0.1:6809/8024 <== osd.1 127.0.0.1:6805/7805 27 ==== MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15 v3 ==== 30+0+0 (3475927235 0 0) 0x5576940 con 0x5c334a0
       -17> 2014-10-17 16:45:04.102016 7f2f6a9e5700 10 osd.2 15 do_waiters -- start
       -16> 2014-10-17 16:45:04.102019 7f2f6a9e5700 10 osd.2 15 do_waiters -- finish
       -15> 2014-10-17 16:45:04.102021 7f2f6a9e5700 20 osd.2 15 _dispatch 0x5576940 MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15 v3
       -14> 2014-10-17 16:45:04.102028 7f2f6a9e5700  5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.101868, event: header_read, op: MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15
       -13> 2014-10-17 16:45:04.102034 7f2f6a9e5700  5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.101870, event: throttled, op: MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15
       -12> 2014-10-17 16:45:04.102039 7f2f6a9e5700  5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.101925, event: all_read, op: MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15
       -11> 2014-10-17 16:45:04.102043 7f2f6a9e5700  5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.102015, event: dispatched, op: MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15
       -10> 2014-10-17 16:45:04.102049 7f2f6a9e5700  5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.102048, event: waiting_for_osdmap, op: MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15
        -9> 2014-10-17 16:45:04.102053 7f2f6a9e5700 15 osd.2 15 require_same_or_newer_map 15 (i am 15) 0x5576940
        -8> 2014-10-17 16:45:04.102078 7f2f6a9e5700  5 -- op tracker -- seq: 126, time: 2014-10-17 16:45:04.102078, event: done, op: MBackfillReserve GRANT  pgid: 0.1, query_epoch: 15
        -7> 2014-10-17 16:45:04.102102 7f2f6a9e5700 10 osd.2 15 do_waiters -- start
        -6> 2014-10-17 16:45:04.102108 7f2f6a9e5700 10 osd.2 15 do_waiters -- finish
        -5> 2014-10-17 16:45:04.102107 7f2f62d26700 10 osd.2 pg_epoch: 15 pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill] handle_peering_event: epoch_sent: 15 epoch_requested: 15 RemoteBackfillReserved
        -4> 2014-10-17 16:45:04.102168 7f2f62d26700  5 osd.2 pg_epoch: 15 pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill] exit Started/Primary/Active/WaitRemoteBackfillReserved 0.001372 1 0.000136
        -3> 2014-10-17 16:45:04.102213 7f2f62d26700  5 osd.2 pg_epoch: 15 pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill] enter Started/Primary/Active/Backfilling
        -2> 2014-10-17 16:45:04.102253 7f2f62d26700 10 osd.2 15 queue_for_recovery queued pg[0.1( v 10'16 (10'5,10'16] local-les=15 n=16 ec=1 les/c 15/12 13/14/9) [2,1]/[2] r=0 lpr=14 pi=11-13/2 bft=1 crt=10'5 lcod 10'15 mlcod 0'0 active+undersized+degraded+remapped+wait_backfill]
        -1> 2014-10-17 16:45:04.102267 7f2f5c95a700 10 osd.2 15 do_recovery can start 5 (0/15 rops)
         0> 2014-10-17 16:45:04.134123 7f2f62d26700 -1 common/perf_counters.cc: In function 'void PerfCounters::inc(int, uint64_t)' thread 7f2f62d26700 time 2014-10-17 16:45:04.102285
    common/perf_counters.cc: 105: FAILED assert(idx < m_upper_bound)
    
     ceph version 0.86-415-g66cffd7 (66cffd72f521dc9140c910a98335ee009616b41b)
     1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x95) [0x191ed7b]
     2: (PerfCounters::inc(int, unsigned long)+0x87) [0x1a07bff]
    ... very long boost prototypes trimmed ...
     26: (PG::RecoveryState::handle_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x4d) [0x151988d]
     27: (PG::handle_peering_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x37f) [0x14fc325]
     28: (OSD::process_peering_events(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x26b) [0x130cf43]
     29: (OSD::PeeringWQ::_process(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x30) [0x13279e4]
     30: (ThreadPool::BatchWorkQueue<PG>::_void_process(void*, ThreadPool::TPHandle&)+0x33) [0x13bd063]
     31: (ThreadPool::worker(ThreadPool::WorkThread*)+0x734) [0x190ec04]
     32: (ThreadPool::WorkThread::entry()+0x23) [0x1912d9b]
     33: (Thread::entry_wrapper()+0x84) [0x1905148]
     34: (Thread::_entry_func(void*)+0x18) [0x19050ba]
     35: (()+0x8182) [0x7f2f7cdac182]
     36: (clone()+0x6d) [0x7f2f7b39a30d]
    

Files

osd.2.log.gz (385 KB) osd.2.log.gz full osd.2 logs Loïc Dachary, 10/17/2014 04:50 PM
Actions #1

Updated by Loïc Dachary over 9 years ago

  • Status changed from New to Rejected

I changed the code and introduced the problem and then forgot I changed the code. Reverting the change fixes the problem.

3 files changed, 8 insertions(+)
 src/osd/OSD.cc | 3 +++
 src/osd/OSD.h  | 3 +++
 src/osd/PG.cc  | 2 ++

    Modified   src/osd/OSD.cc
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 322cf5c..3644f9c 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2306,6 +2306,9 @@ void OSD::create_recoverystate_perf()
   rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency");
   rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency");
   rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency");
+  rs_perf.add_u64(rs_backfill, "backfill");
+  rs_perf.add_u64(rs_wait_local_backfill, "wait_local_backfill");
+  rs_perf.add_u64(rs_wait_remote_backfill, "wait_remote_backfill");

   recoverystate_perf = rs_perf.create_perf_counters();
   cct->get_perfcounters_collection()->add(recoverystate_perf);
    Modified   src/osd/OSD.h
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 426a2b1..948883b 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -176,6 +176,9 @@ enum {
   rs_incomplete_latency,
   rs_getmissing_latency,
   rs_waitupthru_latency,
+  rs_backfill,
+  rs_wait_local_backfill,
+  rs_wait_remote_backfill,
   rs_last,
 };

    Modified   src/osd/PG.cc
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 7ef000e..52f9751 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -5640,6 +5640,7 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
   pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
   pg->state_clear(PG_STATE_BACKFILL_WAIT);
   pg->state_set(PG_STATE_BACKFILL);
+  pg->osd->logger->inc(rs_backfill);
 }

 boost::statechart::result
@@ -5685,6 +5686,7 @@ void PG::RecoveryState::Backfilling::exit()
   pg->state_clear(PG_STATE_BACKFILL);
   utime_t dur = ceph_clock_now(pg->cct) - enter_time;
   pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
+  pg->osd->logger->inc(rs_backfill);
 }

 /*--WaitRemoteBackfillReserved--*/

Actions

Also available in: Atom PDF