scenario:
1. 3-replica
2. space is nearlly full(some osd >96%)
We guess the reason is osd continuously receiving two "MBackfillReserve" message,
first cause "Backfilling" status convert to "NotBackfilling"
then osd crashed when the second arrival because osd still in "NotBackfilling"
gdb info:
(gdb) p evt._M_ptr->evt.px
$4 = (PG::RemoteReservationRejected *) 0x2bc42d80
stack info:
#0 0x00007faad75dcb7b in raise () from /lib/x86_64-linux-gnu/libpthread.so.0
#1 0x000000000099650e in reraise_fatal (signum=6) at global/signal_handler.cc:59
#2 handle_fatal_signal (signum=6) at global/signal_handler.cc:105
#3 <signal handler called>
#4 0x00007faad5b1e425 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#5 0x00007faad5b21b8b in abort () from /lib/x86_64-linux-gnu/libc.so.6
#6 0x00007faad647169d in _gnu_cxx::_verbose_terminate_handler() () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#7 0x00007faad646f846 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#8 0x00007faad646f873 in std::terminate() () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#9 0x00007faad646f96e in _cxa_throw () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#10 0x0000000000a778ff in ceph::_ceph_assert_fail (assertion=0xb88e28 "0 == \"we got a bad state machine event\"",
file=<optimized out>, line=5325,
func=0xb8e4c0 "PG::RecoveryState::Crashed::Crashed(boost::statechart::state<PG::RecoveryState::Crashed, PG::RecoveryState::RecoveryMachine>::my_context)") at common/assert.cc:77
#11 0x00000000007583df in PG::RecoveryState::Crashed::Crashed (this=<optimized out>, ctx=...) at osd/PG.cc:5325
#12 0x00000000007863d6 in shallow_construct (outermostContextBase=..., pContext=<optimized out>)
at /usr/include/boost/statechart/state.hpp:89
#13 deep_construct (outermostContextBase=..., pContext=<optimized out>) at /usr/include/boost/statechart/state.hpp:79
#14 boost::statechart::detail::inner_constructor<boost::mpl::l_item<mpl_::long_<1l>, PG::RecoveryState::Crashed, boost::mpl::l_end>, boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator> >::construct (outermostContextBase=..., pContext=<optimized out>)
at /usr/include/boost/statechart/detail/constructor.hpp:93
#15 0x00000000007adca3 in transit_impl<PG::RecoveryState::Crashed, PG::RecoveryState::RecoveryMachine, boost::statechart::detail::no_transition_function> (this=0x19bff6e0, transitionAction=...) at /usr/include/boost/statechart/simple_state.hpp:798
#16 transit<PG::RecoveryState::Crashed> (this=0x19bff6e0) at /usr/include/boost/statechart/simple_state.hpp:314
#17 react_without_action (stt=...) at /usr/include/boost/statechart/transition.hpp:38
#18 react (stt=...) at /usr/include/boost/statechart/detail/reaction_dispatcher.hpp:47
#19 react (stt=..., evt=...) at /usr/include/boost/statechart/detail/reaction_dispatcher.hpp:68
#20 react (stt=..., evt=..., eventType=<optimized out>) at /usr/include/boost/statechart/detail/reaction_dispatcher.hpp:109
#21 react<PG::RecoveryState::Started, boost::statechart::event_base, void const*> (stt=..., evt=...,
eventType=<optimized out>) at /usr/include/boost/statechart/transition.hpp:59
#22 local_react_impl<boost::mpl::list1<boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> > (stt=..., evt=..., eventType=<optimized out>)
at /usr/include/boost/statechart/simple_state.hpp:816
#23 local_react<boost::mpl::list1<boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> > > (this=0x19bff6e0, evt=..., eventType=<optimized out>) at /usr/include/boost/statechart/simple_state.hpp:851
#24 boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>::local_react_impl_non_empty::local_react_impl<boost::mpl::list2<boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed, boost::statechart::detail::no_context<boost::statechart::event_base>, &boost::statechart::detail::no_context<boost::statechart::event_base>::no_function> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> > (stt=..., evt=..., eventType=<optimized out>)
at /usr/include/boost/statechart/simple_state.hpp:820
#25 0x00000000007ade00 in local_react<boost::mpl::list2<boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> > > (eventType=0xf47c80, evt=..., this=0x19bff6e0)
at /usr/include/boost/statechart/simple_state.hpp:851
#26 local_react_impl<boost::mpl::list3<boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> > (eventType=0xf47c80, evt=..., stt=...) at /usr/include/boost/statechart/simple_state.hpp:820
#27 local_react<boost::mpl::list3<boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> > > (eventType=0xf47c80,
evt=..., this=0x19bff6e0) at /usr/include/boost/statechart/simple_state.hpp:851
#28 local_react_impl<boost::mpl::list4<boost::statechart::custom_reaction<PG::AdvMap>, boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> > (eventType=0xf47c80, evt=..., stt=...)
at /usr/include/boost/statechart/simple_state.hpp:820
#29 local_react<boost::mpl::list4<boost::statechart::custom_reaction<PG::AdvMap>, boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> > > (this=0x19bff6e0, eventType=0xf47c80, evt=...)
at /usr/include/boost/statechart/simple_state.hpp:851
#30 local_react_impl<boost::mpl::list<boost::statechart::custom_reaction<PG::QueryState>, boost::statechart::custom_reaction<PG::AdvMap>, boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> > (
eventType=0xf47c80, evt=..., stt=...) at /usr/include/boost/statechart/simple_state.hpp:820
#31 local_react<boost::mpl::list<boost::statechart::custom_reaction<PG::QueryState>, boost::statechart::custom_reaction<PG::AdvMap>, boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed> > > (eventType=0xf47c80, evt=..., this=0x19bff6e0)
at /usr/include/boost/statechart/simple_state.hpp:851
#32 boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>::react_impl (this=0x19bff6e0, evt=..., eventType=0xf47c80)
at /usr/include/boost/statechart/simple_state.hpp:489
#33 0x00000000007b300c in react_impl (eventType=0xf47c80, evt=..., this=0x244fc8c0)
at /usr/include/boost/statechart/simple_state.hpp:498
#34 boost::statechart::simple_state<PG::RecoveryState::Primary, PG::RecoveryState::Started, PG::RecoveryState::Peering, (boost::statechart::history_mode)0>::react_impl (this=0x244fc8c0, evt=..., eventType=0xf47c80)
at /usr/include/boost/statechart/simple_state.hpp:482
#35 0x00000000007b1a79 in react_impl (eventType=0xf47c80, evt=..., this=0x34811840)
at /usr/include/boost/statechart/simple_state.hpp:498
#36 boost::statechart::simple_state<PG::RecoveryState::Active, PG::RecoveryState::Primary, PG::RecoveryState::Activating, (boost::statechart::history_mode)0>::react_impl (this=0x34811840, evt=..., eventType=0xf47c80)
at /usr/include/boost/statechart/simple_state.hpp:482
#37 0x00000000007afe78 in react_impl (eventType=0xf47c80, evt=..., this=0x244fc910)
at /usr/include/boost/statechart/simple_state.hpp:498
#38 boost::statechart::simple_state<PG::RecoveryState::NotBackfilling, PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl (
this=0x244fc910, evt=..., eventType=0xf47c80) at /usr/include/boost/statechart/simple_state.hpp:482
#39 0x00000000007933ab in operator() (this=<synthetic pointer>) at /usr/include/boost/statechart/state_machine.hpp:87
#40 operator()<boost::statechart::detail::send_function<boost::statechart::detail::state_base<std::allocator<void>, boost::statechart::detail::rtti_policy>, boost::statechart::event_base, const void*>, boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial>::exception_event_handler> (action=..., this=<optimized out>)
at /usr/include/boost/statechart/null_exception_translator.hpp:33
#41 boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::send_event (this=0x1afa5930, evt=...)
at /usr/include/boost/statechart/state_machine.hpp:885
#42 0x0000000000793701 in boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::process_event (this=0x1afa5930, evt=...)
at /usr/include/boost/statechart/state_machine.hpp:275
#43 0x000000000074a263 in handle_event (rctx=0x7faac40aa8d0,
evt=<error reading variable: access outside bounds of object referenced via synthetic pointer>, this=0x1afa5930)
at osd/PG.h:1877
#44 PG::handle_peering_event (this=0x1afa4000, evt=..., rctx=0x7faac40aa8d0) at osd/PG.cc:5170
#45 0x0000000000661a76 in OSD::process_peering_events (this=0x30b2000, pgs=..., handle=...) at osd/OSD.cc:7770
#46 0x00000000006b0e62 in OSD::PeeringWQ::_process (this=<optimized out>, pgs=..., handle=...) at osd/OSD.h:1202
#47 0x0000000000a68116 in ThreadPool::worker (this=0x30b2470, wt=0x19694260) at common/WorkQueue.cc:125
#48 0x0000000000a69f20 in ThreadPool::WorkThread::entry (this=<optimized out>) at common/WorkQueue.h:317
#49 0x00007faad75d4e9a in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
#50 0x00007faad5bdc3fd in clone () from /lib/x86_64-linux-gnu/libc.so.6
log info:
ceph-osd.6.log.1: -158> 2014-07-29 05:12:50.037217 7faac40ab700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped+wait_backfill] exit Started/Primary/Active/WaitRemoteBackfillReserved 15008.755999 3339 0.017972
ceph-osd.6.log.1: -157> 2014-07-29 05:12:50.037255 7faac40ab700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped+wait_backfill] enter Started/Primary/Active/Backfilling
ceph-osd.6.log.1: -54> 2014-07-29 05:12:51.242714 7faac38aa700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped+backfilling] backfill_pos is 0//0//-1
ceph-osd.6.log.1: -46> 2014-07-29 05:12:51.243402 7faac48ac700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped+backfilling+backfill_toofull] exit Started/Primary/Active/Backfilling 1.206147 1 0.000079
ceph-osd.6.log.1: -44> 2014-07-29 05:12:51.243452 7faac48ac700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped+backfill_toofull] enter Started/Primary/Active/NotBackfilling
ceph-osd.6.log.1: -20> 2014-07-29 05:12:51.265706 7faac40ab700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped+backfill_toofull] exit Started/Primary/Active/NotBackfilling 0.022255 1 0.000096
ceph-osd.6.log.1: -17> 2014-07-29 05:12:51.265755 7faac40ab700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped+backfill_toofull] exit Started/Primary/Active 15015.343863 0 0.000000
ceph-osd.6.log.1: -16> 2014-07-29 05:12:51.265853 7faac40ab700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped] exit Started/Primary 15018.581516 0 0.000000
ceph-osd.6.log.1: -15> 2014-07-29 05:12:51.265878 7faac40ab700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped] exit Started 15018.581619 0 0.000000
ceph-osd.6.log.1: -14> 2014-07-29 05:12:51.265899 7faac40ab700 5 osd.6 pg_epoch: 3914 pg[0.862( v 166'25838 (166'22837,166'25838] local-les=2799 n=412 ec=1 les/c 2799/1506 1507/2797/2797) [8,20,10]/[6,15,19] r=0 lpr=2797 pi=549-2796/18 rops=1 bft=8,10,20 crt=0'0 lcod 0'0 mlcod 0'0 active+remapped] enter Crashed