Bug #1073
closedosd: failed assert: info.last_complete == info.last_update
Description
I got this crash on several osds:
osd/PG.cc: In function 'void PG::finish_recovery(ObjectStore::Transaction&, std::list<Context*, std::allocator<Context*> >&)', in thread '0x7fce5ac15710' osd/PG.cc: 1721: FAILED assert(info.last_complete == info.last_update) ceph version 0.27.1-307-g85292b3 (commit:85292b367b0e6e6d8963de32ad198482500c887f) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x58) [0x955fc0] 2: (PG::finish_recovery(ObjectStore::Transaction&, std::list<Context*, std::allocator<Context*> >&)+0xde) [0x83a070] 3: (PG::RecoveryState::Active::react(PG::RecoveryState::ActMap const&)+0x4f5) [0x84c08d] 4: (boost::statechart::detail::reaction_result boost::statechart::custom_reaction<PG::RecoveryState::ActMap>::react<PG::RecoveryState::Active, boost::statechart::event_base, void const*>(PG::Recov eryState::Active&, boost::statechart::event_base const&, void const* const&)+0x52) [0x886e6f] 5: (boost::statechart::detail::reaction_result boost::statechart::simple_state<PG::RecoveryState::Active, PG::RecoveryState::Primary, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_: :na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::loca l_react_impl_non_empty::local_react_impl<boost::mpl::list<boost::statechart::custom_reaction<PG::RecoveryState::ActMap>, boost::statechart::custom_reaction<PG::RecoveryState::AdvMap>, boost::statec hart::custom_reaction<PG::RecoveryState::MInfoRec>, boost::statechart::custom_reaction<PG::RecoveryState::MNotifyRec>, boost::statechart::custom_reaction<PG::RecoveryState::MLogRec>, mpl_::na, mpl_ ::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, boost::statechart::simple_state<PG::RecoveryState::Active, P G::RecoveryState::Primary, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl _::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0> >(boost::statechart::simple_state<PG::RecoveryState::Active, PG::RecoveryState::Primary, boost::mpl::list<mpl_::n a, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (bo ost::statechart::history_mode)0>&, boost::statechart::event_base const&, void const*)+0x38) [0x885424] 6: (boost::statechart::detail::reaction_result boost::statechart::simple_state<PG::RecoveryState::Active, PG::RecoveryState::Primary, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_: :na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::loca l_react<boost::mpl::list<boost::statechart::custom_reaction<PG::RecoveryState::ActMap>, boost::statechart::custom_reaction<PG::RecoveryState::AdvMap>, boost::statechart::custom_reaction<PG::Recover yState::MInfoRec>, boost::statechart::custom_reaction<PG::RecoveryState::MNotifyRec>, boost::statechart::custom_reaction<PG::RecoveryState::MLogRec>, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::n a, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na> >(boost::statechart::event_base const&, void const*)+0x30) [0x8834fe] 7: (boost::statechart::simple_state<PG::RecoveryState::Active, PG::RecoveryState::Primary, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na , mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const& , void const*)+0x31) [0x880c89] 8: (boost::statechart::detail::send_function<boost::statechart::detail::state_base<std::allocator<void>, boost::statechart::detail::rtti_policy>, boost::statechart::event_base, void const*>::opera tor()()+0x4b) [0x817d4b] 9: (boost::statechart::detail::safe_reaction_result boost::statechart::null_exception_translator::operator()<boost::statechart::detail::send_function<boost::statechart::detail::state_base<std::allocator<void>, boost::statechart::detail::rtti_policy>, boost::statechart::event_base, void const*>, boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::exception_event_handler>(boost::statechart::detail::send_function<boost::statechart::detail::state_base<std::allocator<void>, boost::statechart::detail::rtti_policy>, boost::statechart::event_base, void const*>, boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::exception_event_handler)+0x2f) [0x808805] 10: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::send_event(boost::statechart::event_base const&)+0x149) [0x7f3147] 11: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x28) [0x860084] 12: (PG::RecoveryState::handle_activate_map(PG::RecoveryCtx*)+0x124) [0x852baa] 13: (PG::handle_activate_map(PG::RecoveryCtx*)+0x2f) [0x7c6a93] 14: (OSD::activate_map(ObjectStore::Transaction&, std::list<Context*, std::allocator<Context*> >&)+0x2ea) [0x7ad9bc] 15: (OSD::handle_osd_map(MOSDMap*)+0x1917) [0x7ab7ad] 16: (OSD::_dispatch(Message*)+0x358) [0x7a817c] 17: (OSD::ms_dispatch(Message*)+0x125) [0x7a74e5] 18: (Messenger::ms_deliver_dispatch(Message*)+0x68) [0x6fad5e] 19: (SimpleMessenger::dispatch_entry()+0x702) [0x6e8c0e] 20: (SimpleMessenger::DispatchThread::entry()+0x31) [0x6dddd1] 21: (Thread::_entry_func(void*)+0x28) [0x6f9af5] 22: (()+0x68ba) [0x7fce6527a8ba] 23: (clone()+0x6d) [0x7fce63f0f02d]
To reproduce:
Run 20 osds on one machine, and a rados write benchmark:
CEPH_NUM_OSD=20 CEPH_NUM_MDS=1 ./vstart.sh -d -n ./rados bench 10000 -p data write
Mark some osds out, with delays in between (I did 0-4).
Mark osd.0 in. Wait for it to peer and recover.
Logs, cores, and executables are in vit:/home/joshd/crashes/finish_recovery.
Updated by Sage Weil almost 13 years ago
- Target version set to v0.28
- Translation missing: en.field_position set to 14
Updated by Sage Weil almost 13 years ago
something is definitely going wrong here.. i see
2011-05-06 17:34:24.391722 7fe3aae59710 osd9 43 pg[0.12( v 21'17 lc 0'0 (21'15,21'17]+backlog n=17 ec=2 les=42 42/42/42) [9] r=0 mlcod 0'0 !hml active+degraded m=17 u=17] mark_all_unfound_as_lost
which shouldn't happen! (and if it does, it should advance last_complete accordingly.)
Updated by Sage Weil almost 13 years ago
this is wrong:
2011-05-06 17:33:48.284200 7f1466b56710 osd4 31 pg[0.12( v 21'17 (21'15,21'17] n=17 ec=2 les=11 31/31/31) [4] r=0 luod=0'0 lcod 21'16 mlcod 0'0 !hml active] noting past interval(13-30 [11,4]/[11,4])
should have maybe_went_rw for the 13-30 interval (it clearly did, since we did lots of writes!)
Updated by Sage Weil almost 13 years ago
we only wait for up_thru updates if some_down:
if (prior.some_down) {
need_up_thru = true;
for (vector<Interval>::iterator i = prior.inter_up_thru.begin();
i != prior.inter_up_thru.end();
++i) {
OSDMap *lastmap = osd->get_map(i->last);
prior.up_thru[i->acting[0]] = lastmap->get_up_thru(i->acting[0]);
}
}
but when logging past intervals we always assume that up_thru needs to be updated. I suspect the above some_down condition got added in the conversion from the old code? don't have time to look now.
Updated by Sage Weil almost 13 years ago
- Status changed from New to Resolved