Bug #58370
openOSD crash
0%
Description
# ceph crash info 2022-12-29T08:36:17.297189Z_28d36dd7-fd87-4bbe-b03d-208b802f693d { "assert_condition": "abort", "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.7/rpm/el8/BUILD/ceph-15.2.7/src/osd/PeeringState.cc", "assert_func": "PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)", "assert_line": 4243, "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.7/rpm/el8/BUILD/ceph-15.2.7/src/osd/PeeringState.cc: In function 'PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)' thread 7fc2daaff700 time 2022-12-29T08:36:17.290548+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.7/rpm/el8/BUILD/ceph-15.2.7/src/osd/PeeringState.cc: 4243: ceph_abort_msg(\"we got a bad state machine event\")\n", "assert_thread_name": "tp_osd_tp", "backtrace": [ "(()+0x12dd0) [0x7fc2fab97dd0]", "(gsignal()+0x10f) [0x7fc2f980070f]", "(abort()+0x127) [0x7fc2f97eab25]", "(ceph::__ceph_abort(char const*, int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1b6) [0x5578891275cf]", "(PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0xc4) [0x5578894a1014]", "(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::deep_construct(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>* const&, boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>&)+0x3a) [0x5578894d66ba]", "(boost::statechart::simple_state<PeeringState::Deleting, PeeringState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x119) [0x5578894d7c69]", "(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x5b) [0x5578892cac1b]", "(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PeeringCtx&)+0x2d1) [0x5578892bd801]", "(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x29c) [0x55788923482c]", "(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x5578894660c6]", "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x12ef) [0x5578892274df]", "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x557889860224]", "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x557889862e84]", "(()+0x82de) [0x7fc2fab8d2de]", "(clone()+0x43) [0x7fc2f98c4e83]" ], "ceph_version": "15.2.7", "crash_id": "2022-12-29T08:36:17.297189Z_28d36dd7-fd87-4bbe-b03d-208b802f693d", "entity_name": "osd.6", "os_id": "centos", "os_name": "CentOS Linux", "os_version": "8 (Core)", "os_version_id": "8", "process_name": "ceph-osd", "stack_sig": "0ccafe69a1d090e77f1c849c90c2b18309551bdd77a4229f380d74ac4cae5e68", "timestamp": "2022-12-29T08:36:17.297189Z", "utsname_hostname": "n6-071-033", "utsname_machine": "x86_64", "utsname_release": "5.4.56.bsk.9-amd64", "utsname_sysname": "Linux", "utsname_version": "#5.4.56.bsk.9 SMP Debian 5.4.56.bsk.9 Wed Aug 25 03:42:38 UTC 20"
Files
Updated by Radoslaw Zarzynski over 1 year ago
- Status changed from New to Need More Info
Is there the related log available by any chance?
Updated by Radoslaw Zarzynski over 1 year ago
PG the was 2.50:
-12> 2022-12-29T08:36:17.289+0000 7fc2daaff700 5 osd.6 pg_epoch: 1917 pg[2.50( v 1813'48194490 (1628'48184402,1813'48194490] lb MIN local-lis/les=1795/1796 n=49202 ec=988/29 lis/c=1818/1814 les/c/f=1819/1815/0 sis=1889) [8,7,3] r=-1 lpr=1889 pi=[1795,1889)/2 crt=1813'48194490 lcod 1813'48194489 mlcod 0'0 unknown NOTIFY mbc={}] enter Crashed
The PG was the Deleting
substate:
-15> 2022-12-29T08:36:17.289+0000 7fc2daaff700 5 osd.6 pg_epoch: 1917 pg[2.50( v 1813'48194490 (1628'48184402,1813'48194490] lb MIN local-lis/les=1795/1796 n=49202 ec=988/29 lis/c=1818/1814 les/c/f=1819/1815/0 sis=1889) [8,7,3] r=-1 lpr=1889 DELETING pi=[1795,1889)/2 crt=1813'48194490 lcod 1813'48194489 mlcod 0'0 unknown NOTIFY mbc={}] exit Started/ToDelete/Deleting 0.000047 1 0.000025
The acting set for this PG is [8, 7, 3]
but the log line came from crashed osd.6. Looks like a delayed event as osd.6 got reassigned out.
Another symptom of the nonce=constant issue?
Do you use a containerized environment?
Would you be able to provide a log with debug_ms=5
and debug_osd=20
? We could figure out the conn's nonces that way.
Updated by yite gu over 1 year ago
Radoslaw Zarzynski wrote:
PG the was 2.50:
[...]
The PG was the
Deleting
substate:[...]
The acting set for this PG is
[8, 7, 3]
but the log line came from crashed osd.6. Looks like a delayed event as osd.6 got reassigned out.Another symptom of the nonce=constant issue?
Do you use a containerized environment?
Would you be able to provide a log withdebug_ms=5
anddebug_osd=20
? We could figure out the conn's nonces that way.
my environment is containerized
Updated by Radoslaw Zarzynski over 1 year ago
OK, then it's susceptible to the nonce issue. Would a debug_ms=5
log.
Updated by yite gu about 1 year ago
Radoslaw Zarzynski wrote:
OK, then it's susceptible to the nonce issue. Would a
debug_ms=5
log.
Ok, but But I'm not sure when the problem will happen again.