Project

General

Profile

Bug #50101

unhandled event in ReplicaActive

Added by Sage Weil almost 3 years ago. Updated almost 3 years ago.

Status:
New
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):

9aa3c57e050d0d976cd83938712756b1e31a68adf250f406e06afa1d413c016f

Crash signature (v2):

Description

{
    "assert_condition": "abort",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.1.0-1323-g7e7e1f4e/rpm/el8/BUILD/ceph-16.1.0-1323-g7e7e1f4e/src/osd/PeeringState.cc",
    "assert_func": "PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)",
    "assert_line": 4594,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.1.0-1323-g7e7e1f4e/rpm/el8/BUILD/ceph-16.1.0-1323-g7e7e1f4e/src/osd/PeeringState.cc: In function 'PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)' thread 7f84ffe66700 time 2021-03-30T23:11:31.552274+0000\n/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.1.0-1323-g7e7e1f4e/rpm/el8/BUILD/ceph-16.1.0-1323-g7e7e1f4e/src/osd/PeeringState.cc: 4594: ceph_abort_msg(\"we got a bad state machine event\")\n",
    "assert_thread_name": "tp_osd_tp",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12b20) [0x7f8521f0eb20]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_abort(char const*, int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1b6) [0x55b41d947509]",
        "(PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0xc4) [0x55b41dceb554]",
        "(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::deep_construct(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>* const&, boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>&)+0x3a) [0x55b41dd214fa]",
        "(boost::statechart::simple_state<PeeringState::ReplicaActive, PeeringState::Started, PeeringState::RepNotRecovering, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x2e3) [0x55b41dd226d3]",
        "(boost::statechart::simple_state<PeeringState::RepRecovering, PeeringState::ReplicaActive, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x99) [0x55b41dd206b9]",
        "(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x5b) [0x55b41db0a21b]",
        "(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PeeringCtx&)+0x2d1) [0x55b41dafebf1]",
        "(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x29c) [0x55b41da792cc]",
        "(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x55b41dca7ae6]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xa58) [0x55b41da6b048]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55b41e0d2494]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55b41e0d5134]",
        "/lib64/libpthread.so.0(+0x814a) [0x7f8521f0414a]",
        "clone()" 
    ],
    "ceph_version": "16.1.0-1323-g7e7e1f4e",
    "crash_id": "2021-03-30T23:11:32.528564Z_ecb4c036-5628-4abf-ab37-a205037feaf2",
    "entity_name": "osd.65",
    "os_id": "centos",
    "os_name": "CentOS Linux",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "9aa3c57e050d0d976cd83938712756b1e31a68adf250f406e06afa1d413c016f",
    "timestamp": "2021-03-30T23:11:32.528564Z",
    "utsname_hostname": "mira093",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-66-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#74~18.04.2-Ubuntu SMP Fri Feb 5 11:17:31 UTC 2021" 
}

log.gz - log from the crash (167 KB) Josh Durgin, 04/09/2021 09:49 PM

History

#1 Updated by Sage Weil almost 3 years ago

  • Project changed from Ceph to RADOS

#2 Updated by Josh Durgin almost 3 years ago

#3 Updated by Neha Ojha almost 3 years ago

 -2219> 2021-03-30T23:10:27.629+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575944 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a883f:::1000ef5fccb.00000000:head local-lis/les=10573977/10573978 n=46366 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepNotRecovering
 -2093> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepNotRecovering 2.620139 8 0.000079
 -2092> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepWaitBackfillReserved
 -2091> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000038 1 0.000188
 -2090> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepRecovering
 -2062> 2021-03-30T23:10:31.257+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575947 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepRecovering 1.007368 4 0.000055
 -2061> 2021-03-30T23:10:31.257+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575947 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepNotRecovering
   -26> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepNotRecovering 60.237929 5 0.000072
   -25> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepWaitBackfillReserved
   -24> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000078 2 0.000130
   -23> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepNotRecovering
   -22> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepNotRecovering 0.000195 1 0.000033
   -21> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepWaitBackfillReserved
   -18> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000214 1 0.000069
   -17> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepRecovering
   -16> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepRecovering 0.000018 1 0.000041
   -15> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive 378.850507 0 0.000000
   -14> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 0'0 active+remapped mbc={}] exit Started 383.384446 0 0.000000
   -13> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 0'0 active+remapped mbc={}] enter Crashed

#4 Updated by Neha Ojha almost 3 years ago

  • Subject changed from PeeringState::Crashed::Crashed to unhandled event in ReplicaActive

This seems to be an unhandled event in the ReplicaActive state; I couldn't find much in the logs that tells us what that event was.

Also available in: Atom PDF