Project

General

Profile

Actions

Bug #50101

open

unhandled event in ReplicaActive

Added by Sage Weil about 3 years ago. Updated about 3 years ago.

Status:
New
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):

9aa3c57e050d0d976cd83938712756b1e31a68adf250f406e06afa1d413c016f

Crash signature (v2):

Description

{
    "assert_condition": "abort",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.1.0-1323-g7e7e1f4e/rpm/el8/BUILD/ceph-16.1.0-1323-g7e7e1f4e/src/osd/PeeringState.cc",
    "assert_func": "PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)",
    "assert_line": 4594,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.1.0-1323-g7e7e1f4e/rpm/el8/BUILD/ceph-16.1.0-1323-g7e7e1f4e/src/osd/PeeringState.cc: In function 'PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)' thread 7f84ffe66700 time 2021-03-30T23:11:31.552274+0000\n/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.1.0-1323-g7e7e1f4e/rpm/el8/BUILD/ceph-16.1.0-1323-g7e7e1f4e/src/osd/PeeringState.cc: 4594: ceph_abort_msg(\"we got a bad state machine event\")\n",
    "assert_thread_name": "tp_osd_tp",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12b20) [0x7f8521f0eb20]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_abort(char const*, int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1b6) [0x55b41d947509]",
        "(PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0xc4) [0x55b41dceb554]",
        "(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::deep_construct(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>* const&, boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>&)+0x3a) [0x55b41dd214fa]",
        "(boost::statechart::simple_state<PeeringState::ReplicaActive, PeeringState::Started, PeeringState::RepNotRecovering, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x2e3) [0x55b41dd226d3]",
        "(boost::statechart::simple_state<PeeringState::RepRecovering, PeeringState::ReplicaActive, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x99) [0x55b41dd206b9]",
        "(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x5b) [0x55b41db0a21b]",
        "(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PeeringCtx&)+0x2d1) [0x55b41dafebf1]",
        "(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x29c) [0x55b41da792cc]",
        "(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x55b41dca7ae6]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xa58) [0x55b41da6b048]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55b41e0d2494]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55b41e0d5134]",
        "/lib64/libpthread.so.0(+0x814a) [0x7f8521f0414a]",
        "clone()" 
    ],
    "ceph_version": "16.1.0-1323-g7e7e1f4e",
    "crash_id": "2021-03-30T23:11:32.528564Z_ecb4c036-5628-4abf-ab37-a205037feaf2",
    "entity_name": "osd.65",
    "os_id": "centos",
    "os_name": "CentOS Linux",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "9aa3c57e050d0d976cd83938712756b1e31a68adf250f406e06afa1d413c016f",
    "timestamp": "2021-03-30T23:11:32.528564Z",
    "utsname_hostname": "mira093",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-66-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#74~18.04.2-Ubuntu SMP Fri Feb 5 11:17:31 UTC 2021" 
}

Files

log.gz (167 KB) log.gz log from the crash Josh Durgin, 04/09/2021 09:49 PM
Actions #1

Updated by Sage Weil about 3 years ago

  • Project changed from Ceph to RADOS
Actions #2

Updated by Josh Durgin about 3 years ago

Actions #3

Updated by Neha Ojha about 3 years ago

 -2219> 2021-03-30T23:10:27.629+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575944 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a883f:::1000ef5fccb.00000000:head local-lis/les=10573977/10573978 n=46366 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepNotRecovering
 -2093> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepNotRecovering 2.620139 8 0.000079
 -2092> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepWaitBackfillReserved
 -2091> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000038 1 0.000188
 -2090> 2021-03-30T23:10:30.249+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575946 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a8b66:::1000f206522.00000000:head local-lis/les=10573977/10573978 n=46371 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepRecovering
 -2062> 2021-03-30T23:10:31.257+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575947 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepRecovering 1.007368 4 0.000055
 -2061> 2021-03-30T23:10:31.257+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575947 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepNotRecovering
   -26> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepNotRecovering 60.237929 5 0.000072
   -25> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepWaitBackfillReserved
   -24> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000078 2 0.000130
   -23> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepNotRecovering
   -22> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepNotRecovering 0.000195 1 0.000033
   -21> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepWaitBackfillReserved
   -18> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepWaitBackfillReserved 0.000214 1 0.000069
   -17> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] enter Started/ReplicaActive/RepRecovering
   -16> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive/RepRecovering 0.000018 1 0.000041
   -15> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 10575941'3129523 active+remapped mbc={}] exit Started/ReplicaActive 378.850507 0 0.000000
   -14> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 0'0 active+remapped mbc={}] exit Started 383.384446 0 0.000000
   -13> 2021-03-30T23:11:31.493+0000 7f84ffe66700  5 osd.65 pg_epoch: 10575949 pg[114.57( v 10575941'3129534 (10575152'3126961,10575941'3129534] lb 114:ea7a9cb0:::1000ef1f65b.00000000:head local-lis/les=10573977/10573978 n=46406 ec=5887724/4421456 lis/c=10573977/10573790 les/c/f=10573978/10573794/9814718 sis=10575818) [83,65,8]/[83,40,125] r=-1 lpr=10575818 pi=[10573790,10575818)/3 luod=0'0 lua=10575817'3129484 crt=10575941'3129534 mlcod 0'0 active+remapped mbc={}] enter Crashed
Actions #4

Updated by Neha Ojha about 3 years ago

  • Subject changed from PeeringState::Crashed::Crashed to unhandled event in ReplicaActive

This seems to be an unhandled event in the ReplicaActive state; I couldn't find much in the logs that tells us what that event was.

Actions

Also available in: Atom PDF