Bug #53600
Status: Closed — Crash in MOSDPGLog::encode_payload
0%
Description
3 OSDs crashed on the gibba cluster. All the OSDs were a part of gibba045 node.
Observations:
- osd.15 and osd.176 have the same backtrace but at different times.
- osd.549 went down due to a heartbeat timeout. If this is unrelated to the other crash, a new tracker may be created.
- osd.15 crashed first, and osd.549 went down at around the same time.
Logs:
The OSD logs are placed under /home/sseshasa/gibba_cluster_logs on senta02.front.sepia.ceph.com.
The logs can be moved to a new location if required.
1. osd.15
Crashid: 2021-12-13T12:07:49.888443Z_0b56adca-66b7-47d9-b351-f60d507a514b
{ "backtrace": [ "/lib64/libpthread.so.0(+0x12c20) [0x7f4755fb7c20]", "gsignal()", "abort()", "/lib64/libc.so.6(+0x21c89) [0x7f4754c00c89]", "/lib64/libc.so.6(+0x2fa76) [0x7f4754c0ea76]", "(MOSDPGLog::encode_payload(unsigned long)+0x389) [0x55fa59fb74f9]", "(Message::encode(unsigned long, int, bool)+0x2e) [0x55fa5a7b574e]", "(ProtocolV2::prepare_send_message(unsigned long, Message*)+0x44) [0x55fa5aa3b6b4]", "(ProtocolV2::send_message(Message*)+0x3ae) [0x55fa5aa3bd0e]", "(AsyncConnection::send_message(Message*)+0x53e) [0x55fa5aa183be]", "(PG::send_cluster_message(int, boost::intrusive_ptr<Message>, unsigned int, bool)+0xa5) [0x55fa5a017ac5]", "(PeeringState::activate(ceph::os::Transaction&, unsigned int, PeeringCtxWrapper&)+0x16c0) [0x55fa5a265e30]", "(PeeringState::Active::Active(boost::statechart::state<PeeringState::Active, PeeringState::Primary, PeeringState::Activating, (boost::statechart::history_mode)0>::my_context)+0x336) [0x55fa5a287786]", "(boost::statechart::detail::safe_reaction_result boost::statechart::simple_state<PeeringState::Peering, PeeringState::Primary, PeeringState::GetInfo, (boost::statechart::history_mode)0>::transit_impl<PeeringState::Active, PeeringState::PeeringMachine, boost::statechart::detail::no_transition_function>(boost::statechart::detail::no_transition_function const&)+0xb6) [0x55fa5a2b8146]", "(boost::statechart::simple_state<PeeringState::Peering, PeeringState::Primary, PeeringState::GetInfo, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x16a) [0x55fa5a2b85da]", "(boost::statechart::simple_state<PeeringState::GetMissing, PeeringState::Peering, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, 
(boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0xb9) [0x55fa5a2b89f9]", "(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x16a) [0x55fa5a03a52a]", "(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PeeringCtx&)+0x2d6) [0x55fa5a02e746]", "(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x175) [0x55fa59fa4775]", "(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x55fa5a2398c6]", "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xac8) [0x55fa59f967b8]", "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55fa5a6884c4]", "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55fa5a688fb4]", "/lib64/libpthread.so.0(+0x817a) [0x7f4755fad17a]", "clone()" ], "ceph_version": "17.0.0-9475-g8ea352e9", "crash_id": "2021-12-13T12:07:49.888443Z_0b56adca-66b7-47d9-b351-f60d507a514b", "entity_name": "osd.15", "os_id": "centos", "os_name": "CentOS Linux", "os_version": "8", "os_version_id": "8", "process_name": "ceph-osd", "stack_sig": "303c972f70bfb57ef0cae9d126712730e5aa6e892803a630e6e3b37e795047cd", "timestamp": "2021-12-13T12:07:49.888443Z", "utsname_hostname": "gibba045", "utsname_machine": "x86_64", "utsname_release": "4.18.0-301.1.el8.x86_64", "utsname_sysname": "Linux", "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" }
2. osd.176
Crashid: 2021-12-13T15:27:30.525590Z_b9772214-8f71-4033-9482-2439bdd15ec7
{ "backtrace": [ "/lib64/libpthread.so.0(+0x12c20) [0x7f899c0e9c20]", "gsignal()", "abort()", "/lib64/libc.so.6(+0x21c89) [0x7f899ad32c89]", "/lib64/libc.so.6(+0x2fa76) [0x7f899ad40a76]", "(MOSDPGLog::encode_payload(unsigned long)+0x389) [0x55c76afc34f9]", "(Message::encode(unsigned long, int, bool)+0x2e) [0x55c76b7c174e]", "(ProtocolV2::prepare_send_message(unsigned long, Message*)+0x44) [0x55c76ba476b4]", "(ProtocolV2::send_message(Message*)+0x3ae) [0x55c76ba47d0e]", "(AsyncConnection::send_message(Message*)+0x53e) [0x55c76ba243be]", "(PG::send_cluster_message(int, boost::intrusive_ptr<Message>, unsigned int, bool)+0xa5) [0x55c76b023ac5]", "(PeeringState::fulfill_log(pg_shard_t, pg_query_t const&, unsigned int)+0x2b4) [0x55c76b2755f4]", "(PeeringState::fulfill_query(MQuery const&, PeeringCtxWrapper&)+0x15e) [0x55c76b275cfe]", "(PeeringState::Stray::react(MQuery const&)+0x37) [0x55c76b275dd7]", "(boost::statechart::simple_state<PeeringState::Stray, PeeringState::Started, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x231) [0x55c76b2c8121]", "(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x74) [0x55c76b046434]", "(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PeeringCtx&)+0x2d6) [0x55c76b03a746]", "(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x175) [0x55c76afb0775]", "(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, 
ThreadPool::TPHandle&)+0x56) [0x55c76b2458c6]", "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xac8) [0x55c76afa27b8]", "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55c76b6944c4]", "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55c76b694fb4]", "/lib64/libpthread.so.0(+0x817a) [0x7f899c0df17a]", "clone()" ], "ceph_version": "17.0.0-9475-g8ea352e9", "crash_id": "2021-12-13T15:27:30.525590Z_b9772214-8f71-4033-9482-2439bdd15ec7", "entity_name": "osd.176", "os_id": "centos", "os_name": "CentOS Linux", "os_version": "8", "os_version_id": "8", "process_name": "ceph-osd", "stack_sig": "b1fa0d4357cf041ac97a0fbfb3d989a15cd0f1d6c0a552d16212d8dd4887dda6", "timestamp": "2021-12-13T15:27:30.525590Z", "utsname_hostname": "gibba045", "utsname_machine": "x86_64", "utsname_release": "4.18.0-301.1.el8.x86_64", "utsname_sysname": "Linux", "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" }
3. osd.549
Crashid: 2021-12-13T12:10:41.871131Z_3e0a41b5-f86d-4636-a758-3204f4b8877e
The OSD logs show that the following thread reported the heartbeat timeout:
2021-12-13T12:09:55.056+0000 7f3607ca5700 1 heartbeat_map reset_timeout 'OSD::osd_op_tp thread 0x7f3607ca5700' had timed out after 15.000000954s
{ "backtrace": [ "/lib64/libpthread.so.0(+0x12c20) [0x7f362c54cc20]", "/lib64/libpthread.so.0(+0x1175d) [0x7f362c54b75d]", "/lib64/libpthread.so.0(+0xc9e7) [0x7f362c5469e7]", "pthread_cond_wait()", "(std::condition_variable::wait(std::unique_lock<std::mutex>&)+0x10) [0x7f362bb8f8f0]", "(ceph::logging::Log::submit_entry(ceph::logging::Entry&&)+0xb7) [0x55842cc136e7]", "(ceph::HeartbeatMap::_check(ceph::heartbeat_handle_d const*, char const*, std::chrono::time_point<ceph::coarse_mono_clock, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> > >)+0x408) [0x55842c9fdd28]", "(ceph::HeartbeatMap::reset_timeout(ceph::heartbeat_handle_d*, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >)+0x466) [0x55842c9ff006]", "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b0) [0x55842ca2e4b0]", "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55842ca2efb4]", "/lib64/libpthread.so.0(+0x817a) [0x7f362c54217a]", "clone()" ], "ceph_version": "17.0.0-9475-g8ea352e9", "crash_id": "2021-12-13T12:10:41.871131Z_3e0a41b5-f86d-4636-a758-3204f4b8877e", "entity_name": "osd.549", "os_id": "centos", "os_name": "CentOS Linux", "os_version": "8", "os_version_id": "8", "process_name": "ceph-osd", "stack_sig": "97d3b8c07f84b93a4e052b7e344e4c48b41b417f97f9ca16002bda90ea7b7bfb", "timestamp": "2021-12-13T12:10:41.871131Z", "utsname_hostname": "gibba045", "utsname_machine": "x86_64", "utsname_release": "4.18.0-301.1.el8.x86_64", "utsname_sysname": "Linux", "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" }
Updated by Brad Hubbard over 2 years ago
The binaries running when these crashes were seen actually are from this wip branch in the ceph-ci repo.
https://shaman.ceph.com/builds/ceph/cephadm-fix-osd-activate/
It would be interesting to know whether any of the divergences from master in that branch may have contributed to this issue.
It appears glibc aborted the osd process here, 'MOSDPGLog::encode_payload(unsigned long)+0x389'.
Without a coredump we can't see what data it was working with but maybe we can get some details by examining that offset in gdb.
$ podman run -it --privileged --rm quay.ceph.io/ceph-ci/ceph@sha256:14b1ea54031bea23a37c589a02be794dca9c5a0807116ffef655bea631f9a62e Trying to pull quay.ceph.io/ceph-ci/ceph@sha256:14b1ea54031bea23a37c589a02be794dca9c5a0807116ffef655bea631f9a62e... Getting image source signatures Copying blob c6c62e60f876 done Copying blob ab4052451ca3 done Copying config f79fcb826d done Writing manifest to image destination Storing signatures [root@8681d3690687 /]# ceph-osd --version ceph version 17.0.0-9475-g8ea352e9 (8ea352e994feffca1bfd357a20c491df01db91a9) quincy (dev) [root@8681d3690687 /]# dnf -y install yum-utils gdb ... [root@8681d3690687 /]# debuginfo-install -y ceph-osd ... [root@8681d3690687 /]# gdb -q /usr/bin/ceph-osd Reading symbols from /usr/bin/ceph-osd...Reading symbols from /usr/lib/debug/usr/bin/ceph-osd-17.0.0-9475.g8ea352e9.el8.x86_64.debug...done. done. (gdb) start Temporary breakpoint 1 at 0x645ad0: main. (2 locations) Starting program: /usr/bin/ceph-osd Missing separate debuginfos, use: yum debuginfo-install glibc-2.28-164.el8.x86_64 [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". 
Temporary breakpoint 1, main (argc=1, argv=0x7fffffffeba8) at /usr/src/debug/ceph-17.0.0-9475.g8ea352e9.el8.x86_64/src/ceph_osd.cc:120 120 { Missing separate debuginfos, use: yum debuginfo-install fuse-libs-2.9.7-12.el8.x86_64 gperftools-libs-2.7-9.el8.x86_64 leveldb-1.22-1.el8.x86_64 libaio-0.3.112-1.el8.x86_64 libblkid-2.32.1-28.el8.x86_64 libgcc-8.5.0-4.el8_5.x86_64 libibverbs-35.0-1.el8.x86_64 libmount-2.32.1-28.el8.x86_64 libnl3-3.5.0-1.el8.x86_64 librdmacm-35.0-1.el8.x86_64 libselinux-2.9-5.el8.x86_64 libstdc++-8.5.0-4.el8_5.x86_64 libunwind-1.3.1-3.el8.x86_64 libuuid-2.32.1-28.el8.x86_64 lttng-ust-2.8.1-11.el8.x86_64 lz4-libs-1.8.3-3.el8_4.x86_64 openssl-libs-1.1.1k-4.el8.x86_64 snappy-1.1.8-3.el8.x86_64 systemd-libs-239-51.el8.x86_64 userspace-rcu-0.10.1-4.el8.x86_64 zlib-1.2.11-17.el8.x86_64 Work out decimal for the function offset. (gdb) p/d 0x389 $1 = 905 (gdb) disass /s 'MOSDPGLog::encode_payload(unsigned long)' ... /usr/src/debug/ceph-17.0.0-9475.g8ea352e9.el8.x86_64/src/osd/osd_types.h: 3855 WRITE_CLASS_ENCODER(pg_lease_t) 0x0000555555b184a8 <+824>: lea 0x728(%rbp),%rdi 0x0000555555b184af <+831>: mov %rbx,%rsi 0x0000555555b184b2 <+834>: callq 0x555555d4d480 <pg_lease_t::encode(ceph::buffer::v15_2_0::list&) const> 0x0000555555b184b7 <+839>: jmp 0x555555b18482 <MOSDPGLog::encode_payload(unsigned long)+786> 0x0000555555b184b9 <+841>: nopl 0x0(%rax) 3307 encode((__u8)0, bl); 0x0000555555b184c0 <+848>: mov $0x1,%edx 0x0000555555b184c5 <+853>: mov %r12,%rsi 0x0000555555b184c8 <+856>: mov %rbx,%rdi 0x0000555555b184cb <+859>: movb $0x0,0x30(%rsp) /usr/src/debug/ceph-17.0.0-9475.g8ea352e9.el8.x86_64/src/include/encoding.h: 86 WRITE_RAW_ENCODER(__u8) 0x0000555555b184d0 <+864>: callq 0x5555563c3db0 <ceph::buffer::v15_2_0::list::append(char const*, unsigned int)> 0x0000555555b184d5 <+869>: jmpq 0x555555b1841f <MOSDPGLog::encode_payload(unsigned long)+687> 0x0000555555b184da <+874>: lea 0xeb619f(%rip),%rcx # 0x5555569ce680 
<_ZZN9MOSDPGLog14encode_payloadEmE19__PRETTY_FUNCTION__> 0x0000555555b184e1 <+881>: mov $0x68,%edx 0x0000555555b184e6 <+886>: lea 0xeacd03(%rip),%rsi # 0x5555569c51f0 0x0000555555b184ed <+893>: lea 0xeacdf4(%rip),%rdi # 0x5555569c52e8 0x0000555555b184f4 <+900>: callq 0x55555596a230 <__assert_fail@plt> <--------------------- maybe hit this? 0x0000555555b184f9 <+905>: callq 0x55555596a7e0 <__stack_chk_fail@plt> <--------------------- pointer is HERE End of assembler dump.
If I'm interpreting that right it looks like we may have been encoding the value for a pg_lease_t from the Message::payload when we went off into the weeds and glibc aborted. That's probably the 'lease' argument here so maybe that gives some hints?
src/messages/MOSDPGLog.h: 98 void encode_payload(uint64_t features) override { 99 using ceph::encode; 100 encode(epoch, payload); 101 encode(info, payload); 102 encode(log, payload); 103 encode(missing, payload, features); 104 assert(HAVE_FEATURE(features, SERVER_NAUTILUS)); 105 encode(query_epoch, payload); 106 encode(past_intervals, payload); 107 encode(to, payload); 108 encode(from, payload); 109 encode(lease, payload); <---------------------HERE 110 }
Updated by Brad Hubbard over 2 years ago
It should be noted there were a whole lot of oom-kill events on this node during the times these crashes occurred. Given this I'm not sure there is much value in pursuing these issues further.
Dec 13 12:06:42 gibba045 kernel: Out of memory: Killed process 679166 (ceph-osd) total-vm:1451636kB, anon-rss:461656kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1708kB oom_score_adj:0 Dec 13 12:06:42 gibba045 kernel: Out of memory: Killed process 729772 (ceph-osd) total-vm:1438320kB, anon-rss:456288kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1680kB oom_score_adj:0 Dec 13 12:06:51 gibba045 kernel: Out of memory: Killed process 1086254 (ceph-osd) total-vm:1437108kB, anon-rss:462544kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1636kB oom_score_adj:0 Dec 13 12:07:02 gibba045 kernel: Out of memory: Killed process 1044752 (ceph-osd) total-vm:1435276kB, anon-rss:463632kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1652kB oom_score_adj:0 Dec 13 12:07:02 gibba045 kernel: Out of memory: Killed process 831542 (ceph-osd) total-vm:1439472kB, anon-rss:468140kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1668kB oom_score_adj:0 Dec 13 12:07:09 gibba045 kernel: Out of memory: Killed process 984863 (ceph-osd) total-vm:1445300kB, anon-rss:465464kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1680kB oom_score_adj:0 Dec 13 12:07:09 gibba045 kernel: Out of memory: Killed process 781159 (ceph-osd) total-vm:1443592kB, anon-rss:462068kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1680kB oom_score_adj:0 Dec 13 12:07:13 gibba045 kernel: Out of memory: Killed process 1249365 (ceph-osd) total-vm:1404644kB, anon-rss:460916kB, file-rss:0kB, shmem-rss:0kB, UID:167 pgtables:1632kB oom_score_adj:0
Updated by Brad Hubbard over 2 years ago
- Status changed from New to Rejected
- Assignee set to Brad Hubbard
- Priority changed from Normal to Low
Updated by Telemetry Bot almost 2 years ago
- Related to Bug #56324: crash: MOSDPGLog::encode_payload(unsigned long) added
Updated by Telemetry Bot almost 2 years ago
- Related to Bug #56331: crash: MOSDPGLog::encode_payload(unsigned long) added