Project

General

Profile

Bug #56101

Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in function safe_timer

Added by Vikhyat Umrao 5 months ago. Updated 17 days ago.

Status:
Fix Under Review
Priority:
Urgent
Assignee:
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
quincy
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):

8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737
ed6371172b6960cd5f07c50f52a37857ae827a53c2737d5623312332a672c814
99b56157ea4b210db422174f528bf297535ac749ee952a700e03651444bec3e7


Description

[root@gibba001 f9d4cf6a-edcf-11ec-a96a-3cecef3d8fb8]# ceph crash info 2022-06-17T18:32:15.065315Z_b367a9b5-64d1-4005-9d0f-cba44819fc79
{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7fe6de9fece0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x5641d6e6114a]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x5641d6e25bae]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x5641d6806648]",
        "(OSD::collect_pg_stats()+0x741) [0x5641d62ddac1]",
        "(MgrClient::_send_pgstats()+0x40) [0x5641d6c17b10]",
        "(MgrClient::_send_stats()+0x15) [0x5641d6c1a945]",
        "(Context::complete(int)+0xd) [0x5641d6309d1d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x5641d69e2afa]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x5641d69e4121]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fe6de9f41ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0",
    "crash_id": "2022-06-17T18:32:15.065315Z_b367a9b5-64d1-4005-9d0f-cba44819fc79",
    "entity_name": "osd.298",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-06-17T18:32:15.065315Z",
    "utsname_hostname": "gibba016",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-301.1.el8.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" 
}

[root@gibba001 f9d4cf6a-edcf-11ec-a96a-3cecef3d8fb8]# ceph tell osd.298 version
{
    "version": "17.2.0-436-gda36d2c9",
    "release": "quincy",
    "release_type": "stable" 
}

Related issues

Duplicated by RADOS - Bug #56102: Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in - RocksDBStore::estimate_prefix_size Duplicate

History

#1 Updated by Vikhyat Umrao 5 months ago

  • Description updated (diff)

#2 Updated by Vikhyat Umrao 5 months ago

[root@gibba001 f9d4cf6a-edcf-11ec-a96a-3cecef3d8fb8]# ceph crash info 2022-06-17T21:05:51.129950Z_321e5448-12ed-49c8-9c17-36002d31b396
{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f4573c6bce0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x55852e25b14a]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x55852e21fbae]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x55852dc00648]",
        "(OSD::collect_pg_stats()+0x741) [0x55852d6d7ac1]",
        "(MgrClient::_send_pgstats()+0x40) [0x55852e011b10]",
        "(MgrClient::_send_stats()+0x15) [0x55852e014945]",
        "(Context::complete(int)+0xd) [0x55852d703d1d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x55852dddcafa]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x55852ddde121]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f4573c611ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0",
    "crash_id": "2022-06-17T21:05:51.129950Z_321e5448-12ed-49c8-9c17-36002d31b396",
    "entity_name": "osd.392",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-06-17T21:05:51.129950Z",
    "utsname_hostname": "gibba037",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-301.1.el8.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" 
}

#3 Updated by Vikhyat Umrao 5 months ago

- After upgrading the LRC cluster, the same crash was seen in one of the OSDs in LRC.

{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f35a3e73ce0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x564c38f9f73a]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x564c38f6419e]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x564c38944d38]",
        "(OSD::collect_pg_stats()+0x741) [0x564c3841bce1]",
        "(MgrClient::_send_pgstats()+0x40) [0x564c38d56100]",
        "(MgrClient::_send_stats()+0x15) [0x564c38d58f35]",
        "(Context::complete(int)+0xd) [0x564c38447f3d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x564c38b211ea]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x564c38b22811]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f35a3e691ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0-179-g4d9fe5e9",
    "crash_id": "2022-06-21T15:19:35.271516Z_71571a4a-5c61-41e4-8612-c2df199d7b32",
    "entity_name": "osd.161",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-06-21T15:19:35.271516Z",
    "utsname_hostname": "ivan07",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-109-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#123-Ubuntu SMP Fri Apr 8 09:10:54 UTC 2022" 
}

#4 Updated by Vikhyat Umrao 5 months ago

- Looks like there is a commonality: this crash is happening during shutdown/restart, so it appears to be an issue during shutdown. The issue is also present in the 17.2.0 GA version, since this crash occurred on 17.2.0.

From OSD.392 in the gibba cluster:

** File Read Latency Histogram By Level [P] **

2022-06-17T21:05:49.723+0000 7f456d46c700 -1 received  signal: Terminated from /dev/init -- /usr/bin/ceph-osd -n osd.392 -f --setuser ceph --setgroup ceph --default-log-to-file=false --default-log-to-journald=true --default-log-to-stderr=false  (PID: 1) UID: 0
2022-06-17T21:05:49.723+0000 7f456d46c700 -1 osd.392 6946 *** Got signal Terminated ***
2022-06-17T21:05:49.723+0000 7f456d46c700  0 osd.392 6946 Fast Shutdown: - cct->_conf->osd_fast_shutdown = 1, null-fm = 1
2022-06-17T21:05:49.723+0000 7f456d46c700 -1 osd.392 6946 *** Immediate shutdown (osd_fast_shutdown=true) ***
2022-06-17T21:05:49.723+0000 7f456d46c700  0 osd.392 6946 prepare_to_stop telling mon we are shutting down and dead
2022-06-17T21:05:51.049+0000 7f455b3dc700  0 osd.392 6947 got_stop_ack starting shutdown
2022-06-17T21:05:51.049+0000 7f456d46c700  0 osd.392 6947 prepare_to_stop starting shutdown
2022-06-17T21:05:51.051+0000 7f4560be7700  0 bluestore(/var/lib/ceph/osd/ceph-392)  allocation stats probe 0: cnt: 320657 frags: 320657 size: 5448175616
2022-06-17T21:05:51.051+0000 7f4560be7700  0 bluestore(/var/lib/ceph/osd/ceph-392)  probe -1: 0,  0, 0
2022-06-17T21:05:51.051+0000 7f4560be7700  0 bluestore(/var/lib/ceph/osd/ceph-392)  probe -2: 0,  0, 0
2022-06-17T21:05:51.051+0000 7f4560be7700  0 bluestore(/var/lib/ceph/osd/ceph-392)  probe -4: 0,  0, 0
2022-06-17T21:05:51.051+0000 7f4560be7700  0 bluestore(/var/lib/ceph/osd/ceph-392)  probe -8: 0,  0, 0
2022-06-17T21:05:51.051+0000 7f4560be7700  0 bluestore(/var/lib/ceph/osd/ceph-392)  probe -16: 0,  0, 0
2022-06-17T21:05:51.051+0000 7f4560be7700  0 bluestore(/var/lib/ceph/osd/ceph-392) ------------
2022-06-17T21:05:51.068+0000 7f456d46c700  4 rocksdb: [db/db_impl/db_impl.cc:447] Shutdown: canceling all background work
2022-06-17T21:05:51.069+0000 7f456d46c700  4 rocksdb: [db/db_impl/db_impl.cc:625] Shutdown complete
2022-06-17T21:05:51.071+0000 7f456d46c700  1 bluefs umount
2022-06-17T21:05:51.071+0000 7f456d46c700  1 bdev(0x55853202fc00 /var/lib/ceph/osd/ceph-392/block) close
2022-06-17T21:05:51.133+0000 7f4559bd9700 -1 *** Caught signal (Segmentation fault) **
 in thread 7f4559bd9700 thread_name:safe_timer

 ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)
 1: /lib64/libpthread.so.0(+0x12ce0) [0x7f4573c6bce0]
 2: (rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x55852e25b14a]
 3: (RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x55852e21fbae]
 4: (BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x55852dc00648]
 5: (OSD::collect_pg_stats()+0x741) [0x55852d6d7ac1]
 6: (MgrClient::_send_pgstats()+0x40) [0x55852e011b10]
 7: (MgrClient::_send_stats()+0x15) [0x55852e014945]

- From LRC

2022-06-21T15:19:34.235+0000 7f359d674700 -1 received  signal: Terminated from /sbin/docker-init -- /usr/bin/ceph-osd -n osd.161 -f --setuser ceph --setgroup ceph --default-log-to-file=false --default-log-to-stderr=true --default-log-stderr-prefix=debug   (PID: 1) UID: 0
2022-06-21T15:19:34.235+0000 7f359d674700 -1 osd.161 10645622 *** Got signal Terminated ***
2022-06-21T15:19:34.235+0000 7f359d674700  0 osd.161 10645622 Fast Shutdown: - cct->_conf->osd_fast_shutdown = 1, null-fm = 1
2022-06-21T15:19:34.235+0000 7f359d674700 -1 osd.161 10645622 *** Immediate shutdown (osd_fast_shutdown=true) ***
2022-06-21T15:19:34.235+0000 7f359d674700  0 osd.161 10645622 prepare_to_stop telling mon we are shutting down and dead
2022-06-21T15:19:35.211+0000 7f358bc43700  0 osd.161 10645623 got_stop_ack starting shutdown
2022-06-21T15:19:35.211+0000 7f359d674700  0 osd.161 10645623 prepare_to_stop starting shutdown
2022-06-21T15:19:35.231+0000 7f359144e700  0 bluestore(/var/lib/ceph/osd/ceph-161)  allocation stats probe 13: cnt: 79062 frags: 79062 size: 44799180800
2022-06-21T15:19:35.231+0000 7f359144e700  0 bluestore(/var/lib/ceph/osd/ceph-161)  probe -1: 35222,  35222, 10082373632
2022-06-21T15:19:35.231+0000 7f359144e700  0 bluestore(/var/lib/ceph/osd/ceph-161)  probe -3: 37550,  37550, 14709874688
2022-06-21T15:19:35.231+0000 7f359144e700  0 bluestore(/var/lib/ceph/osd/ceph-161)  probe -5: 87399,  87399, 41224392704
2022-06-21T15:19:35.231+0000 7f359144e700  0 bluestore(/var/lib/ceph/osd/ceph-161)  probe -13: 1657233,  1657233, 1143646580736
2022-06-21T15:19:35.231+0000 7f359144e700  0 bluestore(/var/lib/ceph/osd/ceph-161)  probe -29: 0,  0, 0
2022-06-21T15:19:35.231+0000 7f359144e700  0 bluestore(/var/lib/ceph/osd/ceph-161) ------------
2022-06-21T15:19:35.239+0000 7f359d674700  4 rocksdb: [db/db_impl/db_impl.cc:447] Shutdown: canceling all background work
2022-06-21T15:19:35.243+0000 7f359d674700  4 rocksdb: [db/db_impl/db_impl.cc:625] Shutdown complete
2022-06-21T15:19:35.267+0000 7f358a440700 -1 *** Caught signal (Segmentation fault) **
 in thread 7f358a440700 thread_name:safe_timer

 ceph version 17.2.0-179-g4d9fe5e9 (4d9fe5e9f451836e1b56ea8b6fc72a9884351b9d) quincy (stable)

#5 Updated by Vikhyat Umrao 5 months ago

- OSD.392 logs are available on gibba037 at the following path:

/root/tracker56101

#6 Updated by Vikhyat Umrao 5 months ago

  • Related to Bug #56102: Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in - RocksDBStore::estimate_prefix_size added

#7 Updated by Vikhyat Umrao 5 months ago

  • Project changed from bluestore to RADOS

#8 Updated by Brad Hubbard 5 months ago

Looking into this further in today's team meeting we discussed the fact that these segfaults appear to occur in pthread code and the fact that the kernel is reporting an issue in "safe_timer".

[3526151.020506] safe_timer[1886589]: segfault at 7f357ec299d0 ip 00007f35a3e74f58 sp 00007f358a438530 error 4 in libpthread-2.28.so[7f35a3e61000+1b000]

[16923529.873402] safe_timer[3754702]: segfault at 7fe6b494c9d0 ip 00007fe6de9fff58 sp 00007fe6c4964b70 error 4 in libpthread-2.28.so[7fe6de9ec000+1b000]

[16932975.955716] safe_timer[3228095]: segfault at 7f454b3bc9d0 ip 00007f4573c6cf58 sp 00007f4559bd1b70 error 4 in libpthread-2.28.so[7f4573c59000+1b000]

Because of this, and the fact that there doesn't appear to be a direct connection between the last two frames of the stack trace we are suspecting a problem around safe_timer in a similar fashion to https://tracker.ceph.com/issues/23352 which was caused by unprotected access to a data structure in MgrClient and interestingly MgrClient is also present in these stacktraces.

I'm specifically looking at how we're calling _send_pgstats(). When we call it via send_pgstats() we protect it with a lock but when we call it via _send_stats() we don't seem to and I suspect what we are seeing here is the timer context triggering when the timer expires and calling _send_pgstats() without the protection of the lock and that this access is racing with another thread.

This is only a theory at the moment, but we may be able to confirm it if we can capture a coredump, or possibly with a tool like Helgrind or GCC's thread sanitizer, although they may be problematic to get in place at the right time to capture the issue.

#9 Updated by Neha Ojha 5 months ago

http://telemetry.front.sepia.ceph.com:4000/d/jByk5HaMz/crash-spec-x-ray?var-sig_v2=97cfb7606f247983cba0a9666bb882d9e1d0d75606e58792709230707bc5371e&orgId=1 indicates two occurrences in the field before this was seen in gibba and the LRC, likely not a regression in 17.2.1

#10 Updated by Neha Ojha 5 months ago

  • Priority changed from Normal to Urgent

#11 Updated by Vikhyat Umrao 5 months ago

  • Subject changed from Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in - rocksdb::DBImpl::GetApproximateSizes to Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in function safe_timer

#12 Updated by Vikhyat Umrao 5 months ago

  • Related to deleted (Bug #56102: Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in - RocksDBStore::estimate_prefix_size)

#13 Updated by Vikhyat Umrao 5 months ago

  • Duplicates Bug #56102: Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in - RocksDBStore::estimate_prefix_size added

#14 Updated by Vikhyat Umrao 5 months ago

  • Status changed from New to Duplicate

#15 Updated by Vikhyat Umrao 5 months ago

  • Status changed from Duplicate to New

#16 Updated by Telemetry Bot 5 months ago

  • Crash signature (v1) updated (diff)
  • Crash signature (v2) updated (diff)
  • Affected Versions v17.2.0 added

http://telemetry.front.sepia.ceph.com:4000/d/jByk5HaMz/crash-spec-x-ray?orgId=1&var-sig_v2=97cfb7606f247983cba0a9666bb882d9e1d0d75606e58792709230707bc5371e

Sanitized backtrace:

    rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)
    RocksDBStore::estimate_prefix_size(std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
    BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)
    OSD::collect_pg_stats()
    MgrClient::_send_pgstats()
    MgrClient::_send_stats()
    Context::complete(int)
    CommonSafeTimer<std::mutex>::timer_thread()
    CommonSafeTimerThread<std::mutex>::entry()

Crash dump sample:
{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f35a3e73ce0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x564c38f9f73a]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x564c38f6419e]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x564c38944d38]",
        "(OSD::collect_pg_stats()+0x741) [0x564c3841bce1]",
        "(MgrClient::_send_pgstats()+0x40) [0x564c38d56100]",
        "(MgrClient::_send_stats()+0x15) [0x564c38d58f35]",
        "(Context::complete(int)+0xd) [0x564c38447f3d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x564c38b211ea]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x564c38b22811]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f35a3e691ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0-179-g4d9fe5e9",
    "crash_id": "2022-06-21T15:19:35.271516Z_71571a4a-5c61-41e4-8612-c2df199d7b32",
    "entity_name": "osd.a59912ded6cd215c8220792320f54b11dbe3ea28",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-06-21T15:19:35.271516Z",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-109-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#123-Ubuntu SMP Fri Apr 8 09:10:54 UTC 2022" 
}

#17 Updated by Laura Flores 5 months ago

  • Crash signature (v1) updated (diff)

A Telemetry contact was able to provide their OSD log. A coredump was no longer available, but they were able to provide other coredump outputs from around the same time.

See the relevant info at dell-per320-4.gsslab.pnq.redhat.com under /home/core/tracker56101.

#18 Updated by Laura Flores 5 months ago

  • Crash signature (v1) updated (diff)

#19 Updated by Laura Flores 5 months ago

  • Crash signature (v1) updated (diff)

#20 Updated by Radoslaw Zarzynski 5 months ago

  • Status changed from New to Need More Info

Well, it seems the logs on dell-per320-4.gsslab.pnq.redhat.com:/home/core/tracker56101 are on the default levels. Still need more info :-(.

#21 Updated by Laura Flores 5 months ago

Noticed that this PR was newly included in 17.2.1, and it makes a change to GetApproximateSizes: https://github.com/ceph/ceph/pull/46492

#22 Updated by Yaarit Hatuka 4 months ago

  • Duplicated by Bug #56102: Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in - RocksDBStore::estimate_prefix_size added

#23 Updated by Yaarit Hatuka 4 months ago

  • Duplicates deleted (Bug #56102: Gibba Cluster: 17.2.0 to 17.2.1 RC upgrade OSD crash in - RocksDBStore::estimate_prefix_size)

#24 Updated by Telemetry Bot 4 months ago

  • Affected Versions v17.2.1 added

#25 Updated by Telemetry Bot 4 months ago

  • Crash signature (v1) updated (diff)
  • Crash signature (v2) updated (diff)

#26 Updated by Vikhyat Umrao about 2 months ago

  • Crash signature (v1) updated (diff)
  • Crash signature (v2) updated (diff)

- This was visible again in LRC upgrade today.

2022-09-29T19:56:46.286638Z_71dc382b-98e7-40e7-b3a6-741b403b8d0e  osd.57                *   
root@reesi001:~# ceph crash info 2022-09-29T19:56:46.286638Z_71dc382b-98e7-40e7-b3a6-741b403b8d0e
{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12cf0) [0x7fdff3f3bcf0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3d) [0x5649be67178d]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x442) [0x5649be636462]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x215) [0x5649bdff9545]",
        "(OSD::collect_pg_stats()+0x8f2) [0x5649bda7ee02]",
        "(MgrClient::_send_pgstats()+0x41) [0x5649be429471]",
        "(MgrClient::_send_stats()+0x15) [0x5649be42e205]",
        "(Context::complete(int)+0xd) [0x5649bdabdabd]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x12f) [0x5649be1e05ef]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x5649be1e16e1]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fdff3f311ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.3-770-g7f52e260",
    "crash_id": "2022-09-29T19:56:46.286638Z_71dc382b-98e7-40e7-b3a6-741b403b8d0e",
    "entity_name": "osd.57",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-09-29T19:56:46.286638Z",
    "utsname_hostname": "ivan03",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-109-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#123-Ubuntu SMP Fri Apr 8 09:10:54 UTC 2022" 
}

#27 Updated by Radoslaw Zarzynski about 2 months ago

  • Assignee set to Laura Flores

This won't be easy to reproduce but there are still some options like:

Anyway, it's not so terribly urgent. Let's talk on the next RADOS Meeting.

#28 Updated by Laura Flores about 2 months ago

I contacted some Telemetry users. I will report back here with any information.

Something to note: The large majority of reported crashes came from Centos 8 Stream clusters; however, there were also some cases of it occurring on Ubuntu Jammy and other Debian clusters.

In terms of reproducing this crash, we may have the most luck on Centos 8 Stream. But clearly, the crash does not only occur on that OS.

None of the crashes reported have occurred below 17.2.0, so it's possible this is happening from a Quincy-specific change.

Studying the changes between v16.2.10 and v17.2.0, the fast shutdown PR (https://github.com/ceph/ceph/pull/45342) stood out to me, as it makes some changes to `BlueStore::pool_statfs`, which is part of the crash backtrace:

int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
               bool *out_per_pool_omap)
{
  dout(20) << __func__ << " pool " << pool_id<< dendl;
  if (!per_pool_stat_collection) {
    dout(20) << __func__ << " not supported in legacy mode " << dendl;
    return -ENOTSUP;
  }
  buf->reset();
  {
    std::lock_guard l(vstatfs_lock);
    osd_pools[pool_id].publish(buf);
  }
  string key_prefix;
  _key_encode_u64(pool_id, &key_prefix);
  *out_per_pool_omap = per_pool_omap != OMAP_BULK;
- if (*out_per_pool_omap) {
+ // stop calls after db was closed
+ if (*out_per_pool_omap && db) {
    auto prefix = per_pool_omap == OMAP_PER_POOL ?
      PREFIX_PERPOOL_OMAP :
      PREFIX_PERPG_OMAP;
    buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
  }
  dout(10) << __func__ << *buf << dendl;
  return 0;
}

#29 Updated by Jimmy Spets about 2 months ago

Laura Flores wrote:

I contacted some Telemetry users. I will report back here with any information.

I am one of the users. The OS was CentOS 8.5, not Stream, although Stream is what appears in the logs from back then. I am switching to AlmaLinux 9 during this month.
If you need more hardware or other info please email me.

{
    "archived": "2022-07-31 17:23:52.046724",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7fdd25ca5ce0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x55b448960cca]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x55b448924fde]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x55b4483028b8]",
        "(OSD::collect_pg_stats()+0x741) [0x55b447dd8421]",
        "(MgrClient::_send_pgstats()+0x40) [0x55b4487154c0]",
        "(MgrClient::_send_stats()+0x15) [0x55b4487182f5]",
        "(Context::complete(int)+0xd) [0x55b447e0468d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x55b4484dfdba]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x55b4484e13e1]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fdd25c9b1ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.1",
    "crash_id": "2022-07-31T17:01:34.851692Z_78514779-681a-4ae8-bd03-255121b6f183",
    "entity_name": "osd.34",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-07-31T17:01:34.851692Z",
    "utsname_hostname": "cephosd03",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-372.16.1.el8_6.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Wed Jul 13 03:56:16 EDT 2022" 
}
{
    "archived": "2022-07-31 17:23:52.046724",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7fdd25ca5ce0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x55b448960cca]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x55b448924fde]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x55b4483028b8]",
        "(OSD::collect_pg_stats()+0x741) [0x55b447dd8421]",
        "(MgrClient::_send_pgstats()+0x40) [0x55b4487154c0]",
        "(MgrClient::_send_stats()+0x15) [0x55b4487182f5]",
        "(Context::complete(int)+0xd) [0x55b447e0468d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x55b4484dfdba]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x55b4484e13e1]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fdd25c9b1ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.1",
    "crash_id": "2022-07-31T17:01:34.851692Z_78514779-681a-4ae8-bd03-255121b6f183",
    "entity_name": "osd.34",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-07-31T17:01:34.851692Z",
    "utsname_hostname": "cephosd03",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-372.16.1.el8_6.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Wed Jul 13 03:56:16 EDT 2022" 
}
{
    "archived": "2022-07-31 17:23:52.070063",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f2b5c7d3ce0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x5629c00c6cca]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x5629c008afde]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x5629bfa688b8]",
        "(OSD::collect_pg_stats()+0x741) [0x5629bf53e421]",
        "(MgrClient::_send_pgstats()+0x40) [0x5629bfe7b4c0]",
        "(MgrClient::_send_stats()+0x15) [0x5629bfe7e2f5]",
        "(Context::complete(int)+0xd) [0x5629bf56a68d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x5629bfc45dba]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x5629bfc473e1]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f2b5c7c91ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.1",
    "crash_id": "2022-07-31T17:05:52.192534Z_51933922-6089-49cd-9aa7-930b782de303",
    "entity_name": "osd.25",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-07-31T17:05:52.192534Z",
    "utsname_hostname": "cephosd06",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-372.16.1.el8_6.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Wed Jul 13 03:56:16 EDT 2022" 
}
{
    "archived": "2022-07-31 17:23:52.093455",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f7a599fdce0]",
        "(rocksdb::DBImpl::GetApproximateSizes(rocksdb::SizeApproximationOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Range const*, int, unsigned long*)+0x3a) [0x56426b0e3cca]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x41e) [0x56426b0a7fde]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x1e8) [0x56426aa858b8]",
        "(OSD::collect_pg_stats()+0x741) [0x56426a55b421]",
        "(MgrClient::_send_pgstats()+0x40) [0x56426ae984c0]",
        "(MgrClient::_send_stats()+0x15) [0x56426ae9b2f5]",
        "(Context::complete(int)+0xd) [0x56426a58768d]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x56426ac62dba]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x56426ac643e1]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f7a599f31ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.1",
    "crash_id": "2022-07-31T17:08:17.011242Z_ad2234d6-7ffe-4a7f-89c8-9fb45b009d06",
    "entity_name": "osd.20",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "8401ef44af5a55696ab4d190dcfd98f4b85bab2f37ee2f91d6cd7a241a3b0737",
    "timestamp": "2022-07-31T17:08:17.011242Z",
    "utsname_hostname": "cephosd07",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-372.16.1.el8_6.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Wed Jul 13 03:56:16 EDT 2022" 
}

#30 Updated by Laura Flores about 1 month ago

Notes from the rados suite review:

We may need to check if we're shutting down while sending pg stats; if so, we don't need to send pg stats.

#31 Updated by Brad Hubbard about 1 month ago

Laura Flores wrote:

Notes from the rados suite review:

We may need to check if we're shutting down while sending pg stats; if so, we don't need to send pg stats.

Thanks Laura, I've had a bit of a look into this and I think I have a pretty good idea of what exactly is going on here, and some possible solutions. Since we'll be discussing this on Wednesday Australian time and trying to reproduce this on the Gibba cluster, I'll provide a summary of our findings after that.

#32 Updated by Radoslaw Zarzynski 21 days ago

Is there any news on that?

#33 Updated by Laura Flores 21 days ago

Brad and I ran a reproducer on the gibba cluster (restarting OSDs with `for osd in $(systemctl -l |grep osd|gawk '{print $1}'); do systemctl restart $osd ; done`). We were unable to generate a coredump, but Vikhyat suggested running a radosbench workload while restarting OSDs to generate some client IO, as this would better simulate the state of clusters that experienced the crash.

I have added the topic to this coming RADOS meeting so we can discuss what kind of radosbench workload to run.

#34 Updated by Laura Flores 19 days ago

Managed to reproduce this on the Gibba cluster and produce a coredump!

The core file is located on gibba001 under /root/tracker_56101.
You will also see it when running `sudo coredumpctl list`.

I reproduced it by:

  1. Starting a workload with: [root@gibba001 ~] rados -p foo bench 7200 -b 4096 write --no-cleanup
  2. While workload was going: [root@gibba001 ~]# for osd in $(systemctl -l |grep osd|gawk '{print $1}'); do echo $osd ; systemctl restart $osd ; done
    May have to repeat this step several times.

Here is the crash info:

[root@gibba001 ~]# ceph crash info 2022-11-09T21:45:50.930316Z_21250a07-3868-4963-934d-e45fe02800bc
{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12cf0) [0x7fce3263acf0]",
        "(RocksDBStore::estimate_prefix_size(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x3db) [0x55a6ac79a3eb]",
        "(BlueStore::pool_statfs(unsigned long, store_statfs_t*, bool*)+0x215) [0x55a6ac15d545]",
        "(OSD::collect_pg_stats()+0x8f2) [0x55a6abbe2e02]",
        "(MgrClient::_send_pgstats()+0x41) [0x55a6ac58d471]",
        "(MgrClient::_send_stats()+0x15) [0x55a6ac592205]",
        "(Context::complete(int)+0xd) [0x55a6abc21abd]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x12f) [0x55a6ac3445ef]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x55a6ac3456e1]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fce326301ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.5",
    "crash_id": "2022-11-09T21:45:50.930316Z_21250a07-3868-4963-934d-e45fe02800bc",
    "entity_name": "osd.226",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "99b56157ea4b210db422174f528bf297535ac749ee952a700e03651444bec3e7",
    "timestamp": "2022-11-09T21:45:50.930316Z",
    "utsname_hostname": "gibba001",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-301.1.el8.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" 
}

#35 Updated by Brad Hubbard 18 days ago

Thanks for your work in capturing the core Laura.

I had a look at the coredump and it shows exactly what we had speculated during our discussions about this.

The relevant thread is identifiable from the log that I dumped out of the core.

2022-11-09 21:45:49.895465 thread: 0x7fce2bc23700 priority: -1 subsystem: 0 osd.226 292533 *** Immediate shutdown (osd_fast_shutdown=true) ***


I dumped the output of 'info threads' to gdb2.txt.
# grep 0x7fce2bc23700 gdb2.txt 
  52   Thread 0x7fce2bc23700 (LWP 23)  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
(gdb) t 52
[Switching to thread 52 (Thread 0x7fce2bc23700 (LWP 23))]
#0  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
38              cmpq $-4095, %rax       /* Check %rax for error.  */
(gdb) bt
#0  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
#1  0x00007fce33b89cf5 in io_destroy () from /lib64/libaio.so.1
#2  0x000055a6ac785146 in aio_queue_t::shutdown (this=0x55a6aeb926a0) at /usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/blk/aio/aio.h:147
#3  0x000055a6ac77c818 in KernelDevice::close (this=0x55a6adcf1c00) at /usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/blk/kernel/KernelDevice.cc:304
#4  0x000055a6ac28f886 in BlueFS::~BlueFS (this=this@entry=0x55a6adcf1800, __in_chrg=<optimized out>) at /usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/os/bluestore/BlueFS.cc:210
#5  0x000055a6ac141971 in BlueStore::_minimal_close_bluefs (this=0x55a6add22000) at /usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/os/bluestore/BlueStore.cc:6271
#6  0x000055a6ac1bbf57 in BlueStore::_close_db_and_around (this=0x55a6add22000) at /usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/os/bluestore/BlueStore.cc:6429
#7  0x000055a6ac1ecc1a in BlueStore::umount() () at /usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/os/bluestore/BlueStore.cc:7709
#8  0x000055a6abbf6bad in OSD::shutdown() () at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/unique_ptr.h:421
#9  0x000055a6ac2f66e7 in SignalHandler::entry (this=0x55a6adcd1800) at /usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/global/signal_handler.cc:566
#10 0x00007fce326301ca in start_thread (arg=<optimized out>) at pthread_create.c:479
#11 0x00007fce3129ae73 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
(gdb) f 8
#8  0x000055a6abbf6bad in OSD::shutdown() () at /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/unique_ptr.h:421
421     /opt/rh/gcc-toolset-11/root/usr/include/c++/11/bits/unique_ptr.h: No such file or directory.

Because we are in standard library code we need to mess around a bit to see what line we are on in OSD.cc. Basically we disassemble the function (with source code) and find the position counter (indicated with '=>') and work back from there to the last line in OSD.cc.

(gdb) disass /s
...
/usr/src/debug/ceph-17.2.5-0.el8.x86_64/src/osd/OSD.cc:
4338        utime_t end_time = ceph_clock_now();
=> 0x000055a6abbf6bad <+733>:   callq  0x55a6abb9b4a0 <ceph_clock_now()>

That's where we will return to after the current function call finishes, so we are actually executing the preceding statement, line 4336.

(gdb) l src/osd/OSD.cc:4338
4333        store->prepare_for_fast_shutdown();
4334        std::lock_guard lock(osd_lock);
4335        // TBD: assert in allocator that nothing is being add
4336        store->umount();
4337
4338        utime_t end_time = ceph_clock_now();
4339        if (cct->_conf->osd_fast_shutdown_timeout) {
4340          ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
4341        }
4342        dout(0) <<"Fast Shutdown duration total     :" << end_time              - start_time_func       << " seconds" << dendl;

Of course the umount call in frame #7 confirms we are indeed actually on 4336 and unmounting the store which is exactly what we speculated and is pretty much definitive proof this is a race and we should not be trying to gather pg_stats while the store is being unmounted.

Laura, let's talk about the patch for this tomorrow your time.

#36 Updated by Laura Flores 18 days ago

  • Status changed from Need More Info to In Progress
  • Backport set to quincy

#37 Updated by Laura Flores 18 days ago

  • Status changed from In Progress to Fix Under Review
  • Pull request ID set to 48836

#38 Updated by Laura Flores 17 days ago

Also to note: We set `ceph config set mgr mgr_stats_period 1` on the gibba cluster to reproduce this bug. (This occurred before we ran the radosbench workload and restarted osds).

Also available in: Atom PDF