Project

General

Profile

Actions

Bug #56262

open

crash: BlueStore::_txc_create(BlueStore::Collection*, BlueStore::OpSequencer*, std::list<Context*, std::allocator<Context*> >*, boost::intrusive_ptr<TrackedOp>)

Added by Telemetry Bot almost 2 years ago. Updated 1 day ago.

Status:
New
Priority:
Normal
Assignee:
-
Target version:
-
% Done:

0%

Source:
Telemetry
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):

649d2cb3ae548d2230ae867ccc544881777512ffefead6b85524131ae2aeca00
455b53887e2d309cda8baa5a9b439020485000773761a9fd4a50173f13a7bf46
312523bdc8259bba50a61c0785488096a9b94199c9c560bef14e55dba42554a7
9a994da7cf764fb736b1dc42633f4df0b9998bea8face2c95145736162cd40f6


Description

http://telemetry.front.sepia.ceph.com:4000/d/jByk5HaMz/crash-spec-x-ray?orgId=1&var-sig_v2=97298aa00eec3260da644360a67fbf3a42d3e72bf9aa76099e12c0f34b513e16

Sanitized backtrace:

    BlueStore::_txc_create(BlueStore::Collection*, BlueStore::OpSequencer*, std::list<Context*, std::allocator<Context*> >*, boost::intrusive_ptr<TrackedOp>)
    BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)
    non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<OpRequest>)
    ReplicatedBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)
    PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)
    PrimaryLogPG::simple_opc_submit(std::unique_ptr<PrimaryLogPG::OpContext, std::default_delete<PrimaryLogPG::OpContext> >)
    PrimaryLogPG::handle_watch_timeout(std::shared_ptr<Watch>)
    HandleWatchTimeout::complete(int)
    CommonSafeTimer<std::mutex>::timer_thread()
    CommonSafeTimerThread<std::mutex>::entry()

Crash dump sample:
{
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f38fdddece0]",
        "(BlueStore::_txc_create(BlueStore::Collection*, BlueStore::OpSequencer*, std::__cxx11::list<Context*, std::allocator<Context*> >*, boost::intrusive_ptr<TrackedOp>)+0x3ae) [0x55a04e22855e]",
        "(BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x260) [0x55a04e28d3e0]",
        "(non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<OpRequest>)+0x55) [0x55a04de8d2e5]",
        "(ReplicatedBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0xca8) [0x55a04e0a7f98]",
        "(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0xc90) [0x55a04ddf3da0]",
        "(PrimaryLogPG::simple_opc_submit(std::unique_ptr<PrimaryLogPG::OpContext, std::default_delete<PrimaryLogPG::OpContext> >)+0x120) [0x55a04ddf6000]",
        "(PrimaryLogPG::handle_watch_timeout(std::shared_ptr<Watch>)+0xb99) [0x55a04ddf83f9]",
        "(HandleWatchTimeout::complete(int)+0x11b) [0x55a04dd7a4ab]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x11a) [0x55a04e3f8afa]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x55a04e3fa121]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f38fddd41ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0",
    "crash_id": "2022-06-17T09:49:25.420851Z_81d904b4-64b5-4d44-bad9-e7387f23cd4c",
    "entity_name": "osd.24b5a6cace6a7a8b1937c5270d381d3994901bfa",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "649d2cb3ae548d2230ae867ccc544881777512ffefead6b85524131ae2aeca00",
    "timestamp": "2022-06-17T09:49:25.420851Z",
    "utsname_machine": "x86_64",
    "utsname_release": "5.18.3-arch1-1",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP PREEMPT_DYNAMIC Thu, 09 Jun 2022 16:14:10 +0000" 
}

Actions #1

Updated by Telemetry Bot almost 2 years ago

  • Crash signature (v1) updated (diff)
  • Crash signature (v2) updated (diff)
  • Affected Versions v17.2.0 added
Actions #2

Updated by Telemetry Bot over 1 year ago

  • Crash signature (v1) updated (diff)
  • Affected Versions v17.2.1 added
Actions #3

Updated by Telemetry Bot 12 months ago

  • Crash signature (v1) updated (diff)
  • Affected Versions v17.2.2, v17.2.3, v17.2.4, v17.2.5, v17.2.6 added
Actions #4

Updated by Deepika Upadhyay 11 months ago

  • Crash signature (v1) updated (diff)

I see frequent crashes with a similar backtrace in version 17.2.5. @Igor Gajowiak/@Adam, can you help identify what can be done?

Actions #5

Updated by Deepika Upadhyay 11 months ago

Example crash:

Crash ID: 2023-04-25T00:10:34.765829Z_6e2592ba-d6f5-4be3-b7ca-12d532008551
Crash Info:
{
    "archived": "2023-04-25 16:16:43.236114",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12cf0) [0x7fe260463cf0]",
        "(BlueStore::_txc_create(BlueStore::Collection*, BlueStore::OpSequencer*, std::__cxx11::list<Context*, std::allocator<Context*> >*, boost::intrusive_ptr<TrackedOp>)+0x40a) [0x555f638d3d5a]",
        "(BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x21e) [0x555f6394f99e]",
        "(non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<OpRequest>)+0x53) [0x555f63514203]",
        "(ReplicatedBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0x7c2) [0x555f63765ae2]",
        "(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0x50d) [0x555f6348bebd]",
        "(PrimaryLogPG::simple_opc_submit(std::unique_ptr<PrimaryLogPG::OpContext, std::default_delete<PrimaryLogPG::OpContext> >)+0x5a) [0x555f6348dcfa]",
        "(PrimaryLogPG::handle_watch_timeout(std::shared_ptr<Watch>)+0x87b) [0x555f6348ff1b]",
        "(HandleWatchTimeout::complete(int)+0x11a) [0x555f633f486a]",
        "(CommonSafeTimer<std::mutex>::timer_thread()+0x12f) [0x555f63ab95ef]",
        "(CommonSafeTimerThread<std::mutex>::entry()+0x11) [0x555f63aba6e1]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fe2604591ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.5",
    "crash_id": "2023-04-25T00:10:34.765829Z_6e2592ba-d6f5-4be3-b7ca-12d532008551",
    "entity_name": "osd.79",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "649d2cb3ae548d2230ae867ccc544881777512ffefead6b85524131ae2aeca00",
    "timestamp": "2023-04-25T00:10:34.765829Z",
    "utsname_hostname": "rook-ceph-osd-79-6d4848f486-5ct8b",
    "utsname_machine": "x86_64",
    "utsname_release": "5.14.0-162.6.1.el9_1.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP PREEMPT_DYNAMIC Tue Nov 15 07:49:10 EST 2022" 
}
-------------------------------------------

Actions #6

Updated by Prashant D 1 day ago · Edited

There seems to be a race condition at the time of OSD shutdown. The kv db handle had already been destroyed while one of the OSD threads was still trying to queue a BlueStore transaction as part of issuing a replication op (note that `db` is a null pointer in frame 4 of the backtrace below).

(gdb) bt
#0  __pthread_getname_np (th=139985008813632, buf=0x7f50d63a0da0 "", len=16) at pthread_getname.c:45
#1  0x000055c1053c4385 in ceph::logging::Log::dump_recent (this=0x55c107e578c0) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/log/Log.cc:520
#2  0x000055c10521d252 in handle_oneshot_fatal_signal (signum=11) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/global/signal_handler.cc:347
#3  <signal handler called>
#4  0x000055c105130df1 in BlueStore::_txc_create (this=0x55c107eae000, c=<optimized out>, osr=0x55c11b6eb340, on_commits=0x7f50d63a55d0, osd_op=...) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/os/bluestore/BlueStore.cc:13670
#5  0x000055c1051495a3 in BlueStore::queue_transactions (this=<optimized out>, ch=..., tls=std::vector of length 1, capacity 1 = {...}, op=..., handle=0x0)
    at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/redhat-linux-build/boost/include/boost/smart_ptr/intrusive_ptr.hpp:179
#6  0x000055c104fe79dd in PrimaryLogPG::queue_transactions (op=..., tls=std::vector of length 1, capacity 1 = {...}, this=<optimized out>) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/osd/PrimaryLogPG.h:364
#7  ReplicatedBackend::submit_transaction (this=0x55c11b531680, soid=..., delta_stats=..., at_version=..., _t=..., trim_to=..., min_last_complete_ondisk=..., _log_entries=..., 
    hset_history=std::optional<pg_hit_set_history_t> [no contained value], on_all_commit=0x55c12393b640, tid=1631, reqid=..., orig_op=...) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/osd/ReplicatedBackend.cc:551
#8  0x000055c104df6d3f in PrimaryLogPG::issue_repop (this=0x55c11bc91000, repop=0x55c123789e00, ctx=0x55c10fa62900) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/osd/PrimaryLogPG.cc:11429
#9  0x000055c104dfbec7 in PrimaryLogPG::simple_opc_submit (this=0x55c11bc91000, ctx=std::unique_ptr<PrimaryLogPG::OpContext> = {...}) at /usr/include/c++/11/bits/unique_ptr.h:173
#10 0x000055c104dfe593 in PrimaryLogPG::handle_watch_timeout (this=<optimized out>, watch=...) at /usr/include/c++/11/bits/unique_ptr.h:172
#11 0x000055c104d4d8de in HandleWatchTimeout::complete (this=0x55c112871700) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/osd/Watch.cc:265
#12 0x000055c1052526ea in CommonSafeTimer<std::mutex>::timer_thread (this=0x55c107e2b9c8) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/common/Timer.cc:103
#13 0x000055c105252fa1 in CommonSafeTimerThread<std::mutex>::entry (this=<optimized out>) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/common/Timer.cc:33
#14 0x00007f50f1fa5802 in start_thread (arg=<optimized out>) at pthread_create.c:443
#15 0x00007f50f1f45450 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
(gdb) f 6
#6  0x000055c104fe79dd in PrimaryLogPG::queue_transactions (op=..., tls=std::vector of length 1, capacity 1 = {...}, this=<optimized out>) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/osd/PrimaryLogPG.h:364
364        osd->store->queue_transactions(ch, tls, op, NULL);
(gdb) p tls[0] 
$7 = {data = {ops = {v = 2}, largest_data_len = {v = 0}, largest_data_off = {v = 0}, largest_data_off_in_data_bl = {v = 0}, fadvise_flags = {v = 0}}, coll_index = std::map with 1 element = {[{type = coll_t::TYPE_PG, pgid = {pgid = {
          m_pool = 6, m_seed = 7, static calc_name_buf_size = 36 '$'}, shard = {id = -1 '\377'}, static calc_name_buf_size = 40 '('}, removal_seq = 0, _str_buff = '\000' <repeats 31 times>, "\066.7_head", 
      _str = 0x55c11b5c90e7 "6.7_head"}] = 0}, object_index = std::map with 2 elements = {[{hobj = {static POOL_META = -1, static POOL_TEMP_START = -2, oid = {name = ""}, snap = {val = 18446744073709551614}, hash = 7, max = false, 
        nibblewise_key_cache = 1879048192, hash_reverse_bits = 3758096384, pool = 6, nspace = "", key = ""}, generation = 18446744073709551615, shard_id = {id = -1 '\377'}, max = false, static NO_GEN = 18446744073709551615}] = 1, [{
      hobj = {static POOL_META = -1, static POOL_TEMP_START = -2, oid = {name = "notify.0"}, snap = {val = 18446744073709551614}, hash = 1126365855, max = false, nibblewise_key_cache = 4189004340, hash_reverse_bits = 4183770306, 
        pool = 6, nspace = "", key = ""}, generation = 18446744073709551615, shard_id = {id = -1 '\377'}, max = false, static NO_GEN = 18446744073709551615}] = 0}, coll_id = 1, object_id = 2, data_bl = {_buffers = {_root = {
        next = 0x55c123a7a7e0}, _tail = 0x55c108e2b380}, _carriage = 0x55c108e2b380, _len = 1547, _num = 12, static always_empty_bptr = {<ceph::buffer::v15_2_0::ptr_hook> = {next = 0x0}, <ceph::buffer::v15_2_0::ptr> = {_raw = 0x0, 
        _off = 0, _len = 0}, <No data fields>}}, op_bl = {_buffers = {_root = {next = 0x55c123a7a8a0}, _tail = 0x55c123a7a8a0}, _carriage = 0x55c123a7a8a0, _len = 144, _num = 1, 
    static always_empty_bptr = {<ceph::buffer::v15_2_0::ptr_hook> = {next = 0x0}, <ceph::buffer::v15_2_0::ptr> = {_raw = 0x0, _off = 0, _len = 0}, <No data fields>}}, on_applied = empty std::__cxx11::list, 
  on_commit = empty std::__cxx11::list, on_applied_sync = empty std::__cxx11::list}
(gdb) p tls[0].coll_index
$8 = std::map with 1 element = {[{type = coll_t::TYPE_PG, pgid = {pgid = {m_pool = 6, m_seed = 7, static calc_name_buf_size = 36 '$'}, shard = {id = -1 '\377'}, static calc_name_buf_size = 40 '('}, removal_seq = 0, 
    _str_buff = '\000' <repeats 31 times>, "\066.7_head", _str = 0x55c11b5c90e7 "6.7_head"}] = 0}
(gdb) f 4
#4  0x000055c105130df1 in BlueStore::_txc_create (this=0x55c107eae000, c=<optimized out>, osr=0x55c11b6eb340, on_commits=0x7f50d63a55d0, osd_op=...) at /usr/src/debug/ceph-17.2.6-170.el9cp.x86_64/src/os/bluestore/BlueStore.cc:13670
13670      txc->t = db->get_transaction();
(gdb) p db
$9 = (KeyValueDB *) 0x0

Given the nature of this crash, the OSD should boot again without any issues, and there is no corruption at the BlueStore or BlueFS level.

Actions

Also available in: Atom PDF