Project

General

Profile

Actions

Bug #56456

open

rook-ceph-v1.9.5: ceph-osd crash randomly

Added by Aurélien Le Clainche almost 2 years ago. Updated over 1 year ago.

Status:
New
Priority:
Normal
Assignee:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
1 - critical
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

Hi,

After migrating to rook-ceph v1.9.5, the ceph-osd daemons crash randomly:

sh-4.4$ ceph crash info 2022-07-04T09:38:35.887100Z_895047ab-c7a4-4162-ba1c-463dd6f88647
{
    "assert_condition": "abort",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.9/rpm/el8/BUILD/ceph-16.2.9/src/os/bluestore/bluestore_types.cc",
    "assert_func": "void bluestore_extent_ref_map_t::put(uint64_t, uint32_t, PExtentVector*, bool*)",
    "assert_line": 210,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.9/rpm/el8/BUILD/ceph-16.2.9/src/os/bluestore/bluestore_types.cc: In function 'void bluestore_extent_ref_map_t::put(uint64_t, uint32_t, PExtentVector*, bool*)' thread 7fb0dfb59700 time 2022-07-04T09:38:35.866896+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.9/rpm/el8/BUILD/ceph-16.2.9/src/os/bluestore/bluestore_types.cc: 210: ceph_abort_msg(\"put on missing extent (nothing before)\")\n",
    "assert_thread_name": "tp_osd_tp",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7fb103458ce0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_abort(char const*, int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1b6) [0x5564b6d7ff6d]",
        "(bluestore_extent_ref_map_t::put(unsigned long, unsigned int, std::vector<bluestore_pextent_t, mempool::pool_allocator<(mempool::pool_index_t)5, bluestore_pextent_t> >*, bool*)+0x41f) [0x5564b7422adf]",
        "(BlueStore::_wctx_finish(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>, BlueStore::WriteContext*, std::set<BlueStore::SharedBlob*, std::less<BlueStore::SharedBlob*>, std::allocator<BlueStore::SharedBlob*> >*)+0xf5b) [0x5564b73af6cb]",
        "(BlueStore::_do_truncate(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>, unsigned long, std::set<BlueStore::SharedBlob*, std::less<BlueStore::SharedBlob*>, std::allocator<BlueStore::SharedBlob*> >*)+0x3a8) [0x5564b73b0cb8]",
        "(BlueStore::_do_remove(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>)+0xce) [0x5564b73b15ee]",
        "(BlueStore::_remove(BlueStore::TransContext*, boost::intrusive_ptr<BlueStore::Collection>&, boost::intrusive_ptr<BlueStore::Onode>&)+0x22c) [0x5564b73b316c]",
        "(BlueStore::_txc_add_transaction(BlueStore::TransContext*, ceph::os::Transaction*)+0x1fcf) [0x5564b73cb1af]",
        "(BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x316) [0x5564b73e66f6]",
        "(ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, ceph::os::Transaction&&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x85) [0x5564b6eefed5]",
        "(OSD::dispatch_context(PeeringCtx&, PG*, std::shared_ptr<OSDMap const>, ThreadPool::TPHandle*)+0xf3) [0x5564b6e84dd3]",
        "(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x2d8) [0x5564b6eb4778]",
        "(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x5564b70e7866]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xc28) [0x5564b6ea64e8]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x5564b75232c4]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x5564b75261a4]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fb10344e1ca]",
        "clone()" 
    ],
    "ceph_version": "16.2.9",
    "crash_id": "2022-07-04T09:38:35.887100Z_895047ab-c7a4-4162-ba1c-463dd6f88647",
    "entity_name": "osd.254",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "685e9ade14cafe369e502737407e80d9275da9e365c5e87c1ea6bad1e32038e3",
    "timestamp": "2022-07-04T09:38:35.887100Z",
    "utsname_hostname": "kw214-vso-pr",
    "utsname_machine": "x86_64",
    "utsname_release": "5.15.39-talos",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Thu May 12 15:08:32 UTC 2022" 
}
sh-4.4$ ceph -s 
  cluster:
    id:     1b713b95-fb0d-4254-b83a-4421b28f35fa
    health: HEALTH_WARN
            1 osds down
            Degraded data redundancy: 301421/89274670 objects degraded (0.338%), 6 pgs degraded, 9 pgs undersized
            2060 daemons have recently crashed

  services:
    mon: 3 daemons, quorum g,h,i (age 6d)
    mgr: a(active, since 6d), standbys: b
    osd: 270 osds: 267 up (since 4m), 268 in (since 25h)

  data:
    pools:   17 pools, 801 pgs
    objects: 22.32M objects, 85 TiB
    usage:   114 TiB used, 666 TiB / 780 TiB avail
    pgs:     301421/89274670 objects degraded (0.338%)
             792 active+clean
             6   active+undersized+degraded
             3   active+undersized

  io:
    client:   583 MiB/s rd, 1.3 GiB/s wr, 5.02k op/s rd, 9.51k op/s wr

  progress:
    Global Recovery Event (15h)
      [===========================.] (remaining: 9m)

Files

Actions

Also available in: Atom PDF