Project

General

Profile

Actions

Bug #63929

open

OSD crash after restart

Added by Thomas Way 4 months ago. Updated 4 months ago.

Status:
New
Priority:
Normal
Assignee:
-
Category:
OSD
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
2 - major
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

Please refer to https://github.com/rook/rook/issues/13487.

All 6 OSDs were operating normally and were restarted under normal conditions. Two of the six then failed to come back up because they keep crashing. This happened with v18.2.0, and an upgrade to v18.2.1 did not help.

bash-4.4$ ceph crash ls-new
ID                                                                ENTITY        NEW
2023-11-18T15:14:23.064216Z_c6bc39ff-c9ef-4043-af90-5d916440dad9  mgr.b          *
2023-11-21T14:58:17.164852Z_fe974ff7-c8d2-4551-8df4-75dd8177b1a0  mgr.a          *
2023-12-03T17:34:19.517719Z_b175f9ed-8f9c-46fa-af1e-3585c6d4d69c  client.admin   *
2023-12-04T13:11:25.070434Z_d8b7d85a-7cfd-43d9-b35c-1699b3a5db2b  mgr.a          *
2023-12-09T13:38:10.049138Z_9c4f1987-2fe4-413b-9f0d-95496548cf4a  mgr.b          *
2023-12-10T04:39:36.042563Z_b1c9dfed-99c0-4122-b9bc-363792fdf2ec  client.admin   *
2023-12-10T16:07:30.851895Z_d9b35b5c-bb52-4765-87a2-e2a988082cc1  mgr.b          *
2023-12-12T15:47:51.563400Z_819a4273-9cf7-4049-8f82-3439cfbb28a3  client.admin   *
2023-12-18T12:47:43.511245Z_b5696986-3db4-495a-888f-c8f838e5d60b  osd.1          *
2023-12-18T12:50:13.729233Z_28c01465-09a3-4c0f-952e-f60c54f9b533  osd.3          *
2023-12-19T08:58:45.784233Z_ebd5e7d6-d7a9-48c5-8a82-e58d2dcb22b5  client.admin   *
2023-12-23T21:02:34.346701Z_b5843e15-e9d6-4289-b951-c8dfee533768  client.admin   *
2023-12-23T21:27:09.011584Z_8f27f077-a8f8-4150-a13d-bc436239988c  mgr.b          *
2023-12-27T15:13:51.210294Z_1422cf95-80a9-4cfe-ad39-e7cbc4917322  mon.c          *
2023-12-27T17:01:50.752257Z_ae787b34-6eca-4f26-b611-4abf5e9e6ca6  osd.1          *
2023-12-27T17:01:58.136196Z_1fccab75-4a3b-48fd-bbf7-1dadf2f43dbb  osd.0          *
2023-12-27T18:03:20.736548Z_823a60f0-9e59-470e-b24c-d45dbe6c3386  client.admin   *
2024-01-01T23:39:13.818340Z_f33db1c3-f3b3-4438-96c6-d7c862bbefaa  osd.2          *
2024-01-01T23:40:04.931311Z_4a4d6ee6-c2c2-41fd-bfbc-7d18ad256bca  osd.2          *
2024-01-01T23:51:58.520444Z_3feccb3e-83f6-4205-9f55-f4fd3ec5ceb9  osd.2          *
2024-01-01T23:57:00.614429Z_7af6c825-5b29-4b0a-9798-a5b99a9c6de1  osd.3          *
2024-01-02T00:02:27.010008Z_0b36ae34-c2b9-4225-8be9-56b542412a43  osd.3          *
2024-01-02T00:13:23.926556Z_c6145f47-59d1-40c3-8e5f-125866b73aa0  osd.2          *
2024-01-02T00:14:34.108020Z_8aa1dc19-73f4-4778-ab55-63e226f49fbf  osd.2          *
bash-4.4$ ceph crash info 2024-01-02T00:14:34.108020Z_8aa1dc19-73f4-4778-ab55-63e226f49fbf
{
    "assert_condition": "buffers.count(i.first)",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc",
    "assert_func": "void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)",
    "assert_line": 75,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: In function 'void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)' thread 7f9d3428b700 time 2024-01-02T00:14:34.107570+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: 75: FAILED ceph_assert(buffers.count(i.first))\n",
    "assert_thread_name": "tp_osd_tp",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12cf0) [0x7f9d54905cf0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x18f) [0x5643948a96bb]",
        "ceph-osd(+0x62d827) [0x5643948a9827]",
        "(encode_and_write(pg_t, hobject_t const&, ECUtil::stripe_info_t const&, std::shared_ptr<ceph::ErasureCodeInterface>&, std::set<int, std::less<int>, std::allocator<int> > const&, unsigned long, ceph::buffer::v15_2_0::list, unsigned int, std::shared_ptr<ECUtil::HashInfo>, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>&, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, DoutPrefixProvider*)+0x327) [0x564394e4fff7]",
        "ceph-osd(+0xbd97ac) [0x564394e557ac]",
        "(ECTransaction::generate_transactions(ECTransaction::WritePlan&, std::shared_ptr<ceph::ErasureCodeInterface>&, pg_t, ECUtil::stripe_info_t const&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > > const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > >*, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, DoutPrefixProvider*, ceph_release_t)+0xdab) [0x564394e5779b]",
        "(ECBackend::try_reads_to_commit()+0x781) [0x564394e2c151]",
        "(ECBackend::check_ops()+0x24) [0x564394e2edd4]",
        "(ECBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0x2fe) [0x564394e317ee]",
        "(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0x421) [0x564394b06721]",
        "(PrimaryLogPG::execute_ctx(PrimaryLogPG::OpContext*)+0xd15) [0x564394b62bf5]",
        "(PrimaryLogPG::do_op(boost::intrusive_ptr<OpRequest>&)+0x281e) [0x564394b6994e]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x1c6) [0x5643949b9346]",
        "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x69) [0x564394cceea9]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x112f) [0x5643949cea9f]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x435) [0x564395080155]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x564395083354]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f9d548fb1ca]",
        "clone()" 
    ],
    "ceph_version": "18.2.0",
    "crash_id": "2024-01-02T00:14:34.108020Z_8aa1dc19-73f4-4778-ab55-63e226f49fbf",
    "entity_name": "osd.2",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "0f980ded1bd6ef5085755f52ef47d22c6960822447ae4a0d6977fd9e2e9df0d3",
    "timestamp": "2024-01-02T00:14:34.108020Z",
    "utsname_hostname": "rook-ceph-osd-2-84c56c8999-hw5h9",
    "utsname_machine": "x86_64",
    "utsname_release": "6.1.69-talos",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP PREEMPT_DYNAMIC Thu Dec 21 15:48:53 UTC 2023" 
}
bash-4.4$ ceph crash info 2024-01-01T23:57:00.614429Z_7af6c825-5b29-4b0a-9798-a5b99a9c6de1
{
    "assert_condition": "buffers.count(i.first)",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc",
    "assert_func": "void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)",
    "assert_line": 75,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: In function 'void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)' thread 7fe30035f700 time 2024-01-01T23:57:00.611018+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: 75: FAILED ceph_assert(buffers.count(i.first))\n",
    "assert_thread_name": "tp_osd_tp",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12cf0) [0x7fe3211dacf0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x18f) [0x55deb69006bb]",
        "ceph-osd(+0x62d827) [0x55deb6900827]",
        "(encode_and_write(pg_t, hobject_t const&, ECUtil::stripe_info_t const&, std::shared_ptr<ceph::ErasureCodeInterface>&, std::set<int, std::less<int>, std::allocator<int> > const&, unsigned long, ceph::buffer::v15_2_0::list, unsigned int, std::shared_ptr<ECUtil::HashInfo>, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>&, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, DoutPrefixProvider*)+0x327) [0x55deb6ea6ff7]",
        "ceph-osd(+0xbd97ac) [0x55deb6eac7ac]",
        "(ECTransaction::generate_transactions(ECTransaction::WritePlan&, std::shared_ptr<ceph::ErasureCodeInterface>&, pg_t, ECUtil::stripe_info_t const&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > > const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > >*, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, DoutPrefixProvider*, ceph_release_t)+0xdab) [0x55deb6eae79b]",
        "(ECBackend::try_reads_to_commit()+0x781) [0x55deb6e83151]",
        "(ECBackend::check_ops()+0x24) [0x55deb6e85dd4]",
        "(ECBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0x2fe) [0x55deb6e887ee]",
        "(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0x421) [0x55deb6b5d721]",
        "(PrimaryLogPG::execute_ctx(PrimaryLogPG::OpContext*)+0xd15) [0x55deb6bb9bf5]",
        "(PrimaryLogPG::do_op(boost::intrusive_ptr<OpRequest>&)+0x281e) [0x55deb6bc094e]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x1c6) [0x55deb6a10346]",
        "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x69) [0x55deb6d25ea9]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x112f) [0x55deb6a25a9f]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x435) [0x55deb70d7155]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55deb70da354]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fe3211d01ca]",
        "clone()" 
    ],
    "ceph_version": "18.2.0",
    "crash_id": "2024-01-01T23:57:00.614429Z_7af6c825-5b29-4b0a-9798-a5b99a9c6de1",
    "entity_name": "osd.3",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "0f980ded1bd6ef5085755f52ef47d22c6960822447ae4a0d6977fd9e2e9df0d3",
    "timestamp": "2024-01-01T23:57:00.614429Z",
    "utsname_hostname": "rook-ceph-osd-3-cc8cdd67b-p7trd",
    "utsname_machine": "x86_64",
    "utsname_release": "6.1.69-talos",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP PREEMPT_DYNAMIC Thu Dec 21 15:48:53 UTC 2023" 
}
Actions #1

Updated by Thomas Way 4 months ago

The OSDs were restarted hundreds of times, but that made no difference. It was in this state for at least a day. I then hard reset the host (power off), and the OSDs have come back normally?

Actions #2

Updated by Thomas Way 4 months ago

Okay, and now those two OSDs are crashing again? Seems like this happens when a new persistent volume is provisioned in Kubernetes?

Actions

Also available in: Atom PDF