Actions
Bug #63929
[open] OSD crash after restart
Status:
New
Priority:
Normal
Assignee:
-
Category:
OSD
Target version:
-
% Done:
0%
Source:
Tags:
Backport:
Regression:
No
Severity:
2 - major
Reviewed:
Description
Please refer to https://github.com/rook/rook/issues/13487.
All 6 OSDs were operating normally and were restarted under normal conditions. 2 of the 6 then failed to come back because they keep crashing. This happened with v18.2.0, and an upgrade to v18.2.1 did not help.
bash-4.4$ ceph crash ls-new
ID ENTITY NEW
2023-11-18T15:14:23.064216Z_c6bc39ff-c9ef-4043-af90-5d916440dad9 mgr.b *
2023-11-21T14:58:17.164852Z_fe974ff7-c8d2-4551-8df4-75dd8177b1a0 mgr.a *
2023-12-03T17:34:19.517719Z_b175f9ed-8f9c-46fa-af1e-3585c6d4d69c client.admin *
2023-12-04T13:11:25.070434Z_d8b7d85a-7cfd-43d9-b35c-1699b3a5db2b mgr.a *
2023-12-09T13:38:10.049138Z_9c4f1987-2fe4-413b-9f0d-95496548cf4a mgr.b *
2023-12-10T04:39:36.042563Z_b1c9dfed-99c0-4122-b9bc-363792fdf2ec client.admin *
2023-12-10T16:07:30.851895Z_d9b35b5c-bb52-4765-87a2-e2a988082cc1 mgr.b *
2023-12-12T15:47:51.563400Z_819a4273-9cf7-4049-8f82-3439cfbb28a3 client.admin *
2023-12-18T12:47:43.511245Z_b5696986-3db4-495a-888f-c8f838e5d60b osd.1 *
2023-12-18T12:50:13.729233Z_28c01465-09a3-4c0f-952e-f60c54f9b533 osd.3 *
2023-12-19T08:58:45.784233Z_ebd5e7d6-d7a9-48c5-8a82-e58d2dcb22b5 client.admin *
2023-12-23T21:02:34.346701Z_b5843e15-e9d6-4289-b951-c8dfee533768 client.admin *
2023-12-23T21:27:09.011584Z_8f27f077-a8f8-4150-a13d-bc436239988c mgr.b *
2023-12-27T15:13:51.210294Z_1422cf95-80a9-4cfe-ad39-e7cbc4917322 mon.c *
2023-12-27T17:01:50.752257Z_ae787b34-6eca-4f26-b611-4abf5e9e6ca6 osd.1 *
2023-12-27T17:01:58.136196Z_1fccab75-4a3b-48fd-bbf7-1dadf2f43dbb osd.0 *
2023-12-27T18:03:20.736548Z_823a60f0-9e59-470e-b24c-d45dbe6c3386 client.admin *
2024-01-01T23:39:13.818340Z_f33db1c3-f3b3-4438-96c6-d7c862bbefaa osd.2 *
2024-01-01T23:40:04.931311Z_4a4d6ee6-c2c2-41fd-bfbc-7d18ad256bca osd.2 *
2024-01-01T23:51:58.520444Z_3feccb3e-83f6-4205-9f55-f4fd3ec5ceb9 osd.2 *
2024-01-01T23:57:00.614429Z_7af6c825-5b29-4b0a-9798-a5b99a9c6de1 osd.3 *
2024-01-02T00:02:27.010008Z_0b36ae34-c2b9-4225-8be9-56b542412a43 osd.3 *
2024-01-02T00:13:23.926556Z_c6145f47-59d1-40c3-8e5f-125866b73aa0 osd.2 *
2024-01-02T00:14:34.108020Z_8aa1dc19-73f4-4778-ab55-63e226f49fbf osd.2 *
bash-4.4$ ceph crash info 2024-01-02T00:14:34.108020Z_8aa1dc19-73f4-4778-ab55-63e226f49fbf
{
"assert_condition": "buffers.count(i.first)",
"assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc",
"assert_func": "void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)",
"assert_line": 75,
"assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: In function 'void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)' thread 7f9d3428b700 time 2024-01-02T00:14:34.107570+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: 75: FAILED ceph_assert(buffers.count(i.first))\n",
"assert_thread_name": "tp_osd_tp",
"backtrace": [
"/lib64/libpthread.so.0(+0x12cf0) [0x7f9d54905cf0]",
"gsignal()",
"abort()",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x18f) [0x5643948a96bb]",
"ceph-osd(+0x62d827) [0x5643948a9827]",
"(encode_and_write(pg_t, hobject_t const&, ECUtil::stripe_info_t const&, std::shared_ptr<ceph::ErasureCodeInterface>&, std::set<int, std::less<int>, std::allocator<int> > const&, unsigned long, ceph::buffer::v15_2_0::list, unsigned int, std::shared_ptr<ECUtil::HashInfo>, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>&, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, DoutPrefixProvider*)+0x327) [0x564394e4fff7]",
"ceph-osd(+0xbd97ac) [0x564394e557ac]",
"(ECTransaction::generate_transactions(ECTransaction::WritePlan&, std::shared_ptr<ceph::ErasureCodeInterface>&, pg_t, ECUtil::stripe_info_t const&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > > const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > >*, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, DoutPrefixProvider*, ceph_release_t)+0xdab) [0x564394e5779b]",
"(ECBackend::try_reads_to_commit()+0x781) [0x564394e2c151]",
"(ECBackend::check_ops()+0x24) [0x564394e2edd4]",
"(ECBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0x2fe) [0x564394e317ee]",
"(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0x421) [0x564394b06721]",
"(PrimaryLogPG::execute_ctx(PrimaryLogPG::OpContext*)+0xd15) [0x564394b62bf5]",
"(PrimaryLogPG::do_op(boost::intrusive_ptr<OpRequest>&)+0x281e) [0x564394b6994e]",
"(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x1c6) [0x5643949b9346]",
"(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x69) [0x564394cceea9]",
"(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x112f) [0x5643949cea9f]",
"(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x435) [0x564395080155]",
"(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x564395083354]",
"/lib64/libpthread.so.0(+0x81ca) [0x7f9d548fb1ca]",
"clone()"
],
"ceph_version": "18.2.0",
"crash_id": "2024-01-02T00:14:34.108020Z_8aa1dc19-73f4-4778-ab55-63e226f49fbf",
"entity_name": "osd.2",
"os_id": "centos",
"os_name": "CentOS Stream",
"os_version": "8",
"os_version_id": "8",
"process_name": "ceph-osd",
"stack_sig": "0f980ded1bd6ef5085755f52ef47d22c6960822447ae4a0d6977fd9e2e9df0d3",
"timestamp": "2024-01-02T00:14:34.108020Z",
"utsname_hostname": "rook-ceph-osd-2-84c56c8999-hw5h9",
"utsname_machine": "x86_64",
"utsname_release": "6.1.69-talos",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP PREEMPT_DYNAMIC Thu Dec 21 15:48:53 UTC 2023"
}
bash-4.4$ ceph crash info 2024-01-01T23:57:00.614429Z_7af6c825-5b29-4b0a-9798-a5b99a9c6de1
{
"assert_condition": "buffers.count(i.first)",
"assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc",
"assert_func": "void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)",
"assert_line": 75,
"assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: In function 'void encode_and_write(pg_t, const hobject_t&, const ECUtil::stripe_info_t&, ceph::ErasureCodeInterfaceRef&, const std::set<int>&, uint64_t, ceph::bufferlist, uint32_t, ECUtil::HashInfoRef, extent_map&, std::map<shard_id_t, ceph::os::Transaction>*, DoutPrefixProvider*)' thread 7fe30035f700 time 2024-01-01T23:57:00.611018+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/18.2.0/rpm/el8/BUILD/ceph-18.2.0/src/osd/ECTransaction.cc: 75: FAILED ceph_assert(buffers.count(i.first))\n",
"assert_thread_name": "tp_osd_tp",
"backtrace": [
"/lib64/libpthread.so.0(+0x12cf0) [0x7fe3211dacf0]",
"gsignal()",
"abort()",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x18f) [0x55deb69006bb]",
"ceph-osd(+0x62d827) [0x55deb6900827]",
"(encode_and_write(pg_t, hobject_t const&, ECUtil::stripe_info_t const&, std::shared_ptr<ceph::ErasureCodeInterface>&, std::set<int, std::less<int>, std::allocator<int> > const&, unsigned long, ceph::buffer::v15_2_0::list, unsigned int, std::shared_ptr<ECUtil::HashInfo>, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>&, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, DoutPrefixProvider*)+0x327) [0x55deb6ea6ff7]",
"ceph-osd(+0xbd97ac) [0x55deb6eac7ac]",
"(ECTransaction::generate_transactions(ECTransaction::WritePlan&, std::shared_ptr<ceph::ErasureCodeInterface>&, pg_t, ECUtil::stripe_info_t const&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > > const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&, std::map<hobject_t, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge>, std::less<hobject_t>, std::allocator<std::pair<hobject_t const, interval_map<unsigned long, ceph::buffer::v15_2_0::list, bl_split_merge> > > >*, std::map<shard_id_t, ceph::os::Transaction, std::less<shard_id_t>, std::allocator<std::pair<shard_id_t const, ceph::os::Transaction> > >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, std::set<hobject_t, std::less<hobject_t>, std::allocator<hobject_t> >*, DoutPrefixProvider*, ceph_release_t)+0xdab) [0x55deb6eae79b]",
"(ECBackend::try_reads_to_commit()+0x781) [0x55deb6e83151]",
"(ECBackend::check_ops()+0x24) [0x55deb6e85dd4]",
"(ECBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0x2fe) [0x55deb6e887ee]",
"(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0x421) [0x55deb6b5d721]",
"(PrimaryLogPG::execute_ctx(PrimaryLogPG::OpContext*)+0xd15) [0x55deb6bb9bf5]",
"(PrimaryLogPG::do_op(boost::intrusive_ptr<OpRequest>&)+0x281e) [0x55deb6bc094e]",
"(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x1c6) [0x55deb6a10346]",
"(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x69) [0x55deb6d25ea9]",
"(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x112f) [0x55deb6a25a9f]",
"(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x435) [0x55deb70d7155]",
"(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55deb70da354]",
"/lib64/libpthread.so.0(+0x81ca) [0x7fe3211d01ca]",
"clone()"
],
"ceph_version": "18.2.0",
"crash_id": "2024-01-01T23:57:00.614429Z_7af6c825-5b29-4b0a-9798-a5b99a9c6de1",
"entity_name": "osd.3",
"os_id": "centos",
"os_name": "CentOS Stream",
"os_version": "8",
"os_version_id": "8",
"process_name": "ceph-osd",
"stack_sig": "0f980ded1bd6ef5085755f52ef47d22c6960822447ae4a0d6977fd9e2e9df0d3",
"timestamp": "2024-01-01T23:57:00.614429Z",
"utsname_hostname": "rook-ceph-osd-3-cc8cdd67b-p7trd",
"utsname_machine": "x86_64",
"utsname_release": "6.1.69-talos",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP PREEMPT_DYNAMIC Thu Dec 21 15:48:53 UTC 2023"
}
Updated by Thomas Way 4 months ago
The OSDs were restarted hundreds of times, and it made no difference. They were in this state for at least a day. I hard reset the host (power off) and the OSDs have come back normally?
Updated by Thomas Way 4 months ago
Okay, and now those two OSDs are crashing again? Seems like this happens when a new persistent volume is provisioned in Kubernetes?
Actions