Bug #59831
opencrash: void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*): assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to))
0%
75af2ed082be6a6c22eb7f04bd05cf98f4fbaf231abc9ff978b3684dacc5da0a
7b7aaff61eb3d064d2d3e9d204575affb0cc72f9424173ad80ce9a743f2eb0b7
db5932cf296c80a1c144a1aeda6e548ddad3e6295c21fea11903bc799f59b7cf
f9f22141ab4dd2eee0fc81814e484341f89b34a694a7637dd1b722e5c20a7759
Description
Assert condition: pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to)
Assert function: void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)
Sanitized backtrace:
ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*) ECBackend::handle_recovery_read_complete(hobject_t const&, boost::tuples::tuple<unsigned long, unsigned long, std::map<pg_shard_t, ceph::buffer::list, std::less<pg_shard_t>, std::allocator<std::pair<pg_shard_t const, ceph::buffer::list> > >, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type>&, std::optional<std::map<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::list, std::less<void>, std::allocator<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::list> > > >, RecoveryMessages*) OnRecoveryReadComplete::finish(std::pair<RecoveryMessages*, ECBackend::read_result_t&>&) ECBackend::complete_read_op(ECBackend::ReadOp&, RecoveryMessages*) ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, RecoveryMessages*, ZTracer::Trace const&) ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>) PGBackend::handle_message(boost::intrusive_ptr<OpRequest>) PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&) OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&) ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&) OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*) ShardedThreadPool::shardedthreadpool_worker(unsigned int) ShardedThreadPool::WorkThreadSharded::entry()
Crash dump sample:
{ "archived": "2023-04-11 12:16:54.864155", "assert_condition": "pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to)", "assert_file": "osd/ECBackend.cc", "assert_func": "void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)", "assert_line": 676, "assert_msg": "osd/ECBackend.cc: In function 'void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)' thread 7f8883209700 time 2023-04-11T10:10:46.038828+0100\nosd/ECBackend.cc: 676: FAILED ceph_assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to))", "assert_thread_name": "tp_osd_tp", "backtrace": [ "/lib64/libpthread.so.0(+0x12cf0) [0x7f88a424acf0]", "gsignal()", "abort()", "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x18f) [0x55b042d5468b]", "/usr/bin/ceph-osd(+0x5977f7) [0x55b042d547f7]", "(ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)+0x1e30) [0x55b0432ad8b0]", "(ECBackend::handle_recovery_read_complete(hobject_t const&, boost::tuples::tuple<unsigned long, unsigned long, std::map<pg_shard_t, ceph::buffer::v15_2_0::list, std::less<pg_shard_t>, std::allocator<std::pair<pg_shard_t const, ceph::buffer::v15_2_0::list> > >, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type>&, std::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > > >, RecoveryMessages*)+0x4c2) [0x55b0432ae052]", "(OnRecoveryReadComplete::finish(std::pair<RecoveryMessages*, ECBackend::read_result_t&>&)+0x71) 
[0x55b0432d3e21]", "(ECBackend::complete_read_op(ECBackend::ReadOp&, RecoveryMessages*)+0x8d) [0x55b04329e88d]", "(ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, RecoveryMessages*, ZTracer::Trace const&)+0xd1c) [0x55b0432b9f4c]", "(ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x2d4) [0x55b0432baa24]", "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x56) [0x55b04306d2c6]", "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x512) [0x55b042fff9f2]", "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x1c0) [0x55b042e55900]", "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x6d) [0x55b0431542ad]", "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x115f) [0x55b042e68dbf]", "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x435) [0x55b0435c68c5]", "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55b0435c8fe4]", "/lib64/libpthread.so.0(+0x81cf) [0x7f88a42401cf]", "clone()" ], "ceph_version": "17.2.5", "crash_id": "2023-04-11T09:10:46.064419Z_07dabafc-ac37-4c3d-a8bc-2d94e758acd9", "entity_name": "osd.5124c4347d97042388257a62052a332dbd538ede", "os_id": "almalinux", "os_name": "AlmaLinux", "os_version": "8.7 (Stone Smilodon)", "os_version_id": "8.7", "process_name": "ceph-osd", "stack_sig": "7b7aaff61eb3d064d2d3e9d204575affb0cc72f9424173ad80ce9a743f2eb0b7", "timestamp": "2023-04-11T09:10:46.064419Z", "utsname_machine": "x86_64", "utsname_release": "4.18.0-425.10.1.el8_7.x86_64", "utsname_sysname": "Linux", "utsname_version": "#1 SMP Thu Jan 12 10:05:00 EST 2023" }
Updated by 玮文 胡 6 months ago
We are hitting this on our production cluster (v17.2.6). Multiple OSDs are crash-looping, and recovery cannot progress. What can I do to help resolve this?
debug -158> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 202542 dequeue_op 0x55bacc82b080 prio 127 cost 0 latency 0.000110 MOSDECSubOpReadReply(19.7s1 202542/202541 ECSubReadReply(tid=0, attrs_read=0)) v2 pg pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] debug -157> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] _handle_message: MOSDECSubOpReadReply(19.7s1 202542/202541 ECSubReadReply(tid=0, attrs_read=0)) v2 debug -156> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_sub_read_reply: reply ECSubReadReply(tid=0, attrs_read=0) debug -155> 2023-10-22T06:27:26.376+0000 7fb19f595700 0 log_channel(cluster) log [WRN] : Error(s) ignored for 19:e0a22404:::10010679c05.00000000:head enough copies available debug -154> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 
(124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_sub_read_reply Error(s) ignored for 19:e0a22404:::10010679c05.00000000:head enough copies available debug -153> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_recovery_read_complete: returned 19:e0a22404:::10010679c05.00000000:head (0, 8388608, [3(9),67584, 4(8),67584, 13(1),67584, 34(7),67584, 37(6),67584, 47(5),67584, 49(3),67584, 50(2),67584]) debug -152> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_recovery_read_complete: [1,67584, 2,67584, 3,67584, 5,67584, 6,67584, 7,67584, 8,67584, 9,67584] debug -151> 2023-10-22T06:27:26.384+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) 
[38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] continue_recovery_op: continuing RecoveryOp(hoid=19:e0a22404:::10010679c05.00000000:head v=58663'98070 missing_on=38(0) missing_on_shards=0 recovery_info=ObjectRecoveryInfo(19:e0a22404:::10010679c05.00000000:head@58663'98070, size: 1049646, copy_subset: [], clone_subset: {}, snapset: 0=[]:{}, object_exist: 1) recovery_progress=ObjectRecoveryProgress(first, data_recovered_to:0, data_complete:false, omap_recovered_to:, omap_complete:true, error:false) obc refcount=2 state=READING waiting_on_pushes= extent_requested=0,8388608) debug -150> 2023-10-22T06:27:26.384+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] continue_recovery_op: before_progress=ObjectRecoveryProgress(first, data_recovered_to:0, data_complete:false, omap_recovered_to:, omap_complete:true, error:false), after_progress=ObjectRecoveryProgress(!first, data_recovered_to:1081344, data_complete:true, omap_recovered_to:, omap_complete:true, error:false), pop.data.length()=65536, size=1049646 debug -149> 2023-10-22T06:27:26.396+0000 7fb19f595700 -1 /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.6/rpm/el8/BUILD/ceph-17.2.6/src/osd/ECBackend.cc: In function 'void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)' thread 7fb19f595700 time 2023-10-22T06:27:26.390734+0000 
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.6/rpm/el8/BUILD/ceph-17.2.6/src/osd/ECBackend.cc: 676: FAILED ceph_assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to)) ceph version 17.2.6 (d7ff0d10654d2280e08f1ab989c7cdf3064446a5) quincy (stable)
Updated by 玮文 胡 6 months ago
More info about the pool
pool 19 'cephfs.cephfs.data_ec' erasure profile clay_profile size 10 min_size 9 crush_rule 4 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode warn last_change 201520 flags hashpspool,ec_overwrites,nearfull stripe_width 32768 application cephfs
Updated by 玮文 胡 6 months ago
Enabling osd_read_ec_check_for_errors does fix this (https://tracker.ceph.com/issues/55662), and it gives this log:
debug 2023-10-22T10:07:41.522+0000 7f3a37930700 0 osd.13 pg_epoch: 202591 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202587/202588 n=27385 ec=45495/45495 lis/c=202587/201867 les/c/f=202588/201868/108270 sis=202587) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202587 pi=[201867,202587)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={0={(0+0)=1},1={(1+0)=1},2={(1+0)=1},3={(1+0)=1},4={(0+0)=1},5={(1+0)=1},6={(1+0)=1},7={(1+0)=1},8={(1+0)=1},9={(1+0)=1}}] on_failed_pull 19:e0a22404:::10010679c05.00000000:head from shard 15(4), reps on 3(9),4(8),13(1),34(7),37(6),47(5),49(3),50(2) unfound? 0
But it should not crash with the default config. It is worth fixing.
Updated by dovefi Z 18 days ago
I had the same problem with version 14.2.21. Is there any progress?
2024-04-11 16:03:09.370 7f9eb0c4a700 0 log_channel(cluster) log [WRN] : Error(s) ignored for 4:2e74e020:::34lb5wmx66001777771919462895616_ryvoskbmfq_00000185_0001.000000000000000a:head enough copies available 2024-04-11 16:03:09.414 7f9eb0c4a700 -1 /home/build/Debian/WORKDIR/ceph-14.2.21.2-1+deb10u1/src/osd/ECBackend.cc: In function 'void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)' thread 7f9eb0c4a700 time 2024-04-11 16:03:09.413745 /home/build/Debian/WORKDIR/ceph-14.2.21.2-1+deb10u1/src/osd/ECBackend.cc: 648: FAILED ceph_assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to)) ceph version 14.2.21.2-1+deb10u1 (fdc0caac815eff968a0fca77e87a3658fcafcf4c) nautilus (stable) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x152) [0x55f13dffeefa] 2: (()+0x5140d2) [0x55f13dfff0d2] 3: (ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)+0x1c83) [0x55f13e414ac3] 4: (ECBackend::handle_recovery_read_complete(hobject_t const&, boost::tuples::tuple<unsigned long, unsigned long, std::map<pg_shard_t, ceph::buffer::v14_2_0::list, std::less<pg_shard_t>, std::allocator<std::pair<pg_shard_t const, ceph::buffer::v14_2_0::list> > >, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type>&, boost::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v14_2_0::list, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v14_2_0::list> > > >, RecoveryMessages*)+0x744) [0x55f13e415444] 5: (OnRecoveryReadComplete::finish(std::pair<RecoveryMessages*, 
ECBackend::read_result_t&>&)+0x6d) [0x55f13e435d6d] 6: (ECBackend::complete_read_op(ECBackend::ReadOp&, RecoveryMessages*)+0x8f) [0x55f13e4090bf] 7: (ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, RecoveryMessages*, ZTracer::Trace const&)+0xe79) [0x55f13e420669] 8: (ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x1a4) [0x55f13e420f04] 9: (PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x57) [0x55f13e309437] 10: (PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x61f) [0x55f13e2b791f] 11: (OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x392) [0x55f13e0e3932] 12: (PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x62) [0x55f13e3881c2] 13: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x7d7) [0x55f13e0ff677] 14: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b4) [0x55f13e6d4244] 15: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55f13e6d6c50] 16: (()+0x7fa3) [0x7f9ed4175fa3] 17: (clone()+0x3f) [0x7f9ed3d2606f]