Project

General

Profile

Actions

Bug #59831

open

crash: void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*): assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to))

Added by Telemetry Bot 12 months ago. Updated 18 days ago.

Status:
New
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
Telemetry
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):

75af2ed082be6a6c22eb7f04bd05cf98f4fbaf231abc9ff978b3684dacc5da0a
7b7aaff61eb3d064d2d3e9d204575affb0cc72f9424173ad80ce9a743f2eb0b7
db5932cf296c80a1c144a1aeda6e548ddad3e6295c21fea11903bc799f59b7cf
f9f22141ab4dd2eee0fc81814e484341f89b34a694a7637dd1b722e5c20a7759


Description

http://telemetry.front.sepia.ceph.com:4000/d/jByk5HaMz/crash-spec-x-ray?orgId=1&var-sig_v2=9142b126ec0e6395147219ea2491c7ff242089468db66b588f9b7aef16759c8f

Assert condition: pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to)
Assert function: void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)

Sanitized backtrace:

    ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)
    ECBackend::handle_recovery_read_complete(hobject_t const&, boost::tuples::tuple<unsigned long, unsigned long, std::map<pg_shard_t, ceph::buffer::list, std::less<pg_shard_t>, std::allocator<std::pair<pg_shard_t const, ceph::buffer::list> > >, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type>&, std::optional<std::map<std::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::list, std::less<void>, std::allocator<std::pair<std::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::list> > > >, RecoveryMessages*)
    OnRecoveryReadComplete::finish(std::pair<RecoveryMessages*, ECBackend::read_result_t&>&)
    ECBackend::complete_read_op(ECBackend::ReadOp&, RecoveryMessages*)
    ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, RecoveryMessages*, ZTracer::Trace const&)
    ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)
    PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)
    PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)
    OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)
    ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)
    OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)
    ShardedThreadPool::shardedthreadpool_worker(unsigned int)
    ShardedThreadPool::WorkThreadSharded::entry()

Crash dump sample:
{
    "archived": "2023-04-11 12:16:54.864155",
    "assert_condition": "pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to)",
    "assert_file": "osd/ECBackend.cc",
    "assert_func": "void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)",
    "assert_line": 676,
    "assert_msg": "osd/ECBackend.cc: In function 'void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)' thread 7f8883209700 time 2023-04-11T10:10:46.038828+0100\nosd/ECBackend.cc: 676: FAILED ceph_assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to))",
    "assert_thread_name": "tp_osd_tp",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12cf0) [0x7f88a424acf0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x18f) [0x55b042d5468b]",
        "/usr/bin/ceph-osd(+0x5977f7) [0x55b042d547f7]",
        "(ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)+0x1e30) [0x55b0432ad8b0]",
        "(ECBackend::handle_recovery_read_complete(hobject_t const&, boost::tuples::tuple<unsigned long, unsigned long, std::map<pg_shard_t, ceph::buffer::v15_2_0::list, std::less<pg_shard_t>, std::allocator<std::pair<pg_shard_t const, ceph::buffer::v15_2_0::list> > >, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type>&, std::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > > >, RecoveryMessages*)+0x4c2) [0x55b0432ae052]",
        "(OnRecoveryReadComplete::finish(std::pair<RecoveryMessages*, ECBackend::read_result_t&>&)+0x71) [0x55b0432d3e21]",
        "(ECBackend::complete_read_op(ECBackend::ReadOp&, RecoveryMessages*)+0x8d) [0x55b04329e88d]",
        "(ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, RecoveryMessages*, ZTracer::Trace const&)+0xd1c) [0x55b0432b9f4c]",
        "(ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x2d4) [0x55b0432baa24]",
        "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x56) [0x55b04306d2c6]",
        "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x512) [0x55b042fff9f2]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x1c0) [0x55b042e55900]",
        "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x6d) [0x55b0431542ad]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x115f) [0x55b042e68dbf]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x435) [0x55b0435c68c5]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55b0435c8fe4]",
        "/lib64/libpthread.so.0(+0x81cf) [0x7f88a42401cf]",
        "clone()" 
    ],
    "ceph_version": "17.2.5",
    "crash_id": "2023-04-11T09:10:46.064419Z_07dabafc-ac37-4c3d-a8bc-2d94e758acd9",
    "entity_name": "osd.5124c4347d97042388257a62052a332dbd538ede",
    "os_id": "almalinux",
    "os_name": "AlmaLinux",
    "os_version": "8.7 (Stone Smilodon)",
    "os_version_id": "8.7",
    "process_name": "ceph-osd",
    "stack_sig": "7b7aaff61eb3d064d2d3e9d204575affb0cc72f9424173ad80ce9a743f2eb0b7",
    "timestamp": "2023-04-11T09:10:46.064419Z",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-425.10.1.el8_7.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Thu Jan 12 10:05:00 EST 2023" 
}

Actions #1

Updated by Telemetry Bot 12 months ago

  • Crash signature (v1) updated (diff)
  • Crash signature (v2) updated (diff)
  • Affected Versions v17.2.3, v17.2.5 added
Actions #2

Updated by 玮文 胡 6 months ago

We are hitting this in our production server (v17.2.6). Multiple OSDs crash loop. Recovery cannot progress. What can I do to help resolve this?

debug   -158> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 202542 dequeue_op 0x55bacc82b080 prio 127 cost 0 latency 0.000110 MOSDECSubOpReadReply(19.7s1 202542/202541 ECSubReadReply(tid=0, attrs_read=0)) v2 pg pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}]
debug   -157> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] _handle_message: MOSDECSubOpReadReply(19.7s1 202542/202541 ECSubReadReply(tid=0, attrs_read=0)) v2
debug   -156> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_sub_read_reply: reply ECSubReadReply(tid=0, attrs_read=0)
debug   -155> 2023-10-22T06:27:26.376+0000 7fb19f595700  0 log_channel(cluster) log [WRN] : Error(s) ignored for 19:e0a22404:::10010679c05.00000000:head enough copies available
debug   -154> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_sub_read_reply Error(s) ignored for 19:e0a22404:::10010679c05.00000000:head enough copies available
debug   -153> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_recovery_read_complete: returned 19:e0a22404:::10010679c05.00000000:head (0, 8388608, [3(9),67584, 4(8),67584, 13(1),67584, 34(7),67584, 37(6),67584, 47(5),67584, 49(3),67584, 50(2),67584])
debug   -152> 2023-10-22T06:27:26.376+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] handle_recovery_read_complete: [1,67584, 2,67584, 3,67584, 5,67584, 6,67584, 7,67584, 8,67584, 9,67584]
debug   -151> 2023-10-22T06:27:26.384+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] continue_recovery_op: continuing RecoveryOp(hoid=19:e0a22404:::10010679c05.00000000:head v=58663'98070 missing_on=38(0) missing_on_shards=0 recovery_info=ObjectRecoveryInfo(19:e0a22404:::10010679c05.00000000:head@58663'98070, size: 1049646, copy_subset: [], clone_subset: {}, snapset: 0=[]:{}, object_exist: 1) recovery_progress=ObjectRecoveryProgress(first, data_recovered_to:0, data_complete:false, omap_recovered_to:, omap_complete:true, error:false) obc refcount=2 state=READING waiting_on_pushes= extent_requested=0,8388608)
debug   -150> 2023-10-22T06:27:26.384+0000 7fb19f595700 10 osd.13 pg_epoch: 202542 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202541/202542 n=27385 ec=45495/45495 lis/c=202541/201867 les/c/f=202542/201868/108270 sis=202541) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202541 pi=[201867,202541)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={}] continue_recovery_op: before_progress=ObjectRecoveryProgress(first, data_recovered_to:0, data_complete:false, omap_recovered_to:, omap_complete:true, error:false), after_progress=ObjectRecoveryProgress(!first, data_recovered_to:1081344, data_complete:true, omap_recovered_to:, omap_complete:true, error:false), pop.data.length()=65536, size=1049646
debug   -149> 2023-10-22T06:27:26.396+0000 7fb19f595700 -1 /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.6/rpm/el8/BUILD/ceph-17.2.6/src/osd/ECBackend.cc: In function 'void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)' thread 7fb19f595700 time 2023-10-22T06:27:26.390734+0000
/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.6/rpm/el8/BUILD/ceph-17.2.6/src/osd/ECBackend.cc: 676: FAILED ceph_assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to))
 ceph version 17.2.6 (d7ff0d10654d2280e08f1ab989c7cdf3064446a5) quincy (stable)
Actions #3

Updated by 玮文 胡 6 months ago

More info about the pool

pool 19 'cephfs.cephfs.data_ec' erasure profile clay_profile size 10 min_size 9 crush_rule 4 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode warn last_change 201520 flags hashpspool,ec_overwrites,nearfull stripe_width 32768 application cephfs

Actions #4

Updated by 玮文 胡 6 months ago

Enabling osd_read_ec_check_for_errors does fix this (https://tracker.ceph.com/issues/55662). It gives this log:

debug 2023-10-22T10:07:41.522+0000 7f3a37930700 0 osd.13 pg_epoch: 202591 pg[19.7s1( v 125574'572280 (124532'570350,125574'572280] local-lis/les=202587/202588 n=27385 ec=45495/45495 lis/c=202587/201867 les/c/f=202588/201868/108270 sis=202587) [38,13,50,49,NONE,47,37,34,4,3]/[NONE,13,50,49,15,47,37,34,4,3]p13(1) backfill=[38(0)] r=1 lpr=202587 pi=[201867,202587)/1 crt=125574'572280 lcod 0'0 mlcod 0'0 active+undersized+degraded+remapped+backfilling rops=1 mbc={0={(0+0)=1},1={(1+0)=1},2={(1+0)=1},3={(1+0)=1},4={(0+0)=1},5={(1+0)=1},6={(1+0)=1},7={(1+0)=1},8={(1+0)=1},9={(1+0)=1}}] on_failed_pull 19:e0a22404:::10010679c05.00000000:head from shard 15(4), reps on 3(9),4(8),13(1),34(7),37(6),47(5),49(3),50(2) unfound? 0

But it should not crash with the default config. It is worth fixing.

Actions #5

Updated by dovefi Z 18 days ago

I had the same problem with version 14.2.21. Is there any progress?

2024-04-11 16:03:09.370 7f9eb0c4a700  0 log_channel(cluster) log [WRN] : Error(s) ignored for 4:2e74e020:::34lb5wmx66001777771919462895616_ryvoskbmfq_00000185_0001.000000000000000a:head enough copies available
2024-04-11 16:03:09.414 7f9eb0c4a700 -1 /home/build/Debian/WORKDIR/ceph-14.2.21.2-1+deb10u1/src/osd/ECBackend.cc: In function 'void ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)' thread 7f9eb0c4a700 time 2024-04-11 16:03:09.413745
/home/build/Debian/WORKDIR/ceph-14.2.21.2-1+deb10u1/src/osd/ECBackend.cc: 648: FAILED ceph_assert(pop.data.length() == sinfo.aligned_logical_offset_to_chunk_offset( after_progress.data_recovered_to - op.recovery_progress.data_recovered_to))

 ceph version 14.2.21.2-1+deb10u1 (fdc0caac815eff968a0fca77e87a3658fcafcf4c) nautilus (stable)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x152) [0x55f13dffeefa]
 2: (()+0x5140d2) [0x55f13dfff0d2]
 3: (ECBackend::continue_recovery_op(ECBackend::RecoveryOp&, RecoveryMessages*)+0x1c83) [0x55f13e414ac3]
 4: (ECBackend::handle_recovery_read_complete(hobject_t const&, boost::tuples::tuple<unsigned long, unsigned long, std::map<pg_shard_t, ceph::buffer::v14_2_0::list, std::less<pg_shard_t>, std::allocator<std::pair<pg_shard_t const, ceph::buffer::v14_2_0::list> > >, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type, boost::tuples::null_type>&, boost::optional<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v14_2_0::list, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v14_2_0::list> > > >, RecoveryMessages*)+0x744) [0x55f13e415444]
 5: (OnRecoveryReadComplete::finish(std::pair<RecoveryMessages*, ECBackend::read_result_t&>&)+0x6d) [0x55f13e435d6d]
 6: (ECBackend::complete_read_op(ECBackend::ReadOp&, RecoveryMessages*)+0x8f) [0x55f13e4090bf]
 7: (ECBackend::handle_sub_read_reply(pg_shard_t, ECSubReadReply&, RecoveryMessages*, ZTracer::Trace const&)+0xe79) [0x55f13e420669]
 8: (ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x1a4) [0x55f13e420f04]
 9: (PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x57) [0x55f13e309437]
 10: (PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x61f) [0x55f13e2b791f]
 11: (OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x392) [0x55f13e0e3932]
 12: (PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x62) [0x55f13e3881c2]
 13: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x7d7) [0x55f13e0ff677]
 14: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5b4) [0x55f13e6d4244]
 15: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55f13e6d6c50]
 16: (()+0x7fa3) [0x7f9ed4175fa3]
 17: (clone()+0x3f) [0x7f9ed3d2606f]

Actions

Also available in: Atom PDF