Project

General

Profile

Bug #51835

mgr/DaemonServer.cc: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)

Added by Neha Ojha over 2 years ago. Updated over 1 year ago.

Status:
Resolved
Priority:
High
Assignee:
Category:
-
Target version:
-
% Done:

100%

Source:
Tags:
Backport:
pacific,quincy
Regression:
No
Severity:
3 - minor
Reviewed:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):

0998b43e2acf7885c0d520cb04bcac94422785326e2c3e613066c40ddbb222d1
172fd5df40a73f4369e7240613305d57096a0a8bfae90c9e74170eff9b7065d8
22f8111eb1ecee808a966f7952c522517b474189ea109b8744d1a006679ec1ab
38d8ea189f97870776c66efe71ffe388f9d8855c0ef27d8f41f51ed78d985371
3ec3e614ab29042467cd5a3774212f599db755986c38bc99d273a7070238f8dc
6f471308ed9ea4def78d4fdaf5d0ed984f759d2d50f0728b0990ffaab316fa17
aa3ccd51d5334df9e1e6472bcb9c51691650124acbe5d28d1759e0f9c13c079e
ab7b87d7633452843f4b37db73dbeec41fb42106a7ab602a89e49c88079ec2a4
d9a326986998843fd7fa1046b32bdf61be94a57e6aa0eb7265e5003114c315d8
010f090deced4e07677216b8f73f9adf35da584958503ae9cdd3aca63176cc0e
03f1cb3aeae4e9c82276d8c7d804cab48f6a1fae461b76621df8a87b3f9441b0
0579e12b2e960b1216257cd09934646758fcbc8db5f9f90b20ec6716ad4612c7
08454dd997ad3f835a09061d94928a36e63d08875b345dd9a8b090175e780eb1
119483325445ee6b0222f07c3a29a7568aea59bf293483cadbd3e596740df9ca
2125f03c1c250f75efa8ca24e5565de389c5ead2807c35ca592ddfd09ef3beb2
216260e18340ab112ca2e24c6cce7a599f42ddaa4811e183e8a428de53519438
2acd8a68926ac158e7fb96d0ec5acc5226bb665f795127678b699940cdea0b0c
2bc2a283a1f07f4fe708eede4d77e297cda40c43e22f64c835f0cbabe3f2fad8
36f27cd98e50a7ae81510e8949eba0d35a85a304d94e8e06403816dcbc7cbcf9
3881d5239ce2e48613a19e9b24199ac27dd335b09b3fae8deb138b7ccd099116
44b59051162226151391720a111bdc3ca4556e5a3e9bc276003e7a2c3afb4fa0
471d1d5136e84229dda6bc6f5568446960c03af390d96c359320fe3dd614678d
4845cc4bb3a81221e02013cd11d4384f719e89a18b894341ae93a271bdec9ba9
4b2a8f420d6481018a341b31d292c3cf12547ac393464666f4d0f9c86572bbfd
5186dc8fb989a114ea41e7ea01d107d06eb548e22b8967fea47a6fe97a2130f8
573da45aa64eebe9407cc8f3baa2c3f92c8b217e8cf99290cd9f53f5795356ab
58ffe0ed32df13b379245c260f85fa5db0189c99672650455a12c7569436dbc2
5aa913349c01afce61375c61c2014d58bbcab17053e6e9aebebc7412f3006365
5cc10a36dc4d4c48624252264847bdee83293ecc05b2ce66e9d31335a7bd5685
6c807ebc2eb228ad195a6b877c011da087e7633c1de5ffb4403bb34b3fdab378
719bcddca2a4bec4547d78aba06dcddcd8d84c1f84b268b541b1cc7d1b83107f
722a04ce791dcb923bcd32a67cd999360f24d2fea3181b3ac59ceedf4404f9f4
757d2951cb24701e0df318dcb1910d4dd560a77e71feca7bdb87ee893339151f
77e7c9f923007075d79e037449c7ee310fc0adcee2beeed7789068f2dc3cb542
77f8d194f75ebfa82427bee0695e00cbda772ff5f157545562ad69173e925348
85f9365bb64dc9def14ef196a932f782c134abfc2c7f6ed339afc807a5b6664b
86ab3a48e25206c884ac52034d1562c68826296d4fcbe7ceff0b9d8b9b4a56a1
8ece4f831f667493b84a70f0c9ab943d0efd126cf75e1c1b4f6ad08473195b55
9811463f7f12133066e34599ad54588574e20d191d42784dd69e18e0a0f67624
a1f65294bbb5a6473b79e510d113d5efe2b96703ebd2aed3905834f809b2aef0
a33906a4f60e1a3487349bc991bd0015ad8d1af964ca58eda7939014c15b1e07
a378e297b1c2429234b1f5597574b6bae1335c230bc33c4b5f6982e18cc114f7
ae21f660fb7d97495a00a4b94bf019f276f8ea2473979e4e7d8b0013db31b71c
b77960cb48ad305eba84eb735ff0a39cb7e7fb33017f2258217c6b8c6c7cc2fe
c781c96fc53925fe6567f13fd16d417ada1a38da36da7b832edd0492c57a809b
c90c67a2306d68817429c8c6e8bf7d3c9f2dea525b79782fcb15b4409a588e8e
c94a82229cbb8cc0953e1ecf4d8860f245890e2b4dd7799d5c7ed92f4958cfeb
c98537b8ae9cf9319e5b58113f7ac511c271fc331337529972ec97ad55f0341a
d1ffe583619ba0dd788128d94faf5dfa64efa21e9662f2f5065f171574518305
d483574e4cc9e637f6ca396dddad10e9a4c8117a8ddaec832708f1a51f47566d
d76259ab9b4a4291a4dced19967ec1041647cfd85d74b6ff30149b593f61eded
d7798dde063b5c016b1f8ed2284c422edf2af675a1d4a914abb9257cb113d2ea
dbf50cff7b7e9bbe2110ed80c258b726423301516b8e02ede6436ab777d1c0e9
de2498c3032d41642ac36ba7affe311481f4538d927e5dcac6bdaf4e43ce3087
e0af9aabe0a92669a10b6515af920a9107050fb8c507af088afeb24b3413dfe4
e5f8937e296fe97ca4d30ae0a0e8549357dd12595b65ac379980f6c368e3ee10
ead60d0b8fa94de09f455f43da79d9266b11449956e822c3e2d6a93552721a9e
f3b0fc4ad4132bf76d5e282b4146934157f37eac3251796da02058948dc6cd00
f7ed0de76267efa63188a1bc3a41c45c9e72d817cfcb89fe93530eeab5c8f28f
55ad203b14ba86896fb522b12a64c16ae6acefd0736d21d85ee042e9ff121474
5d711175d9ef3767a3e8f9de1a229853f45300f71d5d966e94bc9ffa6360673b
8a3d1b78a86a0b3da72d6517ad4be01464e8c9cc90aa982d0270b24f4552f129
914e2249b05186689356e444aca6d119e10f57e680701a8bc92a38dab7c35265
9d5766a6932fb99f1b25741b2eca53c1a6186f96f950c77cf5baeffeb09264fa
a1e8e2439f4cd325f3e6fedfa1f1f4859214cb2295161b455ecc588de71a3a31
aadd01c71217d6f3c81092cc84f1ae5fae2c0d19a4ba9ccd0ce45eb43f60348d
ad475ad57f22fdb3eaaaa6e101ae9b1cacf722804ff5562cff4a344e7ca3d4e8
b258614b124d8cbeb4370e302ae9c4a94b38f0a47dd56da09078e217377615da
b69cd8a5461aab728060adedb96c7130f3d9328472b5ee0e755fa21a35afc676
fbf3b67b60d77540a1ba06a2a33f6358f5e5363616c7af100084576b6db7d6d9
0fbede8a6ef204a08f3588700df3836dd017eb861974192bb1252bb6c901ebd2
12bed7213ee1bd2d12f032130c0158d64ed18585022a3cf4c1dfd4cff6e33a0e
e776dcb4d083d1521a5b7a3653719c342af04c042e68cbf3f37ece09cf4f9ca7
f8c7af4e590c2b697ceff6fb7fde5c5b90db3ec18551f854c95965fdd88a287a
fe2f1f256f7d9479267f8213c9dc440bf67a5f2a5b285d5ea3f29dd9d94ffd57


Description

{
    "os_id": "centos",
    "os_name": "CentOS Linux",
    "archived": "2021-04-28 11:23:38.548252",
    "crash_id": "2021-04-28T10:56:41.541431Z_ba9764d3-8860-427d-bc94-7f8333db7a3f",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12b20) [0x7fc69e5f6b20]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a9) [0x7fc69fa0b52d]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x2766f6) [0x7fc69fa0b6f6]",
        "(DaemonServer::got_service_map()+0xb2d) [0x55d30d90886d]",
        "(Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0x1b6) [0x55d30d9374a6]",
        "(Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x894) [0x55d30d93a0e4]",
        "(MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xa5) [0x55d30d943695]",
        "(DispatchQueue::entry()+0x126a) [0x7fc69fc463fa]",
        "(DispatchQueue::DispatchThread::entry()+0x11) [0x7fc69fcf60f1]",
        "/lib64/libpthread.so.0(+0x814a) [0x7fc69e5ec14a]",
        "clone()" 
    ],
    "stack_sig": "3ec3e614ab29042467cd5a3774212f599db755986c38bc99d273a7070238f8dc",
    "timestamp": "2021-04-28T10:56:41.541431Z",
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.1/rpm/el8/BUILD/ceph-16.2.1/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7fc6961e4700 time 2021-04-28T10:56:41.536299+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.1/rpm/el8/BUILD/ceph-16.2.1/src/mgr/DaemonServer.cc: 2925: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)\n",
    "os_version": "8",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.1/rpm/el8/BUILD/ceph-16.2.1/src/mgr/DaemonServer.cc",
    "assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
    "assert_line": 2925,
    "entity_name": "mgr.05334b5141c222302e04d9cc04d44e194e47a598",
    "ceph_version": "16.2.1",
    "process_name": "ceph-mgr",
    "os_version_id": "8",
    "utsname_machine": "x86_64",
    "utsname_release": "4.19.0-14-amd64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Debian 4.19.171-2 (2021-01-30)",
    "assert_condition": "pending_service_map.epoch > service_map.epoch",
    "assert_thread_name": "ms_dispatch" 
}

Related issues

Related to mgr - Bug #48022: mgr/DaemonServer.cc: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch) Resolved
Copied to mgr - Backport #56053: pacific: mgr/DaemonServer.cc: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch) Resolved
Copied to mgr - Backport #56096: quincy: mgr/DaemonServer.cc: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch) Resolved

History

#1 Updated by Neha Ojha over 2 years ago

  • Related to Bug #48022: mgr/DaemonServer.cc: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch) added

#2 Updated by Neha Ojha over 2 years ago

This happened in a version which has the fix for https://tracker.ceph.com/issues/48022.

#3 Updated by Sage Weil over 2 years ago

  • Related to Bug #51929: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch) added

#4 Updated by Sage Weil over 2 years ago

  • Duplicated by Bug #51916: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch) added

#5 Updated by Sage Weil over 2 years ago

  • Related to Bug #51926: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch) added

#6 Updated by Sage Weil over 2 years ago

  • Duplicated by Bug #51922: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch) added

#7 Updated by Sage Weil over 2 years ago

  • Related to deleted (Bug #51926: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch))

#8 Updated by Sage Weil over 2 years ago

  • Duplicated by Bug #51926: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch) added

#9 Updated by Sage Weil over 2 years ago

  • Duplicated by Bug #51913: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch) added

#10 Updated by Sage Weil over 2 years ago

  • Related to deleted (Bug #51929: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch))

#11 Updated by Sage Weil over 2 years ago

  • Duplicated by Bug #51929: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch) added

#12 Updated by Sage Weil over 2 years ago

  • Duplicated by deleted (Bug #51916: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch))

#13 Updated by Sage Weil over 2 years ago

  • Duplicated by deleted (Bug #51922: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch))

#14 Updated by Sage Weil over 2 years ago

  • Duplicated by deleted (Bug #51926: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch))

#15 Updated by Sage Weil over 2 years ago

  • Duplicated by deleted (Bug #51929: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch))

#16 Updated by Sage Weil over 2 years ago

  • Duplicated by deleted (Bug #51913: crash: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>: assert(pending_service_map.epoch > service_map.epoch))

#17 Updated by Sage Weil over 2 years ago

  • Status changed from New to Can't reproduce

My theory is that this affected the mgr daemon during the upgrade process, while the mon was still running octopus. (The fix for the original bug included both a mgr fix and a mon patch.) Let's see if this pops up again; if so, we can dig deeper into the raw telemetry to see whether it coincides with an upgrade.

#19 Updated by Neha Ojha over 2 years ago

  • Status changed from Can't reproduce to New
[ceph: root@magna031 /]# ceph crash info 2021-11-09T16:58:47.494357Z_13443875-1308-4c2a-8be8-9d0bfad08681
{
    "assert_condition": "pending_service_map.epoch > service_map.epoch",
    "assert_file": "/builddir/build/BUILD/ceph-16.2.6/src/mgr/DaemonServer.cc",
    "assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
    "assert_line": 2934,
    "assert_msg": "/builddir/build/BUILD/ceph-16.2.6/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7f438e385700 time 2021-11-09T16:58:47.491291+0000\n/builddir/build/BUILD/ceph-16.2.6/src/mgr/DaemonServer.cc: 2934: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)\n",
    "assert_thread_name": "ms_dispatch",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12b20) [0x7f43967a1b20]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a9) [0x7f4397970def]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x276fb8) [0x7f4397970fb8]",
        "(DaemonServer::got_service_map()+0xb2d) [0x55ceda2060ed]",
        "(Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0x1b6) [0x55ceda234e36]",
        "(Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x894) [0x55ceda237b04]",
        "(MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xb0) [0x55ceda2416f0]",
        "(DispatchQueue::entry()+0x126a) [0x7f4397bae66a]",
        "(DispatchQueue::DispatchThread::entry()+0x11) [0x7f4397c5ef81]",
        "/lib64/libpthread.so.0(+0x814a) [0x7f439679714a]",
        "clone()" 
    ],
    "ceph_version": "16.2.6-20.el8cp",
    "crash_id": "2021-11-09T16:58:47.494357Z_13443875-1308-4c2a-8be8-9d0bfad08681",
    "entity_name": "mgr.magna006.vxieja",
    "os_id": "rhel",
    "os_name": "Red Hat Enterprise Linux",
    "os_version": "8.4 (Ootpa)",
    "os_version_id": "8.4",
    "process_name": "ceph-mgr",
    "stack_sig": "cd47423882a28f890d30b420551e2a67e76e9ce5432da2dcbce2e69e2213a2bb",
    "timestamp": "2021-11-09T16:58:47.494357Z",
    "utsname_hostname": "magna006",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-305.el8.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Thu Apr 29 08:54:30 EDT 2021" 
}

This happened in 16.2.6 based on https://bugzilla.redhat.com/show_bug.cgi?id=1984881#c13

#20 Updated by Telemetry Bot about 2 years ago

  • Crash signature (v1) updated (diff)
  • Crash signature (v2) updated (diff)
  • Affected Versions v16.1.0, v16.2.0, v16.2.1, v16.2.2, v16.2.4, v16.2.5, v16.2.6, v16.2.7 added

http://telemetry.front.sepia.ceph.com:4000/d/jByk5HaMz/crash-spec-x-ray?orgId=1&var-sig_v2=172fd5df40a73f4369e7240613305d57096a0a8bfae90c9e74170eff9b7065d8

Assert condition: pending_service_map.epoch > service_map.epoch
Assert function: DaemonServer::got_service_map()::<lambda(const ServiceMap&)>

Sanitized backtrace:

    /lib64/libpthread.so.0(
    /usr/lib64/ceph/libceph-common.so.2(
    DaemonServer::got_service_map()
    Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)
    Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)
    MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)
    DispatchQueue::entry()
    DispatchQueue::DispatchThread::entry()
    /lib64/libpthread.so.0(
    clone()

Crash dump sample:
{
    "archived": "2022-03-05 08:06:21.796494",
    "assert_condition": "pending_service_map.epoch > service_map.epoch",
    "assert_file": "mgr/DaemonServer.cc",
    "assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
    "assert_line": 2934,
    "assert_msg": "mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7ff3f1a21700 time 2022-03-05T14:41:03.715505+0700\nmgr/DaemonServer.cc: 2934: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)",
    "assert_thread_name": "ms_dispatch",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12c20) [0x7ff3f9429c20]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a9) [0x7ff3fa5fcba3]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x276d6c) [0x7ff3fa5fcd6c]",
        "(DaemonServer::got_service_map()+0xb2d) [0x557973e23fdd]",
        "(Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0x1b6) [0x557973e52cb6]",
        "(Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x894) [0x557973e55984]",
        "(MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xa5) [0x557973e5f925]",
        "(DispatchQueue::entry()+0x126a) [0x7ff3fa840aba]",
        "(DispatchQueue::DispatchThread::entry()+0x11) [0x7ff3fa8f25d1]",
        "/lib64/libpthread.so.0(+0x817a) [0x7ff3f941f17a]",
        "clone()" 
    ],
    "ceph_version": "16.2.7",
    "crash_id": "2022-03-05T07:41:03.717812Z_78da9a2c-57c6-464d-a13e-a93423b91c58",
    "entity_name": "mgr.af8f19c9cd2b637d2255a58d4c9f3c0965d78c0d",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-mgr",
    "stack_sig": "0998b43e2acf7885c0d520cb04bcac94422785326e2c3e613066c40ddbb222d1",
    "timestamp": "2022-03-05T07:41:03.717812Z",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-348.2.1.el8_5.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Tue Nov 16 14:42:35 UTC 2021" 
}

#21 Updated by Telemetry Bot about 2 years ago

  • Crash signature (v1) updated (diff)
  • Crash signature (v2) updated (diff)

#22 Updated by Telemetry Bot about 2 years ago

  • Crash signature (v1) updated (diff)
  • Affected Versions v15.2.10, v15.2.11, v15.2.12, v15.2.13, v15.2.14, v15.2.15, v15.2.7, v15.2.8, v15.2.9 added

#23 Updated by Aishwarya Mathuria almost 2 years ago

  • Crash signature (v1) updated (diff)

/a/yuriw-2022-04-06_16:35:43-rados-wip-yuri5-testing-2022-04-05-1720-distro-default-smithi/6780002

2022-04-06T21:15:42.882 DEBUG:teuthology.orchestra.run.smithi057:> sudo adjust-ulimits ceph-coverage /home/ubuntu/cephtest/archive/coverage timeout 120 ceph --cluster ceph mgr dump --format=json-pretty
2022-04-06T21:15:42.889 INFO:tasks.ceph.mgr.y.smithi107.stderr:/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7fbc3e634700 time 2022-04-06T21:15:39.735575+0000
2022-04-06T21:15:42.890 INFO:tasks.ceph.mgr.y.smithi107.stderr:/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: 2992: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)
2022-04-06T21:15:42.890 INFO:tasks.ceph.mgr.y.smithi107.stderr: ceph version 17.0.0-11491-g37ef971d (37ef971db5d69256a78734330cbd85e2b14fd088) quincy (dev)
2022-04-06T21:15:42.890 INFO:tasks.ceph.mgr.y.smithi107.stderr: 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x152) [0x7fbc47e14604]
2022-04-06T21:15:42.890 INFO:tasks.ceph.mgr.y.smithi107.stderr: 2: /usr/lib64/ceph/libceph-common.so.2(+0x284825) [0x7fbc47e14825]
2022-04-06T21:15:42.891 INFO:tasks.ceph.mgr.y.smithi107.stderr: 3: (DaemonServer::got_service_map()+0xb2d) [0x55e9fd789b0d]
2022-04-06T21:15:42.891 INFO:tasks.ceph.mgr.y.smithi107.stderr: 4: (Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0xee) [0x55e9fd7bb83e]
2022-04-06T21:15:42.891 INFO:tasks.ceph.mgr.y.smithi107.stderr: 5: (Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x8c4) [0x55e9fd7be744]
2022-04-06T21:15:42.891 INFO:tasks.ceph.mgr.y.smithi107.stderr: 6: (MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xae) [0x55e9fd7c99de]
2022-04-06T21:15:42.891 INFO:tasks.ceph.mgr.y.smithi107.stderr: 7: (DispatchQueue::entry()+0x14fa) [0x7fbc4809c1fa]
2022-04-06T21:15:42.892 INFO:tasks.ceph.mgr.y.smithi107.stderr: 8: (DispatchQueue::DispatchThread::entry()+0x11) [0x7fbc48153eb1]
2022-04-06T21:15:42.892 INFO:tasks.ceph.mgr.y.smithi107.stderr: 9: /lib64/libpthread.so.0(+0x81cf) [0x7fbc46c291cf]
2022-04-06T21:15:42.892 INFO:tasks.ceph.mgr.y.smithi107.stderr: 10: clone()
2022-04-06T21:15:42.892 INFO:tasks.ceph.mgr.y.smithi107.stderr:*** Caught signal (Aborted) **
2022-04-06T21:15:42.893 INFO:tasks.ceph.mgr.y.smithi107.stderr: in thread 7fbc3e634700 thread_name:ms_dispatch
2022-04-06T21:15:42.893 INFO:tasks.ceph.mgr.y.smithi107.stderr:2022-04-06T21:15:39.734+0000 7fbc3e634700 -1 /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7fbc3e634700 time 2022-04-06T21:15:39.735575+0000
2022-04-06T21:15:42.893 INFO:tasks.ceph.mgr.y.smithi107.stderr:/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: 2992: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)

#24 Updated by Neha Ojha almost 2 years ago

  • Assignee set to Mykola Golub
  • Priority changed from Normal to High

Hi Mykola, it looks like this issue has not been resolved yet. We saw it during the latest LRC upgrade to quincy:

root@reesi001:~# ceph crash info 2022-04-18T17:29:40.498001Z_2b39c813-cae1-43d2-9185-5b7149f48e8d
{
    "assert_condition": "pending_service_map.epoch > service_map.epoch",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.1.0-206-g4fb951d2/rpm/el8/BUILD/ceph-17.1.0-206-g4fb951d2/src/mgr/DaemonServer.cc",
    "assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
    "assert_line": 2946,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.1.0-206-g4fb951d2/rpm/el8/BUILD/ceph-17.1.0-206-g4fb951d2/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7f1193216700 time 2022-04-18T17:29:40.493572+0000\n/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.1.0-206-g4fb951d2/rpm/el8/BUILD/ceph-17.1.0-206-g4fb951d2/src/mgr/DaemonServer.cc: 2946: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)\n",
    "assert_thread_name": "ms_dispatch",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f119ba28ce0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0) [0x7f119cc08082]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x283245) [0x7f119cc08245]",
        "(DaemonServer::got_service_map()+0xb2d) [0x561293f6f94d]",
        "(Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0xee) [0x561293fa18de]",
        "(Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x8c4) [0x561293fa47f4]",
        "(MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xae) [0x561293fafc1e]",
        "(DispatchQueue::entry()+0x14fa) [0x7f119ce8e3aa]",
        "(DispatchQueue::DispatchThread::entry()+0x11) [0x7f119cf44bd1]",
        "/lib64/libpthread.so.0(+0x81cf) [0x7f119ba1e1cf]",
        "clone()" 
    ],
    "ceph_version": "17.1.0-206-g4fb951d2",
    "crash_id": "2022-04-18T17:29:40.498001Z_2b39c813-cae1-43d2-9185-5b7149f48e8d",
    "entity_name": "mgr.reesi004.tplfrt",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-mgr",
    "stack_sig": "55ad203b14ba86896fb522b12a64c16ae6acefd0736d21d85ee042e9ff121474",
    "timestamp": "2022-04-18T17:29:40.498001Z",
    "utsname_hostname": "reesi004",
    "utsname_machine": "x86_64",
    "utsname_release": "5.4.0-66-generic",
    "utsname_sysname": "Linux",
    "utsname_version": "#74~18.04.2-Ubuntu SMP Fri Feb 5 11:17:31 UTC 2021" 
}

and https://tracker.ceph.com/issues/51835#note-23, which has some useful logs. Could you please take a look? In this case, it appears that the active mgr is hitting the assert just after becoming active.

2022-04-06T21:15:38.829+0000 7fbc3e634700  1 -- 172.21.15.107:0/38689 <== mon.1 v2:172.21.15.107:3300/0 11 ==== service_map(e18 4 svc) v1 ==== 1052+0+0 (secure 0 0 0) 0x55ea06cb5b00 con 0x55ea06b7a800
2022-04-06T21:15:38.829+0000 7fbc3e634700 10 mgr ms_dispatch2 active (starting) service_map(e18 4 svc) v1
2022-04-06T21:15:38.829+0000 7fbc3e634700 10 mgr ms_dispatch2 service_map(e18 4 svc) v1
2022-04-06T21:15:38.829+0000 7fbbf6cf0700  5 AuthRegistry(0x55ea069b7340) adding auth protocol
...
2022-04-06T21:15:39.733+0000 7fbc3e634700  1 -- 172.21.15.107:0/38689 <== mon.1 v2:172.21.15.107:3300/0 30 ==== service_map(e19 4 svc) v1 ==== 2812+0+0 (secure 0 0 0) 0x55ea06a2be00 con 0x55ea06b7a800
2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr ms_dispatch2 active service_map(e19 4 svc) v1
2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr ms_dispatch2 service_map(e19 4 svc) v1
2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr handle_service_map e19
2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr.server operator() got updated map e19
2022-04-06T21:15:39.734+0000 7fbc3e634700 -1 /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7fbc3e634700 time 2022-04-06T21:15:39.735575+0000
/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: 2992: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)

 ceph version 17.0.0-11491-g37ef971d (37ef971db5d69256a78734330cbd85e2b14fd088) quincy (dev)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x152) [0x7fbc47e14604]
 2: /usr/lib64/ceph/libceph-common.so.2(+0x284825) [0x7fbc47e14825]
 3: (DaemonServer::got_service_map()+0xb2d) [0x55e9fd789b0d]
 4: (Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0xee) [0x55e9fd7bb83e]
 5: (Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x8c4) [0x55e9fd7be744]
 6: (MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xae) [0x55e9fd7c99de]
 7: (DispatchQueue::entry()+0x14fa) [0x7fbc4809c1fa]
 8: (DispatchQueue::DispatchThread::entry()+0x11) [0x7fbc48153eb1]
 9: /lib64/libpthread.so.0(+0x81cf) [0x7fbc46c291cf]
 10: clone()

#25 Updated by Laura Flores almost 2 years ago

/a/yuriw-2022-04-18_21:23:05-rados-wip-yuri2-testing-2022-04-18-1150-distro-default-smithi/6795328
Description: rados/dashboard/{0-single-container-host debug/mgr mon_election/classic random-objectstore$/{bluestore-comp-lz4} tasks/dashboard}
Test failure: test_access_permissions (tasks.mgr.dashboard.test_cephfs.CephfsTest)

/a/yuriw-2022-04-18_21:23:05-rados-wip-yuri2-testing-2022-04-18-1150-distro-default-smithi/6795328/remote/smithi185/crash/posted/2022-04-18T22:05:55.427559Z_9db23c98-7234-4044-9e8c-6a51e12c5164/log

/BUILD/ceph-17.0.0-11670-g14f114d5/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7f0117ccd700 time 2022-04-18T22:05:55.425868+0000
/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11670-g14f114d5/rpm/el8/BUILD/ceph-17.0.0-11670-g14f114d5/src/mgr/DaemonServer.cc: 2992: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)

 ceph version 17.0.0-11670-g14f114d5 (14f114d5aa5304e8bd79f8addd90f60680cfce27) quincy (dev)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x152) [0x7f01214ad604]
 2: /usr/lib64/ceph/libceph-common.so.2(+0x284825) [0x7f01214ad825]
 3: (DaemonServer::got_service_map()+0xb2d) [0x563cec74db3d]
 4: (Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0xee) [0x563cec77f86e]
 5: (Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x8c4) [0x563cec782774]
 6: (MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xae) [0x563cec78da0e]
 7: (DispatchQueue::entry()+0x14fa) [0x7f01217351ea]
 8: (DispatchQueue::DispatchThread::entry()+0x11) [0x7f01217ecea1]
 9: /lib64/libpthread.so.0(+0x81cf) [0x7f01202c21cf]
 10: clone()

This teuthology job has some useful logs as well. Pasted below are relevant Tracebacks:

/a/yuriw-2022-04-18_21:23:05-rados-wip-yuri2-testing-2022-04-18-1150-distro-default-smithi/6795328/teuthology.log

2022-04-18T22:06:05.178 ERROR:teuthology.orchestra.daemon.state:Error while waiting for process to exit
Traceback (most recent call last):
  File "/home/teuthworker/src/git.ceph.com_git_teuthology_f0db28781c751636a2f4758c956e38df311ffefc/teuthology/orchestra/daemon/state.py", line 139, in stop
    run.wait([self.proc], timeout=timeout)
  File "/home/teuthworker/src/git.ceph.com_git_teuthology_f0db28781c751636a2f4758c956e38df311ffefc/teuthology/orchestra/run.py", line 479, in wait
    proc.wait()
  File "/home/teuthworker/src/git.ceph.com_git_teuthology_f0db28781c751636a2f4758c956e38df311ffefc/teuthology/orchestra/run.py", line 161, in wait
    self._raise_for_status()
  File "/home/teuthworker/src/git.ceph.com_git_teuthology_f0db28781c751636a2f4758c956e38df311ffefc/teuthology/orchestra/run.py", line 183, in _raise_for_status
    node=self.hostname, label=self.label
teuthology.exceptions.CommandFailedError: Command failed on smithi185 with status 1: 'sudo adjust-ulimits ceph-coverage /home/ubuntu/cephtest/archive/coverage daemon-helper kill ceph-mgr -f --cluster ceph -i y'

...

2022-04-18T22:11:56.425 INFO:tasks.cephfs_test_runner:ERROR: test_access_permissions (tasks.mgr.dashboard.test_cephfs.CephfsTest)
2022-04-18T22:11:56.425 INFO:tasks.cephfs_test_runner:----------------------------------------------------------------------
2022-04-18T22:11:56.425 INFO:tasks.cephfs_test_runner:Traceback (most recent call last):
2022-04-18T22:11:56.425 INFO:tasks.cephfs_test_runner:  File "/home/teuthworker/src/github.com_ceph_ceph-c_14f114d5aa5304e8bd79f8addd90f60680cfce27/qa/tasks/mgr/dashboard/helper.py", line 271, in setUp
2022-04-18T22:11:56.425 INFO:tasks.cephfs_test_runner:    self.wait_for_health_clear(self.TIMEOUT_HEALTH_CLEAR)
2022-04-18T22:11:56.425 INFO:tasks.cephfs_test_runner:  File "/home/teuthworker/src/github.com_ceph_ceph-c_14f114d5aa5304e8bd79f8addd90f60680cfce27/qa/tasks/ceph_test_case.py", line 175, in wait_for_health_clear
2022-04-18T22:11:56.426 INFO:tasks.cephfs_test_runner:    self.wait_until_true(is_clear, timeout)
2022-04-18T22:11:56.426 INFO:tasks.cephfs_test_runner:  File "/home/teuthworker/src/github.com_ceph_ceph-c_14f114d5aa5304e8bd79f8addd90f60680cfce27/qa/tasks/ceph_test_case.py", line 212, in wait_until_true
2022-04-18T22:11:56.426 INFO:tasks.cephfs_test_runner:    raise TestTimeoutError("Timed out after {0}s and {1} retries".format(elapsed, retry_count))
2022-04-18T22:11:56.426 INFO:tasks.cephfs_test_runner:tasks.ceph_test_case.TestTimeoutError: Timed out after 60s and 0 retries

/a/yuriw-2022-04-18_21:23:05-rados-wip-yuri2-testing-2022-04-18-1150-distro-default-smithi/6795328/remote/smithi073/log/ceph-mgr.x.log.gz

2022-04-18T22:07:27.997+0000 7f474c572700  0 [dashboard DEBUG auth] checking authorization...
2022-04-18T22:07:27.997+0000 7f474c572700  0 [dashboard ERROR exception] Internal Server Error
Traceback (most recent call last):
  File "/usr/share/ceph/mgr/dashboard/controllers/_base_controller.py", line 238, in get_client_version
    cherrypy.request.headers['Accept'])
  File "/usr/share/ceph/mgr/dashboard/controllers/_version.py", line 41, in from_mime_type
    return cls.from_string(cls.__MIME_TYPE_REGEX.match(mime_type).group(1))
AttributeError: 'NoneType' object has no attribute 'group'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/share/ceph/mgr/dashboard/services/exception.py", line 47, in dashboard_exception_handler
    return handler(*args, **kwargs)
  File "/usr/lib/python3.6/site-packages/cherrypy/_cpdispatch.py", line 54, in __call__
    return self.callable(*self.args, **self.kwargs)
  File "/usr/share/ceph/mgr/dashboard/controllers/_base_controller.py", line 260, in inner
    client_version = BaseController.get_client_version()
  File "/usr/share/ceph/mgr/dashboard/controllers/_base_controller.py", line 241, in get_client_version
    415, "Unable to find version in request header")
cherrypy._cperror.HTTPError: (415, 'Unable to find version in request header')

#26 Updated by Mykola Golub almost 2 years ago

  • Status changed from New to In Progress

Looking at one example [1].

The current mgr implementation, when processing a service map it received from a mon in DaemonServer::got_service_map [2], assumes two cases:

1) it is an initial service_map the mgr receives from a mon on activation;
2) it is the mgr own pending map it sent to a mon to commit.

The problem is that, as in the example [1], when activating, the mgr may receive several service_map versions sent by the previous active mgr, and the second map causes the assertion failure.

In the mgr crash log we see:

The mgr activation started:

 -4109> 2022-04-06T21:15:38.823+0000 7fbc3e634700  1 mgr handle_mgr_map Activating!
 -4108> 2022-04-06T21:15:38.824+0000 7fbc3e634700  1 mgr handle_mgr_map I am now activating

Then it receives service_map e18:

 -3789> 2022-04-06T21:15:38.829+0000 7fbc3e634700  1 -- 172.21.15.107:0/38689 --> [v2:172.21.15.107:3300/0,v1:172.21.15.107:6789/0] -- mon_subscribe({osdmap=26}) v3 -- 0x55ea06cf2b60 con 0x55ea06b7a800
 -3788> 2022-04-06T21:15:38.829+0000 7fbc3e634700  1 -- 172.21.15.107:0/38689 <== mon.1 v2:172.21.15.107:3300/0 11 ==== service_map(e18 4 svc) v1 ==== 1052+0+0 (secure 0 0 0) 0x55ea06cb5b00 con 0x55ea06b7a800

And uses it as the initial map:

 -3736> 2022-04-06T21:15:38.830+0000 7fbc3e634700 10 mgr handle_service_map e18
 -3735> 2022-04-06T21:15:38.830+0000 7fbc3e634700 10 mgr.server operator() got initial map e18

And later it receives service_map e19, which causes the assertion failure:

    -6> 2022-04-06T21:15:39.733+0000 7fbc3e634700  1 -- 172.21.15.107:0/38689 <== mon.1 v2:172.21.15.107:3300/0 30 ==== service_map(e19 4 svc) v1 ==== 2812+0+0 (secure 0 0 0) 0x55ea06a2be00 
con 0x55ea06b7a800
    -5> 2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr ms_dispatch2 active service_map(e19 4 svc) v1
    -4> 2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr ms_dispatch2 service_map(e19 4 svc) v1
    -3> 2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr handle_service_map e19
    -2> 2022-04-06T21:15:39.733+0000 7fbc3e634700 10 mgr.server operator() got updated map e19
    -1> 2022-04-06T21:15:39.734+0000 7fbc3e634700 -1 /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE
/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7fbc3e634700 time 2022-04-06T21:15:39.735575+0000
/home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-11491-g37ef971d/rpm/el8/BUILD/ceph-17.0.0-11491-g37ef971d/src/mgr/DaemonServer.cc: 2992: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)

And from the log of the other mgr (mgr.z) we can see that the maps were generated and sent by it just before it deactivated:

2022-04-06T21:15:36.738+0000 7f74324a8700 10 mgr.server operator() sending service_map e18
...
2022-04-06T21:15:38.738+0000 7f74324a8700 10 mgr.server operator() sending service_map e19
...
2022-04-06T21:15:38.822+0000 7f74605e9700  4 mgr handle_mgr_map received map epoch 20
2022-04-06T21:15:38.822+0000 7f74605e9700  4 mgr handle_mgr_map active in map: 0 active is 4681
2022-04-06T21:15:38.823+0000 7f74605e9700 -1 mgr handle_mgr_map I was active but no longer am

[1] /a/yuriw-2022-04-06_16:35:43-rados-wip-yuri5-testing-2022-04-05-1720-distro-default-smithi/6780002
[2] https://github.com/ceph/ceph/blob/8b7ee35c3a0a758b24cfdee68c1fa666d0a1d408/src/mgr/DaemonServer.cc#L2978

#27 Updated by Mykola Golub almost 2 years ago

  • Status changed from In Progress to Fix Under Review
  • Pull request ID set to 45984

#28 Updated by Laura Flores almost 2 years ago

/a/yuriw-2022-05-27_21:59:17-rados-wip-yuri-testing-2022-05-27-0934-distro-default-smithi/6851266

#29 Updated by Laura Flores almost 2 years ago

  • Backport set to pacific

/a/yuriw-2022-05-31_21:35:41-rados-wip-yuri2-testing-2022-05-31-1300-pacific-distro-default-smithi/6856512

#30 Updated by Mykola Golub almost 2 years ago

  • Status changed from Fix Under Review to Pending Backport

#31 Updated by Backport Bot almost 2 years ago

  • Copied to Backport #56053: pacific: mgr/DaemonServer.cc: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch) added

#32 Updated by Neha Ojha almost 2 years ago

  • Backport changed from pacific to pacific,quincy

Also, we need a quincy backport.

#33 Updated by Backport Bot almost 2 years ago

  • Copied to Backport #56096: quincy: mgr/DaemonServer.cc: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch) added

#34 Updated by Vikhyat Umrao almost 2 years ago

We have seen this issue in the Gibba cluster when upgrading from 17.2.0 to the 17.2.1 RC, because the quincy backport https://github.com/ceph/ceph/pull/46738 is still not merged into the quincy branch and is not part of the RC.

[root@gibba001 f9d4cf6a-edcf-11ec-a96a-3cecef3d8fb8]# ceph crash info 2022-06-17T01:46:09.396216Z_598785fc-c69d-4d7b-b315-3cec6e289181
{
    "assert_condition": "pending_service_map.epoch > service_map.epoch",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc",
    "assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
    "assert_line": 2946,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7fbc0ebef700 time 2022-06-17T01:46:09.394730+0000\n/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc: 2946: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)\n",
    "assert_thread_name": "ms_dispatch",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7fbc17404ce0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0) [0x7fbc185e4c32]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x283df5) [0x7fbc185e4df5]",
        "(DaemonServer::got_service_map()+0xb2d) [0x55eae853ceed]",
        "(Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0xee) [0x55eae856ee7e]",
        "(Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x8c4) [0x55eae8571d94]",
        "(MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xae) [0x55eae857d1be]",
        "(DispatchQueue::entry()+0x14fa) [0x7fbc1886b43a]",
        "(DispatchQueue::DispatchThread::entry()+0x11) [0x7fbc18922581]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7fbc173fa1ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0-436-gda36d2c9",
    "crash_id": "2022-06-17T01:46:09.396216Z_598785fc-c69d-4d7b-b315-3cec6e289181",
    "entity_name": "mgr.gibba008.tfggyq",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-mgr",
    "stack_sig": "5d711175d9ef3767a3e8f9de1a229853f45300f71d5d966e94bc9ffa6360673b",
    "timestamp": "2022-06-17T01:46:09.396216Z",
    "utsname_hostname": "gibba008",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-301.1.el8.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" 
}
[root@gibba001 f9d4cf6a-edcf-11ec-a96a-3cecef3d8fb8]# ceph crash info 2022-06-17T02:11:50.434785Z_e7b604fb-aaea-4cf2-87a9-231ee226a9ed
{
    "assert_condition": "pending_service_map.epoch > service_map.epoch",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc",
    "assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
    "assert_line": 2946,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7f6a6787c700 time 2022-06-17T02:11:50.433270+0000\n/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc: 2946: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)\n",
    "assert_thread_name": "ms_dispatch",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f6a6fe7ece0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0) [0x7f6a7105ec32]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x283df5) [0x7f6a7105edf5]",
        "(DaemonServer::got_service_map()+0xb2d) [0x56046a99deed]",
        "(Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0xee) [0x56046a9cfe7e]",
        "(Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x8c4) [0x56046a9d2d94]",
        "(MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xae) [0x56046a9de1be]",
        "(DispatchQueue::entry()+0x14fa) [0x7f6a712e543a]",
        "(DispatchQueue::DispatchThread::entry()+0x11) [0x7f6a7139c581]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f6a6fe741ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0-436-gda36d2c9",
    "crash_id": "2022-06-17T02:11:50.434785Z_e7b604fb-aaea-4cf2-87a9-231ee226a9ed",
    "entity_name": "mgr.gibba008.tfggyq",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-mgr",
    "stack_sig": "5d711175d9ef3767a3e8f9de1a229853f45300f71d5d966e94bc9ffa6360673b",
    "timestamp": "2022-06-17T02:11:50.434785Z",
    "utsname_hostname": "gibba008",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-301.1.el8.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" 
}
[root@gibba001 f9d4cf6a-edcf-11ec-a96a-3cecef3d8fb8]# ceph crash info 2022-06-17T07:34:07.419170Z_1529efe0-3620-4b36-b813-329f1225b04d
{
    "assert_condition": "pending_service_map.epoch > service_map.epoch",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc",
    "assert_func": "DaemonServer::got_service_map()::<lambda(const ServiceMap&)>",
    "assert_line": 2946,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc: In function 'DaemonServer::got_service_map()::<lambda(const ServiceMap&)>' thread 7f846ae19700 time 2022-06-17T07:34:07.417730+0000\n/home/jenkins-build/build/workspace/ceph-dev-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.0-436-gda36d2c9/rpm/el8/BUILD/ceph-17.2.0-436-gda36d2c9/src/mgr/DaemonServer.cc: 2946: FAILED ceph_assert(pending_service_map.epoch > service_map.epoch)\n",
    "assert_thread_name": "ms_dispatch",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f847341bce0]",
        "gsignal()",
        "abort()",
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b0) [0x7f84745fbc32]",
        "/usr/lib64/ceph/libceph-common.so.2(+0x283df5) [0x7f84745fbdf5]",
        "(DaemonServer::got_service_map()+0xb2d) [0x560a2d31aeed]",
        "(Mgr::handle_service_map(boost::intrusive_ptr<MServiceMap>)+0xee) [0x560a2d34ce7e]",
        "(Mgr::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x8c4) [0x560a2d34fd94]",
        "(MgrStandby::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xae) [0x560a2d35b1be]",
        "(DispatchQueue::entry()+0x14fa) [0x7f847488243a]",
        "(DispatchQueue::DispatchThread::entry()+0x11) [0x7f8474939581]",
        "/lib64/libpthread.so.0(+0x81ca) [0x7f84734111ca]",
        "clone()" 
    ],
    "ceph_version": "17.2.0-436-gda36d2c9",
    "crash_id": "2022-06-17T07:34:07.419170Z_1529efe0-3620-4b36-b813-329f1225b04d",
    "entity_name": "mgr.gibba006.enemnj",
    "os_id": "centos",
    "os_name": "CentOS Stream",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-mgr",
    "stack_sig": "5d711175d9ef3767a3e8f9de1a229853f45300f71d5d966e94bc9ffa6360673b",
    "timestamp": "2022-06-17T07:34:07.419170Z",
    "utsname_hostname": "gibba006",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-301.1.el8.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Tue Apr 13 16:24:22 UTC 2021" 
}

#35 Updated by Telemetry Bot almost 2 years ago

  • Crash signature (v1) updated (diff)
  • Affected Versions v15.2.16, v16.2.9, v17.0.0, v17.1.0, v17.2.0 added

#36 Updated by Telemetry Bot over 1 year ago

  • Crash signature (v1) updated (diff)
  • Affected Versions v17.2.1 added

#37 Updated by Backport Bot over 1 year ago

  • Tags set to backport_processed

#38 Updated by Konstantin Shalygin over 1 year ago

  • Status changed from Pending Backport to Resolved
  • Crash signature (v1) updated (diff)

#39 Updated by Konstantin Shalygin over 1 year ago

  • % Done changed from 0 to 100
  • Tags deleted (backport_processed)

Also available in: Atom PDF