Bug #55858
Open
Pacific 16.2.7 MDS constantly crashing
0%
Description
Jun 3 23:21:23 r07s05 bash[1415068]: debug -1> 2022-06-03T23:21:23.148+0000 7f6b0f1e1700 -1 /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.7/rpm/el8/BUILD/ceph-16.2.7 /src/mds/Locker.cc: In function 'bool Locker::check_inode_max_size(CInode*, bool, uint64_t, uint64_t, utime_t)' thread 7f6b0f1e1700 time 2 022-06-03T23:21:23.147265+0000 Jun 3 23:21:23 r07s05 bash[1415068]: /home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/16.2.7/rpm/el8/BUILD/ceph-16.2.7/src/mds/Locker.cc: 2787: FAILED ceph_assert(in->is_auth()) Jun 3 23:21:23 r07s05 bash[1415068]: ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable) Jun 3 23:21:23 r07s05 bash[1415068]: 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x158) [0x7f6b17c0ab52] Jun 3 23:21:23 r07s05 bash[1415068]: 2: /usr/lib64/ceph/libceph-common.so.2(+0x276d6c) [0x7f6b17c0ad6c] Jun 3 23:21:23 r07s05 bash[1415068]: 3: (Locker::check_inode_max_size(CInode*, bool, unsigned long, unsigned long, utime_t)+0x1aab) [0x55bd76f9298b] Jun 3 23:21:23 r07s05 bash[1415068]: 4: (Server::handle_client_open(boost::intrusive_ptr<MDRequestImpl>&)+0x111c) [0x55bd76e2130c] Jun 3 23:21:23 r07s05 bash[1415068]: 5: (Server::handle_client_openc(boost::intrusive_ptr<MDRequestImpl>&)+0x4ab) [0x55bd76e220cb] Jun 3 23:21:23 r07s05 bash[1415068]: 6: (Server::dispatch_client_request(boost::intrusive_ptr<MDRequestImpl>&)+0xf3c) [0x55bd76e50e8c] Jun 3 23:21:23 r07s05 bash[1415068]: 7: (MDCache::dispatch_request(boost::intrusive_ptr<MDRequestImpl>&)+0x33) [0x55bd76f07193] Jun 3 23:21:23 r07s05 bash[1415068]: 8: (MDSContext::complete(int)+0x56) [0x55bd770c3c06] Jun 3 23:21:23 r07s05 bash[1415068]: 9: (MDSCacheObject::finish_waiting(unsigned long, int)+0xce) [0x55bd770e5cae] Jun 3 23:21:23 r07s05 bash[1415068]: 10: 
(Locker::eval_gather(SimpleLock*, bool, bool*, std::vector<MDSContext*, std::allocator<MDSContext*> >*)+0x13d6) [0x55bd76f97d66] Jun 3 23:21:23 r07s05 bash[1415068]: 11: (Locker::handle_file_lock(ScatterLock*, boost::intrusive_ptr<MLock const> const&)+0xed1) [0x55bd76fa5dd1] Jun 3 23:21:23 r07s05 bash[1415068]: 12: (Locker::handle_lock(boost::intrusive_ptr<MLock const> const&)+0x1b3) [0x55bd76fa6943] Jun 3 23:21:23 r07s05 bash[1415068]: 13: (Locker::dispatch(boost::intrusive_ptr<Message const> const&)+0xb4) [0x55bd76faab74] Jun 3 23:21:23 r07s05 bash[1415068]: 14: (MDSRank::handle_message(boost::intrusive_ptr<Message const> const&)+0xbcc) [0x55bd76dc0a2c] Jun 3 23:21:23 r07s05 bash[1415068]: 15: (MDSRank::_dispatch(boost::intrusive_ptr<Message const> const&, bool)+0x7bb) [0x55bd76dc33cb] Jun 3 23:21:23 r07s05 bash[1415068]: 16: (MDSRankDispatcher::ms_dispatch(boost::intrusive_ptr<Message const> const&)+0x55) [0x55bd76dc39c5] Jun 3 23:21:23 r07s05 bash[1415068]: 17: (MDSDaemon::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x108) [0x55bd76db35d8] Jun 3 23:21:23 r07s05 bash[1415068]: 18: (DispatchQueue::entry()+0x126a) [0x7f6b17e4eaba] Jun 3 23:21:23 r07s05 bash[1415068]: 19: (DispatchQueue::DispatchThread::entry()+0x11) [0x7f6b17f005d1] Jun 3 23:21:23 r07s05 bash[1415068]: 20: /lib64/libpthread.so.0(+0x81cf) [0x7f6b16bee1cf] Jun 3 23:21:23 r07s05 bash[1415068]: 21: clone() Jun 3 23:21:23 r07s05 bash[1415068]: debug 0> 2022-06-03T23:21:23.148+0000 7f6b0f1e1700 -1 *** Caught signal (Aborted) ** Jun 3 23:21:23 r07s05 bash[1415068]: in thread 7f6b0f1e1700 thread_name:ms_dispatch Jun 3 23:21:23 r07s05 bash[1415068]: ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable) Jun 3 23:21:23 r07s05 bash[1415068]: 1: /lib64/libpthread.so.0(+0x12ce0) [0x7f6b16bf8ce0] Jun 3 23:21:23 r07s05 bash[1415068]: 2: gsignal() Jun 3 23:21:23 r07s05 bash[1415068]: 3: abort() Jun 3 23:21:23 r07s05 bash[1415068]: 4: (ceph::__ceph_assert_fail(char const*, char 
const*, int, char const*)+0x1a9) [0x7f6b17c0aba3] Jun 3 23:21:23 r07s05 bash[1415068]: 5: /usr/lib64/ceph/libceph-common.so.2(+0x276d6c) [0x7f6b17c0ad6c] Jun 3 23:21:23 r07s05 bash[1415068]: 6: (Locker::check_inode_max_size(CInode*, bool, unsigned long, unsigned long, utime_t)+0x1aab) [0x55bd76f9298b] Jun 3 23:21:23 r07s05 bash[1415068]: 7: (Server::handle_client_open(boost::intrusive_ptr<MDRequestImpl>&)+0x111c) [0x55bd76e2130c] Jun 3 23:21:23 r07s05 bash[1415068]: 8: (Server::handle_client_openc(boost::intrusive_ptr<MDRequestImpl>&)+0x4ab) [0x55bd76e220cb] Jun 3 23:21:23 r07s05 bash[1415068]: 9: (Server::dispatch_client_request(boost::intrusive_ptr<MDRequestImpl>&)+0xf3c) [0x55bd76e50e8c] Jun 3 23:21:23 r07s05 bash[1415068]: 10: (MDCache::dispatch_request(boost::intrusive_ptr<MDRequestImpl>&)+0x33) [0x55bd76f07193] Jun 3 23:21:23 r07s05 bash[1415068]: 11: (MDSContext::complete(int)+0x56) [0x55bd770c3c06] Jun 3 23:21:23 r07s05 bash[1415068]: 12: (MDSCacheObject::finish_waiting(unsigned long, int)+0xce) [0x55bd770e5cae] Jun 3 23:21:23 r07s05 bash[1415068]: 13: (Locker::eval_gather(SimpleLock*, bool, bool*, std::vector<MDSContext*, std::allocator<MDSContext*> >*)+0x13d6) [0x55bd76f97d66] Jun 3 23:21:23 r07s05 bash[1415068]: 14: (Locker::handle_file_lock(ScatterLock*, boost::intrusive_ptr<MLock const> const&)+0xed1) [0x55bd76fa5dd1] Jun 3 23:21:23 r07s05 bash[1415068]: 15: (Locker::handle_lock(boost::intrusive_ptr<MLock const> const&)+0x1b3) [0x55bd76fa6943] Jun 3 23:21:23 r07s05 bash[1415068]: 16: (Locker::dispatch(boost::intrusive_ptr<Message const> const&)+0xb4) [0x55bd76faab74] Jun 3 23:21:23 r07s05 bash[1415068]: 17: (MDSRank::handle_message(boost::intrusive_ptr<Message const> const&)+0xbcc) [0x55bd76dc0a2c] Jun 3 23:21:23 r07s05 bash[1415068]: 18: (MDSRank::_dispatch(boost::intrusive_ptr<Message const> const&, bool)+0x7bb) [0x55bd76dc33cb] Jun 3 23:21:23 r07s05 bash[1415068]: 19: (MDSRankDispatcher::ms_dispatch(boost::intrusive_ptr<Message const> const&)+0x55) 
[0x55bd76dc39c5] Jun 3 23:21:23 r07s05 bash[1415068]: 20: (MDSDaemon::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x108) [0x55bd76db35d8] Jun 3 23:21:23 r07s05 bash[1415068]: 21: (DispatchQueue::entry()+0x126a) [0x7f6b17e4eaba] Jun 3 23:21:23 r07s05 bash[1415068]: 22: (DispatchQueue::DispatchThread::entry()+0x11) [0x7f6b17f005d1] Jun 3 23:21:23 r07s05 bash[1415068]: 23: /lib64/libpthread.so.0(+0x81cf) [0x7f6b16bee1cf] Jun 3 23:21:23 r07s05 bash[1415068]: 24: clone()
Updated by Venky Shankar almost 2 years ago
- Status changed from New to Triaged
- Assignee set to Kotresh Hiremath Ravishankar
- Target version set to v18.0.0
- Backport set to quincy, pacific
- Labels (FS) crash added
Updated by Mike Lowe almost 2 years ago
I've identified the problematic clients as kernel client 5.18.0. Once the auth was removed for these clients, the MDSes were able to stay running long enough to recover.
Updated by Kotresh Hiremath Ravishankar over 1 year ago
Hi Mike,
We would need more information on this to proceed further.
1. Output of 'ceph fs dump' ?
2. Was multi-mds configured when this crash is seen ?
3. mds logs when this crash is seen. If the issue is reproducible, could you please enable mds debug logs and share it ?
4. What is the workload on the cluster ?
Thanks,
Kotresh HR
Updated by Kotresh Hiremath Ravishankar over 1 year ago
- Status changed from Triaged to Need More Info
Updated by Mike Lowe over 1 year ago
I've noticed a commonality when this is being triggered: Singularity is being used (https://en.wikipedia.org/wiki/Singularity_(software)).
1.
enable_multiple, ever_enabled_multiple: 1,1
default compat: compat={},rocompat={},incompat={1=base v0.20,2=client writeable ranges,3=default file layouts on dirs,4=dir inode in separate object,5=mds uses versioned encoding,6=dirfrag is stored in omap,8=no anchor table,9=file layout
v2,10=snaprealm v2}
legacy client fscid: 3
Filesystem 'cephfs' (3)
fs_name cephfs
epoch 385251
flags 12
created 2021-09-21T19:34:47.717174+0000
modified 2022-08-02T13:58:29.118779+0000
tableserver 0
root 0
session_timeout 60
session_autoclose 300
max_file_size 1099511627776
required_client_features {}
last_failure 0
last_failure_osd_epoch 421512
compat compat={},rocompat={},incompat={1=base v0.20,2=client writeable ranges,3=default file layouts on dirs,4=dir inode in separate object,5=mds uses versioned encoding,6=dirfrag is stored in omap,7=mds uses inline data,8=no anchor tabl
e,9=file layout v2,10=snaprealm v2}
max_mds 8
in 0,1,2,3,4,5,6,7
up {0=3115345405,1=2537486907,2=2537466589,3=2295815428,4=2537462109,5=2537490587,6=3081320000,7=2524704559}
failed
damaged
stopped
data_pools [14,15]
metadata_pool 13
inline_data disabled
balancer
standby_count_wanted 1
[mds.fs_name.r07s07.pggvts{0:3115345405} state up:active seq 18700 addr [v2:xxx.xxx.xxx.12:6928/2179125734,v1:xxx.xxx.xxx.12:6929/2179125734] compat {c=[1],r=[1],i=[7ff]}]
[mds.fs_name.r07s08.rgbvub{1:2537486907} state up:active seq 18877 addr [v2:xxx.xxx.xxx.13:6928/3964695445,v1:xxx.xxx.xxx.13:6929/3964695445] compat {c=[1],r=[1],i=[7ff]}]
[mds.fs_name.r07s06.bwendz{2:2537466589} state up:active seq 1854 addr [v2:xxx.xxx.xxx.11:6928/3205070210,v1:xxx.xxx.xxx.11:6929/3205070210] compat {c=[1],r=[1],i=[7ff]}]
[mds.fs_name.r07s03.xkwnse{3:2295815428} state up:active seq 1267828 addr [v2:xxx.xxx.xxx.8:6800/2513498868,v1:xxx.xxx.xxx.8:6801/2513498868] compat {c=[1],r=[1],i=[7ff]}]
[mds.fs_name.r07s04.jebrjh{4:2537462109} state up:active seq 12596 addr [v2:xxx.xxx.xxx.9:6800/909935435,v1:xxx.xxx.xxx.9:6801/909935435] compat {c=[1],r=[1],i=[7ff]}]
[mds.fs_name.r07s09.vxhfas{5:2537490587} state up:active seq 4937 addr [v2:xxx.xxx.xxx.14:6928/904839211,v1:xxx.xxx.xxx.14:6929/904839211] compat {c=[1],r=[1],i=[7ff]}]
[mds.fs_name.r07s05.rkzfgs{6:3081320000} state up:active seq 81771 addr [v2:xxx.xxx.xxx.10:6800/3613797273,v1:xxx.xxx.xxx.10:6801/3613797273] compat {c=[1],r=[1],i=[7ff]}]
[mds.fs_name.r07s01.cbombv{7:2524704559} state up:active seq 67493 addr [v2:xxx.xxx.xxx.6:6800/1780779886,v1:xxx.xxx.xxx.6:6801/1780779886] compat {c=[1],r=[1],i=[7ff]}]
Standby daemons:
[mds.fs_name.r07s02.zrtfpl{-1:3116010666} state up:standby seq 1 addr [v2:xxx.xxx.xxx.7:6800/3247043630,v1:xxx.xxx.xxx.7:6801/3247043630] compat {c=[1],r=[1],i=[7ff]}]
dumped fsmap epoch 385251
2. Yes
3. It is not easily reproducible
4. Various scientific applications
Updated by Skylar Kelty 7 months ago
We've been experiencing this off and on for over a year. We cannot reproduce it on demand, though Singularity has often been suggested within the team as a potential cause.
https://github.com/kcl-eresearch/ceph/commit/f981a46ffcf3e75910cfaeb493b9efe2dcb3de4e
This technically solves the issue, but is obviously a cheap hack to work around it. Perhaps someone can glean something useful from it?
{ "archived": "2023-09-13 15:05:52.170840", "assert_condition": "in->is_auth()", "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.6/rpm/el8/BUILD/ceph-17.2.6/src/mds/Locker.cc", "assert_func": "bool Locker::check_inode_max_size(CInode*, bool, uint64_t, uint64_t, utime_t)", "assert_line": 2812, "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.6/rpm/el8/BUILD/ceph-17.2.6/src/mds/Locker.cc: In function 'bool Locker::check_inode_max_size(CInode*, bool, uint64_t, uint64_t, utime_t)' thread 7f19eb386700 time 2023-09-13T14:59:30.250323+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.2.6/rpm/el8/BUILD/ceph-17.2.6/src/mds/Locker.cc: 2812: FAILED ceph_assert(in->is_auth())\n", "assert_thread_name": "ms_dispatch", "backtrace": [ "/lib64/libpthread.so.0(+0x12cf0) [0x7f19f2fb7cf0]", "gsignal()", "abort()", "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x18f) [0x7f19f3fbf4e3]", "/usr/lib64/ceph/libceph-common.so.2(+0x26a64f) [0x7f19f3fbf64f]", "(Locker::check_inode_max_size(CInode*, bool, unsigned long, unsigned long, utime_t)+0x13d2) [0x563f917bca62]", "(Server::handle_client_open(boost::intrusive_ptr<MDRequestImpl>&)+0xe51) [0x563f91649171]", "(Server::handle_client_openc(boost::intrusive_ptr<MDRequestImpl>&)+0x3df) [0x563f91649d6f]", "(MDSContext::complete(int)+0x5f) [0x563f9190afbf]", "(C_MDC_OpenRemoteDentry::finish(int)+0x3e) [0x563f91786fde]", "(MDSContext::complete(int)+0x5f) [0x563f9190afbf]", "(void finish_contexts<std::vector<MDSContext*, std::allocator<MDSContext*> > >(ceph::common::CephContext*, std::vector<MDSContext*, std::allocator<MDSContext*> >&, int)+0x8d) [0x563f915b583d]", 
"(MDCache::open_ino_finish(inodeno_t, MDCache::open_ino_info_t&, int)+0x138) [0x563f91733448]", "(MDCache::_open_ino_traverse_dir(inodeno_t, MDCache::open_ino_info_t&, int)+0xbb) [0x563f9173379b]", "(MDSContext::complete(int)+0x5f) [0x563f9190afbf]", "(MDSRank::_advance_queues()+0xaa) [0x563f915c39fa]", "(MDSRank::_dispatch(boost::intrusive_ptr<Message const> const&, bool)+0x1d8) [0x563f915c4528]", "(MDSRankDispatcher::ms_dispatch(boost::intrusive_ptr<Message const> const&)+0x5c) [0x563f915c4f6c]", "(MDSDaemon::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x1bf) [0x563f915ae5cf]", "(Messenger::ms_deliver_dispatch(boost::intrusive_ptr<Message> const&)+0x478) [0x7f19f423b0e8]", "(DispatchQueue::entry()+0x50f) [0x7f19f423852f]", "(DispatchQueue::DispatchThread::entry()+0x11) [0x7f19f43013b1]", "/lib64/libpthread.so.0(+0x81ca) [0x7f19f2fad1ca]", "clone()" ], "ceph_version": "17.2.6", "crash_id": "2023-09-13T14:59:30.256754Z_5cefab1e-b32a-40dc-8ddd-f2c6ede2ffbd", "entity_name": "------MDS------", "os_id": "centos", "os_name": "CentOS Stream", "os_version": "8", "os_version_id": "8", "process_name": "ceph-mds", "stack_sig": "1d13f802118940700ce067135efc86cdc1e814f1e4e15965da62827d37a9f449", "timestamp": "2023-09-13T14:59:30.256754Z", "utsname_hostname": "------", "utsname_machine": "x86_64", "utsname_release": "5.15.0-71-generic", "utsname_sysname": "Linux", "utsname_version": "#78~20.04.1-Ubuntu SMP Wed Apr 19 11:26:48 UTC 2023" }