Bug #48009
MDS crashes on locking assert in /build/ceph-14.2.11/src/mds/ScatterLock.h: In function 'void ScatterLock::set_xlock_snap_sync(MDSContext*)'
0%
Description
Twice this week, an MDS has crashed in this function. Failover occurred as normal, so disruption was brief. The issue seems to be correlated with load spikes. Currently 1 of our 3 hosts is running 14.2.12, and the other 2 are running 14.2.11. The issue has only occurred twice so far, both times in the week since updating 1 host to 14.2.12. The crashes were in mds.3 and mds.4 of our largest, highest-load filesystem. It contains home directories for ~800 students, so it has a lot of small files and small metadata ops.
root@mc-3015-422:~# ceph versions { "mon": { "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 2, "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 1 }, "mgr": { "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 2, "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 1 }, "osd": { "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 54, "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 27 }, "mds": { "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 26, "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 13 }, "rgw": { "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 1, "ceph version 14.2.8 (2d095e947a02261ce61424021bb43bd3022d35cb) nautilus (stable)": 1 }, "overall": { "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 84, "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 43, "ceph version 14.2.8 (2d095e947a02261ce61424021bb43bd3022d35cb) nautilus (stable)": 1 } } root@mc-3015-422:~# ceph crash info 2020-10-26_19:54:25.542486Z_bbd92410-c2fd-4edb-bcfc-ae8327918664 { "os_version_id": "18.04", "utsname_machine": "x86_64", "entity_name": "mds.dc-3558-422-G", "backtrace": [ "(()+0x128a0) [0x7f346dbe88a0]", "(gsignal()+0xc7) [0x7f346cce0f47]", "(abort()+0x141) [0x7f346cce28b1]", "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x7f346e2c86d3]", "(ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char const*, ...)+0) [0x7f346e2c885d]", "(MDCache::truncate_inode(CInode*, LogSegment*)+0x325) [0x5633dee47f35]", "(C_MDS_inode_update_finish::finish(int)+0xfa) [0x5633dee1b7fa]", "(MDSContext::complete(int)+0x73) [0x5633df006d33]", 
"(MDSIOContextBase::complete(int)+0x173) [0x5633df006fd3]", "(MDSLogContextBase::complete(int)+0x40) [0x5633df007230]", "(Finisher::finisher_thread_entry()+0x15e) [0x7f346e31313e]", "(()+0x76db) [0x7f346dbdd6db]", "(clone()+0x3f) [0x7f346cdc3a3f]" ], "assert_line": 85, "utsname_release": "5.4.0-48-generic", "assert_file": "/build/ceph-14.2.11/src/mds/ScatterLock.h", "utsname_sysname": "Linux", "os_version": "18.04.5 LTS (Bionic Beaver)", "os_id": "ubuntu", "assert_thread_name": "fn_anonymous", "assert_msg": "/build/ceph-14.2.11/src/mds/ScatterLock.h: In function 'void ScatterLock::set_xlock_snap_sync(MDSContext*)' thread 7f345ea01700 time 2020-10-26 15:54:25.537491\n/build/ceph-14.2.11/src/mds/ScatterLock.h: 85: FAILED ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE)\n", "assert_func": "void ScatterLock::set_xlock_snap_sync(MDSContext*)", "ceph_version": "14.2.11", "os_name": "Ubuntu", "timestamp": "2020-10-26 19:54:25.542486Z", "process_name": "ceph-mds", "archived": "2020-10-26 20:07:34.236612", "utsname_hostname": "dc-3558-422.cloud.cs.uwaterloo.ca", "crash_id": "2020-10-26_19:54:25.542486Z_bbd92410-c2fd-4edb-bcfc-ae8327918664", "assert_condition": "state == LOCK_XLOCK || state == LOCK_XLOCKDONE", "utsname_version": "#52~18.04.1-Ubuntu SMP Thu Sep 10 12:50:22 UTC 2020" }
History
#1 Updated by Patrick Donnelly over 2 years ago
- Status changed from New to Need More Info
Do you have any logs from the event? Has it been repeatable?
#2 Updated by Zheng Yan over 2 years ago
This looks like a duplicate of https://tracker.ceph.com/issues/46906
#3 Updated by Nathan Fish over 2 years ago
- File ceph-mds.dc-3558-422-G.log.2.gz added
Both issues involve ScatterLock, but this crash seems to be at a different line. Attached is a log of one event. The crash happened a third time after I filed this report.