Project

General

Profile

Actions

Bug #48009

open

MDS crashes on locking assert in /build/ceph-14.2.11/src/mds/ScatterLock.h: In function 'void ScatterLock::set_xlock_snap_sync(MDSContext*)'

Added by Nathan Fish over 3 years ago. Updated over 3 years ago.

Status:
Need More Info
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
Community (user)
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(FS):
Labels (FS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

Twice this week, an MDS has crashed in this function. Failover occurred as normal, so disruption was brief. The issue seems to be correlated with load spikes. Currently 1 of our 3 hosts is running 14.2.12, with the other 2 running 14.2.11. The issue has only occurred twice so far, both times in the week since updating 1 host to 14.2.12. The crashes have been in mds.3 and mds.4 in our biggest, highest-load filesystem. It contains home directories for ~800 students, so it has a lot of small files and small metadata ops.

root@mc-3015-422:~# ceph versions
{
    "mon": {
        "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 2,
        "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 1
    },
    "mgr": {
        "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 2,
        "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 1
    },
    "osd": {
        "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 54,
        "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 27
    },
    "mds": {
        "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 26,
        "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 13
    },
    "rgw": {
        "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 1,
        "ceph version 14.2.8 (2d095e947a02261ce61424021bb43bd3022d35cb) nautilus (stable)": 1
    },
    "overall": {
        "ceph version 14.2.11 (f7fdb2f52131f54b891a2ec99d8205561242cdaf) nautilus (stable)": 84,
        "ceph version 14.2.12 (2f3caa3b8b3d5c5f2719a1e9d8e7deea5ae1a5c6) nautilus (stable)": 43,
        "ceph version 14.2.8 (2d095e947a02261ce61424021bb43bd3022d35cb) nautilus (stable)": 1
    }
}

root@mc-3015-422:~# ceph crash info 2020-10-26_19:54:25.542486Z_bbd92410-c2fd-4edb-bcfc-ae8327918664
{
    "os_version_id": "18.04", 
    "utsname_machine": "x86_64", 
    "entity_name": "mds.dc-3558-422-G", 
    "backtrace": [
        "(()+0x128a0) [0x7f346dbe88a0]", 
        "(gsignal()+0xc7) [0x7f346cce0f47]", 
        "(abort()+0x141) [0x7f346cce28b1]", 
        "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a3) [0x7f346e2c86d3]", 
        "(ceph::__ceph_assertf_fail(char const*, char const*, int, char const*, char const*, ...)+0) [0x7f346e2c885d]", 
        "(MDCache::truncate_inode(CInode*, LogSegment*)+0x325) [0x5633dee47f35]", 
        "(C_MDS_inode_update_finish::finish(int)+0xfa) [0x5633dee1b7fa]", 
        "(MDSContext::complete(int)+0x73) [0x5633df006d33]", 
        "(MDSIOContextBase::complete(int)+0x173) [0x5633df006fd3]", 
        "(MDSLogContextBase::complete(int)+0x40) [0x5633df007230]", 
        "(Finisher::finisher_thread_entry()+0x15e) [0x7f346e31313e]", 
        "(()+0x76db) [0x7f346dbdd6db]", 
        "(clone()+0x3f) [0x7f346cdc3a3f]" 
    ], 
    "assert_line": 85, 
    "utsname_release": "5.4.0-48-generic", 
    "assert_file": "/build/ceph-14.2.11/src/mds/ScatterLock.h", 
    "utsname_sysname": "Linux", 
    "os_version": "18.04.5 LTS (Bionic Beaver)", 
    "os_id": "ubuntu", 
    "assert_thread_name": "fn_anonymous", 
    "assert_msg": "/build/ceph-14.2.11/src/mds/ScatterLock.h: In function 'void ScatterLock::set_xlock_snap_sync(MDSContext*)' thread 7f345ea01700 time 2020-10-26 15:54:25.537491\n/build/ceph-14.2.11/src/mds/ScatterLock.h: 85: FAILED ceph_assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE)\n", 
    "assert_func": "void ScatterLock::set_xlock_snap_sync(MDSContext*)", 
    "ceph_version": "14.2.11", 
    "os_name": "Ubuntu", 
    "timestamp": "2020-10-26 19:54:25.542486Z", 
    "process_name": "ceph-mds", 
    "archived": "2020-10-26 20:07:34.236612", 
    "utsname_hostname": "dc-3558-422.cloud.cs.uwaterloo.ca", 
    "crash_id": "2020-10-26_19:54:25.542486Z_bbd92410-c2fd-4edb-bcfc-ae8327918664", 
    "assert_condition": "state == LOCK_XLOCK || state == LOCK_XLOCKDONE", 
    "utsname_version": "#52~18.04.1-Ubuntu SMP Thu Sep 10 12:50:22 UTC 2020" 
}


Files

ceph-mds.dc-3558-422-G.log.2.gz (105 KB) ceph-mds.dc-3558-422-G.log.2.gz Nathan Fish, 10/29/2020 07:33 PM
Actions #1

Updated by Patrick Donnelly over 3 years ago

  • Status changed from New to Need More Info

Do you have any logs from the event? Has it been repeatable?

Actions #2

Updated by Zheng Yan over 3 years ago

Actions #3

Updated by Nathan Fish over 3 years ago

Both issues involve ScatterLock, but the crash seems to be at a different line. Here is a log of one event. This happened a third time after I reported this.

Actions

Also available in: Atom PDF