Documentation #53607
open
No coredump is created when Ceph daemons crash
Added by Vikhyat Umrao over 2 years ago.
Updated over 2 years ago.
Description
No coredump is created when Ceph daemons crash
# ceph versions
{
"mon": {
"ceph version 17.0.0-9475-g8ea352e9 (8ea352e994feffca1bfd357a20c491df01db91a9) quincy (dev)": 5
},
"mgr": {
"ceph version 17.0.0-9475-g8ea352e9 (8ea352e994feffca1bfd357a20c491df01db91a9) quincy (dev)": 2
},
"osd": {
"ceph version 17.0.0-9475-g8ea352e9 (8ea352e994feffca1bfd357a20c491df01db91a9) quincy (dev)": 1061
},
"mds": {
"ceph version 17.0.0-9475-g8ea352e9 (8ea352e994feffca1bfd357a20c491df01db91a9) quincy (dev)": 2
},
"overall": {
"ceph version 17.0.0-9475-g8ea352e9 (8ea352e994feffca1bfd357a20c491df01db91a9) quincy (dev)": 1070
}
}
- Crash list
[root@gibba001 ~]# ceph crash ls | grep osd
2021-12-13T12:07:49.888443Z_0b56adca-66b7-47d9-b351-f60d507a514b osd.15 *
2021-12-13T12:10:41.871131Z_3e0a41b5-f86d-4636-a758-3204f4b8877e osd.549 *
2021-12-13T15:27:30.525590Z_b9772214-8f71-4033-9482-2439bdd15ec7 osd.176 *
- Login to gibba045
[root@gibba045 ~]# cd /var/lib/ceph/182eef00-53b5-11ec-84d3-3cecef3d8fb8/crash/posted
[root@gibba045 posted]# pwd
/var/lib/ceph/182eef00-53b5-11ec-84d3-3cecef3d8fb8/crash/posted
[root@gibba045 posted]# ls -ltr
total 12
drwx------. 2 167 167 4096 Dec 14 05:52 2021-12-13T12:10:41.871131Z_3e0a41b5-f86d-4636-a758-3204f4b8877e
drwx------. 2 167 167 4096 Dec 14 05:54 2021-12-13T15:27:30.525590Z_b9772214-8f71-4033-9482-2439bdd15ec7
drwx------. 2 167 167 4096 Dec 14 19:38 2021-12-13T12:07:49.888443Z_0b56adca-66b7-47d9-b351-f60d507a514b
[root@gibba045 posted]# ls -ltr *
'2021-12-13T12:10:41.871131Z_3e0a41b5-f86d-4636-a758-3204f4b8877e':
total 4
-rw-------. 1 167 167 1699 Dec 13 12:10 meta
-r--r--r--. 1 167 167 0 Dec 13 12:10 done
'2021-12-13T15:27:30.525590Z_b9772214-8f71-4033-9482-2439bdd15ec7':
total 2284
-rw-------. 1 167 167 3114 Dec 13 15:27 meta
-r--r--r--. 1 167 167 0 Dec 13 15:27 done
-rw-r--r--. 1 167 167 2334115 Dec 13 15:27 log
'2021-12-13T12:07:49.888443Z_0b56adca-66b7-47d9-b351-f60d507a514b':
total 2568
-rw-------. 1 167 167 3794 Dec 13 12:07 meta
-r--r--r--. 1 167 167 0 Dec 13 12:07 done
-rw-r--r--. 1 167 167 2621784 Dec 13 12:07 log
[root@gibba045 posted]#
Last time, the only thing I needed was `ulimit -S -c unlimited`.
The coredumps are managed through systemd-coredump.socket on RHEL-8/CENTOS-8.
I do see that coredumps are generated on gibba045, but these latest ceph-osd crashes (for osd.441 and osd.1065) are not reported to the crash module (we need another tracker for this):
[root@gibba045 ~]# coredumpctl info
PID: 2342351 (ceph-osd)
UID: 167 (167)
GID: 167 (167)
Signal: 6 (ABRT)
Timestamp: Wed 2021-12-15 00:46:32 UTC (2 days ago)
Command Line: /usr/bin/ceph-osd -n osd.441 -f --setuser ceph --setgroup ceph --default-log-to-file=false --default-log-to-journald=true --default-log-to-stderr=false
Executable: /usr/bin/ceph-osd
Control Group: /
Slice: -.slice
Boot ID: 87c3ae3237914e41b10e1c6f3df9bd90
Machine ID: 84f9d0aa3f2847afb53281430069ac75
Hostname: gibba045
Storage: /var/lib/systemd/coredump/core.ceph-osd.167.87c3ae3237914e41b10e1c6f3df9bd90.2342351.1639529192000000.lz4
Message: Process 2342351 (ceph-osd) of user 167 dumped core.
Stack trace of thread 465:
#0 0x00007f1d4c65a37f n/a (/usr/lib64/libc-2.28.so)
#1 0x0000000000000000 n/a (n/a)
PID: 2626476 (ceph-osd)
UID: 167 (167)
GID: 167 (167)
Signal: 11 (SEGV)
Timestamp: Thu 2021-12-16 22:17:45 UTC (5h 30min ago)
Command Line: /usr/bin/ceph-osd -n osd.1065 -f --setuser ceph --setgroup ceph --default-log-to-file=false --default-log-to-journald=true --default-log-to-stderr=false
Executable: /usr/bin/ceph-osd
Control Group: /system.slice/system-ceph\x2d182eef00\x2d53b5\x2d11ec\x2d84d3\x2d3cecef3d8fb8.slice/ceph-182eef00-53b5-11ec-84d3-3cecef3d8fb8@osd.1065.service/container
Unit: ceph-182eef00-53b5-11ec-84d3-3cecef3d8fb8@osd.1065.service
Slice: system-ceph\x2d182eef00\x2d53b5\x2d11ec\x2d84d3\x2d3cecef3d8fb8.slice
Boot ID: 87c3ae3237914e41b10e1c6f3df9bd90
Machine ID: 84f9d0aa3f2847afb53281430069ac75
Hostname: gibba045
Storage: /var/lib/systemd/coredump/core.ceph-osd.167.87c3ae3237914e41b10e1c6f3df9bd90.2626476.1639693065000000.lz4
Message: Process 2626476 (ceph-osd) of user 167 dumped core.
Stack trace of thread 7:
#0 0x00007f4a5ebdeabf n/a (/usr/lib64/libpthread-2.28.so)
#1 0x72632f687065632f n/a (n/a)
[root@gibba045 ~]# ls -ltr /var/lib/systemd/coredump/
total 336972
-rw-r-----. 1 root root 96619586 Dec 15 00:46 core.ceph-osd.167.87c3ae3237914e41b10e1c6f3df9bd90.2342351.1639529192000000.lz4
-rw-r-----. 1 root root 248424786 Dec 16 22:17 core.ceph-osd.167.87c3ae3237914e41b10e1c6f3df9bd90.2626476.1639693065000000.lz4
[root@gibba045 ~]# coredumpctl dump --output coredump.1065
PID: 2626476 (ceph-osd)
UID: 167 (167)
GID: 167 (167)
Signal: 11 (SEGV)
Timestamp: Thu 2021-12-16 22:17:45 UTC (5h 31min ago)
Command Line: /usr/bin/ceph-osd -n osd.1065 -f --setuser ceph --setgroup ceph --default-log-to-file=false --default-log-to-journald=true --default-log-to-stderr=false
Executable: /usr/bin/ceph-osd
Control Group: /system.slice/system-ceph\x2d182eef00\x2d53b5\x2d11ec\x2d84d3\x2d3cecef3d8fb8.slice/ceph-182eef00-53b5-11ec-84d3-3cecef3d8fb8@osd.1065.service/container
Unit: ceph-182eef00-53b5-11ec-84d3-3cecef3d8fb8@osd.1065.service
Slice: system-ceph\x2d182eef00\x2d53b5\x2d11ec\x2d84d3\x2d3cecef3d8fb8.slice
Boot ID: 87c3ae3237914e41b10e1c6f3df9bd90
Machine ID: 84f9d0aa3f2847afb53281430069ac75
Hostname: gibba045
Storage: /var/lib/systemd/coredump/core.ceph-osd.167.87c3ae3237914e41b10e1c6f3df9bd90.2626476.1639693065000000.lz4
Message: Process 2626476 (ceph-osd) of user 167 dumped core.
Stack trace of thread 7:
#0 0x00007f4a5ebdeabf n/a (/usr/lib64/libpthread-2.28.so)
#1 0x72632f687065632f n/a (n/a)
More than one entry matches, ignoring rest.
[root@gibba045 ~]# ls -ltr coredump.1065
-rw-r--r--. 1 root root 1810132992 Dec 17 03:49 coredump.1065
[root@gibba045 ~]# du -sh coredump.1065
1.7G coredump.1065
Additionally, coredumps will be truncated by default if the coredump file size is more than 2 GB. We need to edit the /etc/systemd/coredump.conf file and set ProcessSizeMax and ExternalSizeMax to a value larger than the core a Ceph daemon will generate, but we will need to wait for another Ceph daemon crash on the node to confirm.
- Tracker changed from Bug to Documentation
- Tags set to low-hanging-fruit
Also available in: Atom
PDF