Bug #48908
open
EC Pool OSD crashes
0%
45667d498bde59f350e2c1c48877e0090ab3311fa743344d4dec6f6a95ae7e66
6e13754025ea53a95e23b62cfe8d477a79c357876904371b0556b0d10a482f7a
Description
Recently, I have started seeing a lot of these OSD crashes:
{ "backtrace": [ "(()+0x12980) [0x7fb860bcf980]", "(gsignal()+0xc7) [0x7fb85f881fb7]", "(abort()+0x141) [0x7fb85f883921]", "(()+0x8c957) [0x7fb860276957]", "(()+0x92ae6) [0x7fb86027cae6]", "(()+0x92b21) [0x7fb86027cb21]", "(()+0x92d54) [0x7fb86027cd54]", "(ceph::buffer::v15_2_0::create_aligned_in_mempool(unsigned int, unsigned int, int)+0x250) [0x55c28cd519c0]", "(ceph::buffer::v15_2_0::create_aligned(unsigned int, unsigned int)+0x22) [0x55c28cd51aa2]", "(ceph::buffer::v15_2_0::create(unsigned int)+0x22) [0x55c28cd51ae2]", "(ceph::buffer::v15_2_0::list::obtain_contiguous_space(unsigned int)+0xab) [0x55c28cd51ceb]", "(ECSubReadReply::encode(ceph::buffer::v15_2_0::list&) const+0x10d) [0x55c28cdefb8d]", "(MOSDECSubOpReadReply::encode_payload(unsigned long)+0x61) [0x55c28c8c5301]", "(Message::encode(unsigned long, int, bool)+0x2a) [0x55c28ccccf8a]", "(ProtocolV2::prepare_send_message(unsigned long, Message*)+0x40) [0x55c28cf0b800]", "(ProtocolV2::send_message(Message*)+0xd1) [0x55c28cf0bb01]", "(AsyncConnection::send_message(Message*)+0x186) [0x55c28ceea5c6]", "(ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x430) [0x55c28c8c3b80]", "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x97) [0x55c28c71aff7]", "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x6fd) [0x55c28c6bdb6d]", "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x17b) [0x55c28c541dfb]", "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x67) [0x55c28c79f5a7]", "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x90c) [0x55c28c55f4fc]", "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x4ac) [0x55c28cbae0ec]", "(ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0x55c28cbb1340]", "(()+0x76db) [0x7fb860bc46db]", "(clone()+0x3f) [0x7fb85f96471f]" ], "ceph_version": "15.2.8", "crash_id": 
"2021-01-17T06:27:40.322963Z_8ae0d83a-0de0-4f15-92fb-fd9a315a7a72", "entity_name": "osd.217", "os_id": "ubuntu", "os_name": "Ubuntu", "os_version": "18.04.5 LTS (Bionic Beaver)", "os_version_id": "18.04", "process_name": "ceph-osd", "stack_sig": "6e13754025ea53a95e23b62cfe8d477a79c357876904371b0556b0d10a482f7a", "timestamp": "2021-01-17T06:27:40.322963Z", "utsname_hostname": "XXX", "utsname_machine": "x86_64", "utsname_release": "4.15.0-126-generic", "utsname_sysname": "Linux", "utsname_version": "#129-Ubuntu SMP Mon Nov 23 18:53:38 UTC 2020" }
I don't have much more information, unfortunately, but the crash journals are filling up with these dumps. Please tell me if there is more information that I could provide. The only information around the time of the crash I could find in the OSD's log file is this:
2021-01-17T06:09:11.842+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T06:09:11.842+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T06:19:11.842+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T06:19:11.842+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T06:29:11.842+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T06:29:11.842+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T06:39:11.846+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T06:39:11.846+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T06:49:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T06:49:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T06:59:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T06:59:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T07:09:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T07:09:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T07:19:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T07:19:11.847+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T07:29:11.848+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T07:29:11.848+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T07:39:11.848+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:777] ------- DUMPING STATS ------- 2021-01-17T07:39:11.848+0100 7fcd0409d700 4 rocksdb: [db/db_impl.cc:778] 2021-01-17T07:48:14.083+0100 7fcd0669c700 -1 *** Caught signal (Aborted) ** -9999> 2021-01-17T07:08:29.006+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:07:59.011252+0100) 
-9998> 2021-01-17T07:08:29.842+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3989864448 unmapped: 41828352 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9997> 2021-01-17T07:08:30.006+0100 7fcd072a6700 10 monclient: tick -9996> 2021-01-17T07:08:30.006+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:00.011419+0100) -9995> 2021-01-17T07:08:30.846+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3990355968 unmapped: 41336832 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9994> 2021-01-17T07:08:31.006+0100 7fcd072a6700 10 monclient: tick -9993> 2021-01-17T07:08:31.006+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:01.011548+0100) -9992> 2021-01-17T07:08:31.458+0100 7fcd1611e700 10 monclient: handle_auth_request added challenge on 0x555b4f126000 -9991> 2021-01-17T07:08:31.458+0100 7fcd1591d700 10 monclient: handle_auth_request added challenge on 0x555b4f126400 -9990> 2021-01-17T07:08:31.850+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3990388736 unmapped: 41304064 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9989> 2021-01-17T07:08:32.010+0100 7fcd072a6700 10 monclient: tick -9988> 2021-01-17T07:08:32.010+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:02.011700+0100) -9987> 2021-01-17T07:08:32.851+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3990585344 unmapped: 41107456 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9986> 2021-01-17T07:08:32.851+0100 7fcd0aea5700 5 bluestore.MempoolThread(0x555a78dc8a98) _resize_shards cache_size: 2845415832 kv_alloc: 1476395008 kv_used: 1475919488 meta_alloc: 1073741824 meta_used: 792737866 data_alloc: 268435456 data_used: 0 -9985> 2021-01-17T07:08:33.011+0100 7fcd072a6700 10 monclient: tick -9984> 
2021-01-17T07:08:33.011+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:03.011874+0100) -9983> 2021-01-17T07:08:33.631+0100 7fccf7884700 5 osd.1141 739484 heartbeat osd_stat(store_statfs(0x55f2a400000/0x2fdf00000/0x8e8ffc00000, data 0x366d360d569/0x386d78f0000, compress 0x0/0x0/0x0, omap 0x231143b, meta 0x2fbbeebc5), peers [4,31,36,46,47,71,89,96,104,105,123,137,141,148,150,151,156,201,218,243,249,254,290,298,301,332,341,347,366,387,429,430,431,445,454,468,505,508,516,532,553,594,597,611,642,647,657,659,665,667,671,677,686,703,704,713,715,734,741,744,748,761,768,783,790,794,797,806,821,839,878,881,888,895,899,910,917,919,927,932,934,940,947,953,969,974,976,985,993,994,1003,1004,1005,1017,1030,1035,1044,1065,1076,1091,1096,1100,1109,1125,1135,1140,1142,1144,1155,1162,1171,1183,1220,1226,1227,1235,1237,1243,1247] op hist []) -9982> 2021-01-17T07:08:33.855+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3990585344 unmapped: 41107456 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9981> 2021-01-17T07:08:34.011+0100 7fcd072a6700 10 monclient: tick -9980> 2021-01-17T07:08:34.011+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:04.011999+0100) -9979> 2021-01-17T07:08:34.855+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3990585344 unmapped: 41107456 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9978> 2021-01-17T07:08:35.011+0100 7fcd072a6700 10 monclient: tick -9977> 2021-01-17T07:08:35.011+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:05.012181+0100) -9976> 2021-01-17T07:08:35.859+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3990585344 unmapped: 41107456 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9975> 2021-01-17T07:08:36.011+0100 7fcd072a6700 10 monclient: tick -9974> 
2021-01-17T07:08:36.011+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:06.012392+0100) -9973> 2021-01-17T07:08:36.859+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3990650880 unmapped: 41041920 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9972> 2021-01-17T07:08:37.011+0100 7fcd072a6700 10 monclient: tick -9971> 2021-01-17T07:08:37.011+0100 7fcd072a6700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2021-01-17T07:08:07.012544+0100) -9970> 2021-01-17T07:08:37.863+0100 7fcd0aea5700 5 prioritycache tune_memory target: 4294967296 mapped: 3991216128 unmapped: 40476672 heap: 4031692800 old mem: 2845415832 new mem: 2845415832 -9969> 2021-01-17T07:08:37.863+0100 7fcd0aea5700 5 bluestore.MempoolThread(0x555a78dc8a98) _resize_shards cache_size: 2845415832 kv_alloc: 1476395008 kv_used: 1475923072 meta_alloc: 1073741824 meta_used: 792750888 data_alloc: 268435456 data_used: 0
There wasn't anything at exactly 06:27, and the monclient messages are from about an hour later.
I would increase the log level temporarily if I could somehow reproduce the crashes, but they just seem to happen randomly overnight. Perhaps it has to do with scheduled deep scrubs.
Updated by Telemetry Bot almost 3 years ago
- Crash signature (v1) updated (diff)
- Crash signature (v2) updated (diff)
- Affected Versions v15.2.9 added
Sanitized backtrace:
ceph::buffer::v15_2_0::create_aligned_in_mempool(unsigned int, unsigned int, int) ceph::buffer::v15_2_0::create_aligned(unsigned int, unsigned int) ceph::buffer::v15_2_0::create(unsigned int) ceph::buffer::v15_2_0::list::obtain_contiguous_space(unsigned int) ECSubReadReply::encode(ceph::buffer::v15_2_0::list&) const MOSDECSubOpReadReply::encode_payload(unsigned long) Message::encode(unsigned long, int, bool) ProtocolV2::prepare_send_message(unsigned long, Message*) ProtocolV2::send_message(Message*) AsyncConnection::send_message(Message*) ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>) PGBackend::handle_message(boost::intrusive_ptr<OpRequest>) PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&) OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&) ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&) OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*) ShardedThreadPool::shardedthreadpool_worker(unsigned int) ShardedThreadPool::WorkThreadSharded::entry() clone()
Crash dump sample:
{ "backtrace": [ "(()+0x12b20) [0x7f35ff0aeb20]", "(gsignal()+0x10f) [0x7f35fdd167ff]", "(abort()+0x127) [0x7f35fdd00c35]", "(()+0x9009b) [0x7f35fe6cc09b]", "(()+0x9653c) [0x7f35fe6d253c]", "(()+0x96597) [0x7f35fe6d2597]", "(()+0x967f8) [0x7f35fe6d27f8]", "(ceph::buffer::v15_2_0::create_aligned_in_mempool(unsigned int, unsigned int, int)+0x229) [0x56199119de49]", "(ceph::buffer::v15_2_0::create_aligned(unsigned int, unsigned int)+0x26) [0x56199119df46]", "(ceph::buffer::v15_2_0::create(unsigned int)+0x26) [0x56199119df96]", "(ceph::buffer::v15_2_0::list::obtain_contiguous_space(unsigned int)+0xab) [0x56199119e1ab]", "(ECSubReadReply::encode(ceph::buffer::v15_2_0::list&) const+0x10d) [0x561991238a2d]", "(MOSDECSubOpReadReply::encode_payload(unsigned long)+0x65) [0x561990d22f65]", "(Message::encode(unsigned long, int, bool)+0x2e) [0x56199111480e]", "(ProtocolV2::prepare_send_message(unsigned long, Message*)+0x44) [0x561991353d94]", "(ProtocolV2::send_message(Message*)+0x3ae) [0x5619913543fe]", "(AsyncConnection::send_message(Message*)+0x5ee) [0x56199133205e]", "(ECBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x4d9) [0x561990d21689]", "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x52) [0x561990b7aa22]", "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x5de) [0x561990b238ce]", "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x309) [0x5619909aa8e9]", "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x68) [0x561990c06c88]", "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x12ef) [0x5619909c7dcf]", "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x561991004ad4]", "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x561991007734]", "(()+0x814a) [0x7f35ff0a414a]", "(clone()+0x43) [0x7f35fdddbf23]" ], "ceph_version": "15.2.9", "crash_id": 
"2021-04-26T22:22:58.272125Z_9889fe81-a5a4-48ee-8a69-f9050e7b3fd6", "entity_name": "osd.d3dac2c5053d8975cfaa606a78218506db4f9d9c", "os_id": "centos", "os_name": "CentOS Linux", "os_version": "8", "os_version_id": "8", "process_name": "ceph-osd", "stack_sig": "6e13754025ea53a95e23b62cfe8d477a79c357876904371b0556b0d10a482f7a", "timestamp": "2021-04-26T22:22:58.272125Z", "utsname_machine": "x86_64", "utsname_release": "4.15.0-142-generic", "utsname_sysname": "Linux", "utsname_version": "#146-Ubuntu SMP Tue Apr 13 01:11:19 UTC 2021" }
Updated by Neha Ojha almost 3 years ago
- Status changed from New to Need More Info
Can you capture a coredump for the crash? This appears to be a memory leak of some sort — what is the memory/RSS usage like when this happens?