Actions
Bug #47740
Closed — OSD crash when increasing pg_num
Status:
Duplicate
Priority:
Normal
Assignee:
-
Target version:
-
% Done:
0%
Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
Following #47738, after the pg_num was increased from 32 to 256, one of our OSDs crashed.
Here is the output of 'ceph crash info'
{
"archived": "2020-10-02 14:48:31.821600",
"backtrace": [
"(()+0x12dd0) [0x7f31df550dd0]",
"(()+0x118ed) [0x7f31df54f8ed]",
"(__pthread_mutex_lock()+0x124) [0x7f31df548bd4]",
"(BlueStore::SharedBlob::put()+0x70) [0x55841bd31b70]",
"(BlueStore::Blob::put()+0x65) [0x55841bdddc75]",
"(BlueStore::Extent::~Extent()+0x39) [0x55841bdddcc9]",
"(BlueStore::Onode::put()+0x2bb) [0x55841bd3291b]",
"(std::_Hashtable<ghobject_t, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, mempool::pool_allocator<(mempool::pool_index_t)4, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> > >, std::__detail::_Select1st, std::equal_to<ghobject_t>, std::hash<ghobject_t>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<true, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, true>*)+0x66) [0x55841bddf596]",
"(BlueStore::OnodeSpace::_remove(ghobject_t const&)+0x29b) [0x55841bd3419b]",
"(LruOnodeCacheShard::_trim_to(unsigned long)+0xf1) [0x55841bde46e1]",
"(BlueStore::OnodeSpace::add(ghobject_t const&, boost::intrusive_ptr<BlueStore::Onode>&)+0x49d) [0x55841bd348cd]",
"(BlueStore::Collection::get_onode(ghobject_t const&, bool, bool)+0x459) [0x55841bd41b79]",
"(BlueStore::_txc_add_transaction(BlueStore::TransContext*, ceph::os::Transaction*)+0x1124) [0x55841bda39a4]",
"(BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x407) [0x55841bda5f47]",
"(ObjectStore::queue_transaction(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, ceph::os::Transaction&&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x85) [0x55841b917745]",
"(OSD::dispatch_context(PeeringCtx&, PG*, std::shared_ptr<OSDMap const>, ThreadPool::TPHandle*)+0xf3) [0x55841b8b2ef3]",
"(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x2d8) [0x55841b8e1968]",
"(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x55841bb131c6]",
"(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x12ef) [0x55841b8d45df]",
"(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55841bf0d324]",
"(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55841bf0ff84]",
"(()+0x82de) [0x7f31df5462de]",
"(clone()+0x43) [0x7f31de27de83]"
],
"ceph_version": "15.2.5",
"crash_id": "2020-10-02T11:43:42.992341Z_248ea0f7-af3c-46b9-91e6-166aaa69f3bd",
"entity_name": "osd.2",
"os_id": "centos",
"os_name": "CentOS Linux",
"os_version": "8 (Core)",
"os_version_id": "8",
"process_name": "ceph-osd",
"stack_sig": "a94ab97e65f36fcb91a737cdd3e5733fe454df9d658c5d5183f0d434291dbd90",
"timestamp": "2020-10-02T11:43:42.992341Z",
"utsname_hostname": "169svr",
"utsname_machine": "x86_64",
"utsname_release": "4.15.0-112-generic",
"utsname_sysname": "Linux",
"utsname_version": "#113-Ubuntu SMP Thu Jul 9 23:41:39 UTC 2020"
}
This seems to be a deadlock. Here is a log entry from journalctl:
debug 2020-10-02T11:43:42.962+0000 7f31d855a700 1 heartbeat_map is_healthy 'OSD::osd_op_tp thread 0x7f31bdcb5700' had suicide timed out after 150
and a lot of
debug 2020-10-02T11:43:41.034+0000 7f31d855a700 1 osd.2 2999 is_healthy false -- internal heartbeat failed
debug 2020-10-02T11:43:41.034+0000 7f31d855a700 1 osd.2 2999 not healthy; waiting to boot
debug 2020-10-02T11:43:41.994+0000 7f31d855a700 1 heartbeat_map is_healthy 'OSD::osd_op_tp thread 0x7f31bdcb5700' had timed out after 15
Let me know if you need more information.
Files
Actions