Bug #54701
crash: void Server::set_trace_dist(ceph::ref_t<MClientReply>&, CInode*, CDentry*, MDRequestRef&): assert(dnl->get_inode() == in)
0%
315be2739d1099ec5d061c7f2381a804c16cf2d8c7c4913344d86ed939cd8704
7278758a86083041e8b61edeaffcf9cdde1003a6f303b704eabf2b009c3814ce
Description
Assert condition: dnl->get_inode() == in
Assert function: void Server::set_trace_dist(ceph::ref_t<MClientReply>&, CInode*, CDentry*, MDRequestRef&)
Sanitized backtrace:
Server::set_trace_dist(boost::intrusive_ptr<MClientReply> const&, CInode*, CDentry*, boost::intrusive_ptr<MDRequestImpl>&)
Server::reply_client_request(boost::intrusive_ptr<MDRequestImpl>&, boost::intrusive_ptr<MClientReply> const&)
Server::respond_to_request(boost::intrusive_ptr<MDRequestImpl>&, int)
Batch_Getattr_Lookup::_respond(int)
BatchOp::respond(int)
Server::respond_to_request(boost::intrusive_ptr<MDRequestImpl>&, int)
Server::rdlock_path_pin_ref(boost::intrusive_ptr<MDRequestImpl>&, bool, bool)
Server::handle_client_getattr(boost::intrusive_ptr<MDRequestImpl>&, bool)
Server::dispatch_client_request(boost::intrusive_ptr<MDRequestImpl>&)
MDCache::dispatch_request(boost::intrusive_ptr<MDRequestImpl>&)
MDSContext::complete(int)
MDSCacheObject::finish_waiting(unsigned long, int)
Locker::eval_gather(SimpleLock*, bool, bool*, std::vector<MDSContext*, std::allocator<MDSContext*> >*)
CDentry::remove_client_lease(ClientLease*, Locker*)
Locker::handle_client_lease(boost::intrusive_ptr<MClientLease const> const&)
Locker::dispatch(boost::intrusive_ptr<Message const> const&)
MDSRank::handle_message(boost::intrusive_ptr<Message const> const&)
MDSRank::_dispatch(boost::intrusive_ptr<Message const> const&, bool)
MDSRankDispatcher::ms_dispatch(boost::intrusive_ptr<Message const> const&)
MDSDaemon::ms_dispatch2(boost::intrusive_ptr<Message> const&)
DispatchQueue::entry()
DispatchQueue::DispatchThread::entry()
Crash dump sample:
{
"assert_condition": "dnl->get_inode() == in",
"assert_file": "mds/Server.cc",
"assert_func": "void Server::set_trace_dist(ceph::ref_t<MClientReply>&, CInode*, CDentry*, MDRequestRef&)",
"assert_line": 2300,
"assert_msg": "mds/Server.cc: In function 'void Server::set_trace_dist(ceph::ref_t<MClientReply>&, CInode*, CDentry*, MDRequestRef&)' thread 7f8abaa71700 time 2022-01-19T16:07:07.299361-0600\nmds/Server.cc: 2300: FAILED ceph_assert(dnl->get_inode() == in)",
"assert_thread_name": "ms_dispatch",
"backtrace": [
"/lib64/libpthread.so.0(+0x12c20) [0x7f8ac33f3c20]",
"gsignal()",
"abort()",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1a9) [0x7f8ac4405ba3]",
"/usr/lib64/ceph/libceph-common.so.2(+0x276d6c) [0x7f8ac4405d6c]",
"(Server::set_trace_dist(boost::intrusive_ptr<MClientReply> const&, CInode*, CDentry*, boost::intrusive_ptr<MDRequestImpl>&)+0x1089) [0x55bdb627d949]",
"(Server::reply_client_request(boost::intrusive_ptr<MDRequestImpl>&, boost::intrusive_ptr<MClientReply> const&)+0xcda) [0x55bdb6283d8a]",
"(Server::respond_to_request(boost::intrusive_ptr<MDRequestImpl>&, int)+0x238) [0x55bdb62840c8]",
"(Batch_Getattr_Lookup::_respond(int)+0xaf) [0x55bdb630a8df]",
"(BatchOp::respond(int)+0x48) [0x55bdb6594918]",
"(Server::respond_to_request(boost::intrusive_ptr<MDRequestImpl>&, int)+0x188) [0x55bdb6284018]",
"(Server::rdlock_path_pin_ref(boost::intrusive_ptr<MDRequestImpl>&, bool, bool)+0x6ff) [0x55bdb628512f]",
"(Server::handle_client_getattr(boost::intrusive_ptr<MDRequestImpl>&, bool)+0xad) [0x55bdb628762d]",
"(Server::dispatch_client_request(boost::intrusive_ptr<MDRequestImpl>&)+0x65f) [0x55bdb62bf5af]",
"(MDCache::dispatch_request(boost::intrusive_ptr<MDRequestImpl>&)+0x33) [0x55bdb6376193]",
"(MDSContext::complete(int)+0x56) [0x55bdb6532c06]",
"(MDSCacheObject::finish_waiting(unsigned long, int)+0xce) [0x55bdb6554cae]",
"(Locker::eval_gather(SimpleLock*, bool, bool*, std::vector<MDSContext*, std::allocator<MDSContext*> >*)+0x13d6) [0x55bdb6406d66]",
"(CDentry::remove_client_lease(ClientLease*, Locker*)+0x466) [0x55bdb646bd66]",
"(Locker::handle_client_lease(boost::intrusive_ptr<MClientLease const> const&)+0xc6a) [0x55bdb63f77aa]",
"(Locker::dispatch(boost::intrusive_ptr<Message const> const&)+0x134) [0x55bdb6419bf4]",
"(MDSRank::handle_message(boost::intrusive_ptr<Message const> const&)+0xbcc) [0x55bdb622fa2c]",
"(MDSRank::_dispatch(boost::intrusive_ptr<Message const> const&, bool)+0x7bb) [0x55bdb62323cb]",
"(MDSRankDispatcher::ms_dispatch(boost::intrusive_ptr<Message const> const&)+0x55) [0x55bdb62329c5]",
"(MDSDaemon::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x108) [0x55bdb62225d8]",
"(DispatchQueue::entry()+0x126a) [0x7f8ac4649aba]",
"(DispatchQueue::DispatchThread::entry()+0x11) [0x7f8ac46fb5d1]",
"/lib64/libpthread.so.0(+0x817a) [0x7f8ac33e917a]",
"clone()"
],
"ceph_version": "16.2.7",
"crash_id": "2022-01-19T22:07:07.314026Z_bfa16d64-b959-4221-9375-32dfcc93b981",
"entity_name": "mds.980deab16bd16b05ef933948f71c9333a65f72e7",
"os_id": "centos",
"os_name": "CentOS Linux",
"os_version": "8",
"os_version_id": "8",
"process_name": "ceph-mds",
"stack_sig": "7278758a86083041e8b61edeaffcf9cdde1003a6f303b704eabf2b009c3814ce",
"timestamp": "2022-01-19T22:07:07.314026Z",
"utsname_machine": "x86_64",
"utsname_release": "4.18.0-193.19.1.el8_2.centos.plus.x86_64",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP Mon Sep 14 20:42:18 UTC 2020"
}
Related issues
History
#1 Updated by Telemetry Bot over 1 year ago
#2 Updated by Venky Shankar over 1 year ago
- Project changed from RADOS to CephFS
- Category set to Correctness/Safety
- Target version set to v18.0.0
- Backport set to quincy, pacific
- Severity changed from 3 - minor to 2 - major
- Crash signature (v1) updated (diff)
Seen in a pacific cluster. Also, yet another similar backtrace from the same cluster:
/builddir/build/BUILD/ceph-16.2.0/src/mds/Server.cc: In function 'void Server::set_trace_dist(ceph::ref_t<MClientReply>&, CInode*, CDentry*, MDRequestRef&)' thread 7f7805a7f700 time 2022-04-04T17:37:46.071741+0000 /builddir/build/BUILD/ceph-16.2.0/src/mds/Server.cc: 2304: FAILED ceph_assert(!in) ceph version 16.2.0-152.el8cp (e456e8b705cb2f4a779689a0d80b122bcb0d67c9) pacific (stable) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x158) [0x7f780e4dab60] 2: /usr/lib64/ceph/libceph-common.so.2(+0x274d7a) [0x7f780e4dad7a] 3: (Server::set_trace_dist(boost::intrusive_ptr<MClientReply> const&, CInode*, CDentry*, boost::intrusive_ptr<MDRequestImpl>&)+0xca2) [0x56394cb9d5c2] 4: (Server::reply_client_request(boost::intrusive_ptr<MDRequestImpl>&, boost::intrusive_ptr<MClientReply> const&)+0xcda) [0x56394cba3d9a] 5: (Server::respond_to_request(boost::intrusive_ptr<MDRequestImpl>&, int)+0x238) [0x56394cba40d8] 6: (Batch_Getattr_Lookup::_respond(int)+0xaf) [0x56394cc2a0bf] 7: (BatchOp::respond(int)+0x48) [0x56394ceb2818] 8: (Server::respond_to_request(boost::intrusive_ptr<MDRequestImpl>&, int)+0x188) [0x56394cba4028] 9: (Server::handle_client_getattr(boost::intrusive_ptr<MDRequestImpl>&, bool)+0x408) [0x56394cba7938] 10: (Server::dispatch_client_request(boost::intrusive_ptr<MDRequestImpl>&)+0x65f) [0x56394cbdf16f] 11: (MDCache::dispatch_request(boost::intrusive_ptr<MDRequestImpl>&)+0x33) [0x56394cc97dc3] 12: (MDSContext::complete(int)+0x56) [0x56394ce51706] 13: (MDSCacheObject::finish_waiting(unsigned long, int)+0xce) [0x56394ce7352e] 14: (Locker::eval_gather(SimpleLock*, bool, bool*, std::vector<MDSContext*, std::allocator<MDSContext*> >*)+0x13d6) [0x56394cd266d6] 15: (CDentry::remove_client_lease(ClientLease*, Locker*)+0x466) [0x56394cd8b046] 16: (Locker::handle_client_lease(boost::intrusive_ptr<MClientLease const> const&)+0xc6a) [0x56394cd1715a] 17: (Locker::dispatch(boost::intrusive_ptr<Message const> const&)+0x134) [0x56394cd39534] 18: 
(MDSRank::handle_message(boost::intrusive_ptr<Message const> const&)+0xbcc) [0x56394cb4fabc] 19: (MDSRank::_dispatch(boost::intrusive_ptr<Message const> const&, bool)+0x7bb) [0x56394cb5211b] 20: (MDSRankDispatcher::ms_dispatch(boost::intrusive_ptr<Message const> const&)+0x55) [0x56394cb52715] 21: (MDSDaemon::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x128) [0x56394cb42688] 22: (DispatchQueue::entry()+0x126a) [0x7f780e71762a] 23: (DispatchQueue::DispatchThread::entry()+0x11) [0x7f780e7c78d1] 24: /lib64/libpthread.so.0(+0x817a) [0x7f780d27b17a] 25: clone()
#3 Updated by Venky Shankar over 1 year ago
- Priority changed from Normal to High
#4 Updated by Venky Shankar over 1 year ago
- Status changed from New to Triaged
- Assignee set to Venky Shankar
- Labels (FS) task(medium) added
#5 Updated by Venky Shankar over 1 year ago
I've managed to reproduce this crash today. Will send out a fix.
#6 Updated by Venky Shankar over 1 year ago
Venky Shankar wrote:
I've managed to reproduce this crash today. Will send out a fix.
OK. This was not the exact backtrace. It's similar, but not exactly the same. Probably a different issue - will create a tracker for this.
The issue seems to be triggered by a concurrent lookup and unlink. MDCache::path_traverse() uses CDentry::get_projected_linkage() and fills up mdr->dn0 with the dentry. However, Server::set_trace_dist() might not use the projected linkage, depending on the client and the lock state. Normally, if a lookup happens after the unlink operation finishes, the CDentry linkage is not in a projected state (Server::_unlink_local_finish()); however, a lookup can creep in with early_reply (before Server::_unlink_local_finish() gets invoked). This can lead to MDCache::path_traverse() using the projected linkage and Server::set_trace_dist() using the durable linkage.
#7 Updated by Venky Shankar over 1 year ago
- Status changed from Triaged to Fix Under Review
- Pull request ID set to 46210
#8 Updated by Venky Shankar about 1 year ago
- Status changed from Fix Under Review to Pending Backport
- Component(FS) MDS added
- Labels (FS) crash added
#9 Updated by Backport Bot about 1 year ago
- Copied to Backport #55932: pacific: crash: void Server::set_trace_dist(ceph::ref_t<MClientReply>&, CInode*, CDentry*, MDRequestRef&): assert(dnl->get_inode() == in) added
#10 Updated by Backport Bot about 1 year ago
- Copied to Backport #55933: quincy: crash: void Server::set_trace_dist(ceph::ref_t<MClientReply>&, CInode*, CDentry*, MDRequestRef&): assert(dnl->get_inode() == in) added
#11 Updated by Backport Bot about 1 year ago
- Tags set to backport_processed
#12 Updated by Venky Shankar 12 months ago
- Status changed from Pending Backport to Resolved