Bug #55355
Updated by Radoslaw Zarzynski about 2 years ago
My Ceph version is 14.2.22. After a network abnormality, the OSD could not join the cluster. I then found that two OSD threads are deadlocked: <pre> 45 Thread 0x7fdf0ec7f700 (LWP 22855) "cfin" 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 71 Thread 0x7fdf20ca3700 (LWP 20084) "msgr-worker-2" 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) t 45 [Switching to thread 45 (Thread 0x7fdf0ec7f700 (LWP 22855))] #0 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt #0 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00007fdf26207e9b in _L_lock_883 () from /lib64/libpthread.so.0 #2 0x00007fdf26207d68 in pthread_mutex_lock () from /lib64/libpthread.so.0 #3 0x000055f9d176317f in __gthread_mutex_lock (__mutex=0x55fa432cd268) at /opt/rh/devtoolset-8/root/usr/include/c++/8/x86_64-redhat-linux/bits/gthr-default.h:748 #4 lock (this=0x55fa432cd268) at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_mutex.h:103 #5 AsyncConnection::stop(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncConnection.cc:711 #6 0x000055f9d15a61c5 in AsyncMessenger::shutdown_connections(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:806 #7 0x000055f9d15a7e36 in mark_down_all (this=0x55f9dca4b600) at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.h:158 #8 AsyncMessenger::rebind(std::set<int, std::less<int>, std::allocator<int> > const&) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:428 #9 0x000055f9d0e782c2 in OSD::_committed_osd_maps(unsigned int, unsigned int, MOSDMap*) () at /usr/src/debug/ceph-14.2.22-2/src/osd/OSD.cc:8750 #10 0x000055f9d0ecf7b7 in C_OnMapCommit::finish (this=0x55fa463b3ee0, r=<optimized out>) at /usr/src/debug/ceph-14.2.22-2/src/osd/OSD.cc:8204 #11 0x000055f9d0e80399 in Context::complete (this=0x55fa463b3ee0, r=<optimized out>) at /usr/src/debug/ceph-14.2.22-2/src/include/Context.h:77 #12 0x000055f9d13f454f in 
Finisher::finisher_thread_entry() () at /usr/src/debug/ceph-14.2.22-2/src/common/Finisher.cc:67 #13 0x00007fdf26205ea5 in start_thread () from /lib64/libpthread.so.0 #14 0x00007fdf250ca8cd in clone () from /lib64/libc.so.6 (gdb) (gdb) f 5 #5 AsyncConnection::stop(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncConnection.cc:711 711 lock.lock() (gdb) p lock $5 = {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 2, __count = 0, __owner = 20084, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000tN\000\000\001", '\000' <repeats 26 times>, __align = 2}}, <No data fields>} (gdb) p &lock $6 = (std::mutex *) 0x55fa432cd268 (gdb) (gdb) f 6 #6 0x000055f9d15a61c5 in AsyncMessenger::shutdown_connections(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:806 806 p->stop(queue_reset); (gdb) p lock $17 = {name = "AsyncMessenger::lock", id = -1, recursive = false, lockdep = true, backtrace = false, _m = {__data = {__lock = 2, __count = 0, __owner = 22855, __nusers = 1, __kind = 2, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000GY\000\000\001\000\000\000\002", '\000' <repeats 22 times>, __align = 2}, nlock = 1, locked_by = 140596002420480} (gdb) p &lock $18 = (Mutex *) 0x55f9dca4bc80 (gdb) Thread 45 (LWP 22855) holds lock A (0x55f9dca4bc80) and waits for lock B (0x55fa432cd268) ============= (gdb) t 71 [Switching to thread 71 (Thread 0x7fdf20ca3700 (LWP 20084))] #0 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 (gdb) bt #0 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00007fdf26207e9b in _L_lock_883 () from /lib64/libpthread.so.0 #2 0x00007fdf26207d68 in pthread_mutex_lock () from /lib64/libpthread.so.0 #3 0x000055f9d1415429 in Mutex::lock (this=this@entry=0x55f9dca4bc80, no_lockdep=no_lockdep@entry=false) at 
/usr/src/debug/ceph-14.2.22-2/src/common/Mutex.cc:78 #4 0x000055f9d15a6dcc in lock_guard (__m=..., this=<synthetic pointer>) at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:855 #5 AsyncMessenger::accept_conn(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:855 #6 0x000055f9d179777f in ProtocolV2::send_server_ident() () at /usr/src/debug/ceph-14.2.22-2/src/common/RefCountedObj.h:171 #7 0x000055f9d179c923 in ProtocolV2::handle_existing_connection(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:2543 #8 0x000055f9d179e183 in ProtocolV2::handle_client_ident(ceph::buffer::v14_2_0::list&) () at /usr/src/debug/ceph-14.2.22-2/src/common/RefCountedObj.h:171 #9 0x000055f9d179e64b in ProtocolV2::handle_frame_payload() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1217 #10 0x000055f9d179e8a0 in ProtocolV2::handle_read_frame_dispatch() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1126 #11 0x000055f9d179ea45 in ProtocolV2::_handle_read_frame_epilogue_main() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1316 #12 0x000055f9d179eaea in ProtocolV2::_handle_read_frame_segment() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1185 #13 0x000055f9d179f90d in ProtocolV2::handle_read_frame_segment(std::unique_ptr<ceph::buffer::v14_2_0::ptr_node, ceph::buffer::v14_2_0::ptr_node::disposer>&&, int) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1177 #14 0x000055f9d178a3c4 in ProtocolV2::run_continuation(Ct<ProtocolV2>&) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:47 #15 0x000055f9d1765bd6 in operator() (__args#1=<optimized out>, __args#0=<optimized out>, this=0x55fa2eed6fc8) at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_function.h:260 #16 AsyncConnection::process() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncConnection.cc:450 #17 
0x000055f9d15af0b5 in EventCenter::process_events(unsigned int, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >*) () at /usr/src/debug/ceph-14.2.22-2/src/common/StackStringStream.h:143 #18 0x000055f9d15b3817 in operator() (__closure=0x55f9dc93b2a8, __closure=0x55f9dc93b2a8) at /usr/src/debug/ceph-14.2.22-2/src/msg/async/Stack.cc:53 #19 std::_Function_handler<void (), NetworkStack::add_thread(unsigned int)::{lambda()#1}>::_M_invoke(std::_Any_data const&) () at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_function.h:297 #20 0x000055f9d1a98fbf in execute_native_thread_routine () #21 0x00007fdf26205ea5 in start_thread () from /lib64/libpthread.so.0 #22 0x00007fdf250ca8cd in clone () from /lib64/libc.so.6 (gdb) f 5 #5 AsyncMessenger::accept_conn(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:855 855 Mutex::Locker l(lock); (gdb) p lock $9 = {name = "AsyncMessenger::lock", id = -1, recursive = false, lockdep = true, backtrace = false, _m = {__data = {__lock = 2, __count = 0, __owner = 22855, __nusers = 1, __kind = 2, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000GY\000\000\001\000\000\000\002", '\000' <repeats 22 times>, __align = 2}, nlock = 1, locked_by = 140596002420480} (gdb) p &lock $12 = (Mutex *) 0x55f9dca4bc80 (gdb) f 7 #7 0x000055f9d179c923 in ProtocolV2::handle_existing_connection(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:2543 2543 return send_server_ident(); (gdb) p existing $5 = {px = 0x55fa432cd000} (gdb) (gdb) p l $65 = {_M_device = @0x55fa432cd268} (gdb) p l._M_device $66 = (std::lock_guard<std::mutex>::mutex_type &) @0x55fa432cd268: {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 2, __count = 0, __owner = 20084, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = 
"\002\000\000\000\000\000\000\000tN\000\000\001", '\000' <repeats 26 times>, __align = 2}}, <No data fields>} Thread 71 (LWP 20084) holds lock B (0x55fa432cd268) and waits for lock A (0x55f9dca4bc80) ============= </pre>