Project

General

Profile

Bug #55355

Updated by Radoslaw Zarzynski about 2 years ago

My Ceph version is 14.2.22.
 After a network disruption, the OSD cannot rejoin the cluster.

 I then found that two threads in the OSD process are deadlocked on each other:

 <pre> 
 45     Thread 0x7fdf0ec7f700 (LWP 22855) "cfin" 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 
 71     Thread 0x7fdf20ca3700 (LWP 20084) "msgr-worker-2" 0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 

 (gdb) t 45 
 [Switching to thread 45 (Thread 0x7fdf0ec7f700 (LWP 22855))] 
 #0    0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 
 (gdb) bt 
 #0    0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 
 #1    0x00007fdf26207e9b in _L_lock_883 () from /lib64/libpthread.so.0 
 #2    0x00007fdf26207d68 in pthread_mutex_lock () from /lib64/libpthread.so.0 
 #3    0x000055f9d176317f in __gthread_mutex_lock (__mutex=0x55fa432cd268) at /opt/rh/devtoolset-8/root/usr/include/c++/8/x86_64-redhat-linux/bits/gthr-default.h:748 
 #4    lock (this=0x55fa432cd268) at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_mutex.h:103 
 #5    AsyncConnection::stop(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncConnection.cc:711 
 #6    0x000055f9d15a61c5 in AsyncMessenger::shutdown_connections(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:806 
 #7    0x000055f9d15a7e36 in mark_down_all (this=0x55f9dca4b600) at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.h:158 
 #8    AsyncMessenger::rebind(std::set<int, std::less<int>, std::allocator<int> > const&) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:428 
 #9    0x000055f9d0e782c2 in OSD::_committed_osd_maps(unsigned int, unsigned int, MOSDMap*) () at /usr/src/debug/ceph-14.2.22-2/src/osd/OSD.cc:8750 
 #10 0x000055f9d0ecf7b7 in C_OnMapCommit::finish (this=0x55fa463b3ee0, r=<optimized out>) at /usr/src/debug/ceph-14.2.22-2/src/osd/OSD.cc:8204 
 #11 0x000055f9d0e80399 in Context::complete (this=0x55fa463b3ee0, r=<optimized out>) at /usr/src/debug/ceph-14.2.22-2/src/include/Context.h:77 
 #12 0x000055f9d13f454f in Finisher::finisher_thread_entry() () at /usr/src/debug/ceph-14.2.22-2/src/common/Finisher.cc:67 
 #13 0x00007fdf26205ea5 in start_thread () from /lib64/libpthread.so.0 
 #14 0x00007fdf250ca8cd in clone () from /lib64/libc.so.6 
 (gdb)  
 (gdb) f 5 
 #5    AsyncConnection::stop(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncConnection.cc:711 
 711 	   lock.lock() 
 (gdb) p lock 
 $5 = {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 2, __count = 0, __owner = 20084, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0,  
           __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000tN\000\000\001", '\000' <repeats 26 times>, __align = 2}}, <No data fields>} 
 (gdb) p &lock 
 $6 = (std::mutex *) 0x55fa432cd268 
 (gdb)  
 (gdb) f 6 
 #6    0x000055f9d15a61c5 in AsyncMessenger::shutdown_connections(bool) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:806 
 806 	     p->stop(queue_reset); 
 (gdb) p lock 
 $17 = {name = "AsyncMessenger::lock", id = -1, recursive = false, lockdep = true, backtrace = false, _m = {__data = {__lock = 2, __count = 0, __owner = 22855, __nusers = 1,  
       __kind = 2, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},  
     __size = "\002\000\000\000\000\000\000\000GY\000\000\001\000\000\000\002", '\000' <repeats 22 times>, __align = 2}, nlock = 1, locked_by = 140596002420480} 
 (gdb) p &lock 
 $18 = (Mutex *) 0x55f9dca4bc80 
 (gdb)  

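 ============= Thread 45 (LWP 22855) holds lock A (AsyncMessenger::lock, 0x55f9dca4bc80) and is waiting for lock B (the std::mutex at 0x55fa432cd268 taken in AsyncConnection::stop, currently owned by LWP 20084, i.e. thread 71) =============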

 (gdb) t 71 
 [Switching to thread 71 (Thread 0x7fdf20ca3700 (LWP 20084))] 
 #0    0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 
 (gdb) bt 
 #0    0x00007fdf2620c54d in __lll_lock_wait () from /lib64/libpthread.so.0 
 #1    0x00007fdf26207e9b in _L_lock_883 () from /lib64/libpthread.so.0 
 #2    0x00007fdf26207d68 in pthread_mutex_lock () from /lib64/libpthread.so.0 
 #3    0x000055f9d1415429 in Mutex::lock (this=this@entry=0x55f9dca4bc80, no_lockdep=no_lockdep@entry=false) at /usr/src/debug/ceph-14.2.22-2/src/common/Mutex.cc:78 
 #4    0x000055f9d15a6dcc in lock_guard (__m=..., this=<synthetic pointer>) at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:855 
 #5    AsyncMessenger::accept_conn(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:855 
 #6    0x000055f9d179777f in ProtocolV2::send_server_ident() () at /usr/src/debug/ceph-14.2.22-2/src/common/RefCountedObj.h:171 
 #7    0x000055f9d179c923 in ProtocolV2::handle_existing_connection(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:2543 
 #8    0x000055f9d179e183 in ProtocolV2::handle_client_ident(ceph::buffer::v14_2_0::list&) () at /usr/src/debug/ceph-14.2.22-2/src/common/RefCountedObj.h:171 
 #9    0x000055f9d179e64b in ProtocolV2::handle_frame_payload() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1217 
 #10 0x000055f9d179e8a0 in ProtocolV2::handle_read_frame_dispatch() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1126 
 #11 0x000055f9d179ea45 in ProtocolV2::_handle_read_frame_epilogue_main() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1316 
 #12 0x000055f9d179eaea in ProtocolV2::_handle_read_frame_segment() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1185 
 #13 0x000055f9d179f90d in ProtocolV2::handle_read_frame_segment(std::unique_ptr<ceph::buffer::v14_2_0::ptr_node, ceph::buffer::v14_2_0::ptr_node::disposer>&&, int) () 
     at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:1177 
 #14 0x000055f9d178a3c4 in ProtocolV2::run_continuation(Ct<ProtocolV2>&) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:47 
 #15 0x000055f9d1765bd6 in operator() (__args#1=<optimized out>, __args#0=<optimized out>, this=0x55fa2eed6fc8) 
     at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_function.h:260 
 #16 AsyncConnection::process() () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncConnection.cc:450 
 #17 0x000055f9d15af0b5 in EventCenter::process_events(unsigned int, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >*) () 
     at /usr/src/debug/ceph-14.2.22-2/src/common/StackStringStream.h:143 
 #18 0x000055f9d15b3817 in operator() (__closure=0x55f9dc93b2a8, __closure=0x55f9dc93b2a8) at /usr/src/debug/ceph-14.2.22-2/src/msg/async/Stack.cc:53 
 #19 std::_Function_handler<void (), NetworkStack::add_thread(unsigned int)::{lambda()#1}>::_M_invoke(std::_Any_data const&) () 
     at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_function.h:297 
 #20 0x000055f9d1a98fbf in execute_native_thread_routine () 
 #21 0x00007fdf26205ea5 in start_thread () from /lib64/libpthread.so.0 
 #22 0x00007fdf250ca8cd in clone () from /lib64/libc.so.6 

 (gdb) f 5 
 #5    AsyncMessenger::accept_conn(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/AsyncMessenger.cc:855 
 855 	   Mutex::Locker l(lock); 
 (gdb) p lock 
 $9 = {name = "AsyncMessenger::lock", id = -1, recursive = false, lockdep = true, backtrace = false, _m = {__data = {__lock = 2, __count = 0, __owner = 22855, __nusers = 1,  
       __kind = 2, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}},  
     __size = "\002\000\000\000\000\000\000\000GY\000\000\001\000\000\000\002", '\000' <repeats 22 times>, __align = 2}, nlock = 1, locked_by = 140596002420480} 
 (gdb) p &lock 
 $12 = (Mutex *) 0x55f9dca4bc80 
 (gdb) f 7 
 #7    0x000055f9d179c923 in ProtocolV2::handle_existing_connection(boost::intrusive_ptr<AsyncConnection>) () at /usr/src/debug/ceph-14.2.22-2/src/msg/async/ProtocolV2.cc:2543 
 2543 	     return send_server_ident(); 
 (gdb) p existing 
 $5 = {px = 0x55fa432cd000} 
 (gdb)  
 (gdb) p l 
 $65 = {_M_device = @0x55fa432cd268} 
 (gdb) p l._M_device 
 $66 = (std::lock_guard<std::mutex>::mutex_type &) @0x55fa432cd268: {<std::__mutex_base> = {_M_mutex = {__data = {__lock = 2, __count = 0, __owner = 20084, __nusers = 1,  
         __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = "\002\000\000\000\000\000\000\000tN\000\000\001", '\000' <repeats 26 times>,  
       __align = 2}}, <No data fields>} 

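 ============= Thread 71 (LWP 20084) holds lock B (the std::mutex at 0x55fa432cd268, held via the lock_guard `l`) and is waiting for lock A (AsyncMessenger::lock, 0x55f9dca4bc80, currently owned by LWP 22855, i.e. thread 45) =============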

 </pre>

Back