Project

General

Profile

Bug #7406

Updated by David Zafman about 10 years ago

On commit 7fe10f1271dd94d4c5014a35c6362a7772dc4508 

 /a/teuthology-2014-02-09_23:00:13-rados-master-testing-basic-plana/74133 

 osd.0 segfaulted 

 #0    0x00007f6c0ad37b7b in raise (sig=<optimized out>) at ../nptl/sysdeps/unix/sysv/linux/pt-raise.c:42 
 #1    0x000000000091bf6e in reraise_fatal (signum=11) at global/signal_handler.cc:59 
 #2    handle_fatal_signal (signum=11) at global/signal_handler.cc:105 
 #3    <signal handler called> 
 #4    ReplicatedPG::find_object_context (this=0x2b5d000, oid=..., pobc=0x7f6bf5e8b200, can_create=<optimized out>, pmissing=0x7f6bf5e8aea0) at osd/ReplicatedPG.cc:6775 
 #5    0x000000000088c724 in ReplicatedPG::do_op (this=0x2b5d000, op=...) at osd/ReplicatedPG.cc:1170 
 #6    0x0000000000833d22 in ReplicatedPG::do_request (this=0x2b5d000, op=..., handle=...) at osd/ReplicatedPG.cc:1027 
 #7    0x00000000006081ba in OSD::dequeue_op (this=0x2657000, pg=..., op=..., handle=...) at osd/OSD.cc:7239 
 #8    0x0000000000621608 in OSD::OpWQ::_process (this=0x2657e28, pg=..., handle=...) at osd/OSD.cc:7209 
 #9    0x00000000006634ec in ThreadPool::WorkQueueVal<std::pair<boost::intrusive_ptr<PG>, std::tr1::shared_ptr<OpRequest> >, boost::intrusive_ptr<PG> >::_void_process ( 
     this=0x2657e28, handle=...) at ./common/WorkQueue.h:190 
 #10 0x00000000009e91b6 in ThreadPool::worker (this=0x2657470, wt=0x2692e40) at common/WorkQueue.cc:125 
 #11 0x00000000009eafc0 in ThreadPool::WorkThread::entry (this=<optimized out>) at common/WorkQueue.h:317 
 #12 0x00007f6c0ad2fe9a in start_thread (arg=0x7f6bf5e8c700) at pthread_create.c:308 
 #13 0x00007f6c094f43fd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112 
 #14 0x0000000000000000 in ?? () 
 (gdb) f 4 
 #4    ReplicatedPG::find_object_context (this=0x2b5d000, oid=..., pobc=0x7f6bf5e8b200, can_create=<optimized out>, pmissing=0x7f6bf5e8aea0) at osd/ReplicatedPG.cc:6775 
 6775      in osd/ReplicatedPG.cc 
 (gdb) p obc 
 $13 = {<std::tr1::__shared_ptr<ObjectContext, (__gnu_cxx::_Lock_policy)2>> = {_M_ptr = 0x2c872c0, _M_refcount = {_M_pi = 0x2c86000}}, <No data fields>} 
 (gdb) p ((ObjectContext *)0x2c872c0)->obs.oi.snaps 
 $15 = {<std::_Vector_base<snapid_t, std::allocator<snapid_t> >> = { 
     _M_impl = {<std::allocator<snapid_t>> = {<__gnu_cxx::new_allocator<snapid_t>> = {<No data fields>}, <No data fields>}, _M_start = 0x0, _M_finish = 0x0, 
       _M_end_of_storage = 0x0}}, <No data fields>} 

 The relevant code is at lines 6774 and 6775. With snaps.size() == 0, I am not sure why it didn't crash on the snaps[-1] reference at line 6774 but did crash on the snaps[0] reference at line 6775, unless the compiler reordered the two accesses during optimization: 

   snapid_t first = obc->obs.oi.snaps[obc->obs.oi.snaps.size()-1]; 
   snapid_t last = obc->obs.oi.snaps[0]; 

 Nothing in the last section of code in find_object_context() has changed recently: 

 <pre><code class="cpp"> 
 c6b73eb4 src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-30 12:52:20 -0800    6763)     if (!obc->ssc) { 
 c6b73eb4 src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-30 12:52:20 -0800    6764)       obc->ssc = ssc; 
 c6b73eb4 src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-30 12:52:20 -0800    6765)     } else { 
 c6b73eb4 src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-30 12:52:20 -0800    6766)       assert(obc->ssc == ssc); 
 c6b73eb4 src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-30 12:52:20 -0800    6767)       put_snapset_context(ssc); 
 c6b73eb4 src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-30 12:52:20 -0800    6768)     } 
 c3c1541c src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-23 17:25:07 -0800    6769)     ssc = 0; 
 c2012f3c src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-22 11:43:38 -0700    6770) 
 c2012f3c src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-22 11:43:38 -0700    6771)     // clone 
 c3c1541c src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-23 17:25:07 -0800    6772)     dout(20) << "find_object_context    " << soid << " snaps " << obc->obs.oi.snaps 
 c3c1541c src/osd/ReplicatedPG.cc          (Sage Weil           2013-12-23 17:25:07 -0800    6773)          << dendl; 
 0ebda4a9 src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-27 21:43:41 -0700    6774)     snapid_t first = obc->obs.oi.snaps[obc->obs.oi.snaps.size()-1]; 
 0ebda4a9 src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-27 21:43:41 -0700    6775)     snapid_t last = obc->obs.oi.snaps[0]; 
 5492cbdf src/osd/ReplicatedPG.cc          (Samuel Just         2011-07-13 11:02:56 -0700    6776)     if (first <= oid.snap) { 
 bc1782a1 src/osd/ReplicatedPG.cc          (Sage Weil           2011-06-16 19:42:48 -0700    6777)       dout(20) << "find_object_context    " << soid << " [" << first << "," << last 
 5492cbdf src/osd/ReplicatedPG.cc          (Samuel Just         2011-07-13 11:02:56 -0700    6778)            << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl; 
 c2012f3c src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-22 11:43:38 -0700    6779)       *pobc = obc; 
 c2012f3c src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-22 11:43:38 -0700    6780)       return 0; 
 c2012f3c src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-22 11:43:38 -0700    6781)     } else { 
 bc1782a1 src/osd/ReplicatedPG.cc          (Sage Weil           2011-06-16 19:42:48 -0700    6782)       dout(20) << "find_object_context    " << soid << " [" << first << "," << last 
 5492cbdf src/osd/ReplicatedPG.cc          (Samuel Just         2011-07-13 11:02:56 -0700    6783)            << "] does not contain " << oid.snap << " -- DNE" << dendl; 
 c2012f3c src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-22 11:43:38 -0700    6784)       return -ENOENT; 
 c2012f3c src/osd/ReplicatedPG.cc          (Sage Weil           2009-05-22 11:43:38 -0700    6785)     } 
 7a1eaa98 src/osd/ReplicatedPG.cc          (Sage Weil           2008-12-03 19:46:08 -0800    6786) } 
 </code></pre>

Back