Bug #7406
Updated by David Zafman about 10 years ago
On commit 7fe10f1271dd94d4c5014a35c6362a7772dc4508, in /a/teuthology-2014-02-09_23:00:13-rados-master-testing-basic-plana/74133, osd.0 segfaulted: #0 0x00007f6c0ad37b7b in raise (sig=<optimized out>) at ../nptl/sysdeps/unix/sysv/linux/pt-raise.c:42 #1 0x000000000091bf6e in reraise_fatal (signum=11) at global/signal_handler.cc:59 #2 handle_fatal_signal (signum=11) at global/signal_handler.cc:105 #3 <signal handler called> #4 ReplicatedPG::find_object_context (this=0x2b5d000, oid=..., pobc=0x7f6bf5e8b200, can_create=<optimized out>, pmissing=0x7f6bf5e8aea0) at osd/ReplicatedPG.cc:6775 #5 0x000000000088c724 in ReplicatedPG::do_op (this=0x2b5d000, op=...) at osd/ReplicatedPG.cc:1170 #6 0x0000000000833d22 in ReplicatedPG::do_request (this=0x2b5d000, op=..., handle=...) at osd/ReplicatedPG.cc:1027 #7 0x00000000006081ba in OSD::dequeue_op (this=0x2657000, pg=..., op=..., handle=...) at osd/OSD.cc:7239 #8 0x0000000000621608 in OSD::OpWQ::_process (this=0x2657e28, pg=..., handle=...) at osd/OSD.cc:7209 #9 0x00000000006634ec in ThreadPool::WorkQueueVal<std::pair<boost::intrusive_ptr<PG>, std::tr1::shared_ptr<OpRequest> >, boost::intrusive_ptr<PG> >::_void_process ( this=0x2657e28, handle=...) at ./common/WorkQueue.h:190 #10 0x00000000009e91b6 in ThreadPool::worker (this=0x2657470, wt=0x2692e40) at common/WorkQueue.cc:125 #11 0x00000000009eafc0 in ThreadPool::WorkThread::entry (this=<optimized out>) at common/WorkQueue.h:317 #12 0x00007f6c0ad2fe9a in start_thread (arg=0x7f6bf5e8c700) at pthread_create.c:308 #13 0x00007f6c094f43fd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112 #14 0x0000000000000000 in ?? 
() (gdb) f 4 #4 ReplicatedPG::find_object_context (this=0x2b5d000, oid=..., pobc=0x7f6bf5e8b200, can_create=<optimized out>, pmissing=0x7f6bf5e8aea0) at osd/ReplicatedPG.cc:6775 6775 in osd/ReplicatedPG.cc (gdb) p obc $13 = {<std::tr1::__shared_ptr<ObjectContext, (__gnu_cxx::_Lock_policy)2>> = {_M_ptr = 0x2c872c0, _M_refcount = {_M_pi = 0x2c86000}}, <No data fields>} (gdb) p ((ObjectContext *)0x2c872c0)->obs.oi.snaps $15 = {<std::_Vector_base<snapid_t, std::allocator<snapid_t> >> = { _M_impl = {<std::allocator<snapid_t>> = {<__gnu_cxx::new_allocator<snapid_t>> = {<No data fields>}, <No data fields>}, _M_start = 0x0, _M_finish = 0x0, _M_end_of_storage = 0x0}}, <No data fields>} The relevant lines are 6774 and 6775. I'm not sure why it didn't crash on line 6774: with snaps.size() == 0, the snaps[-1] reference did not crash but the snaps[0] reference did, unless the two statements were optimized into a different order: snapid_t first = obc->obs.oi.snaps[obc->obs.oi.snaps.size()-1]; snapid_t last = obc->obs.oi.snaps[0]; Nothing in the last section of code in find_object_context() has changed recently: <pre><code class="cpp"> c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6763) if (!obc->ssc) { c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6764) obc->ssc = ssc; c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6765) } else { c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6766) assert(obc->ssc == ssc); c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6767) put_snapset_context(ssc); c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6768) } c3c1541c src/osd/ReplicatedPG.cc (Sage Weil 2013-12-23 17:25:07 -0800 6769) ssc = 0; c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6770) c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6771) // clone c3c1541c src/osd/ReplicatedPG.cc (Sage Weil 2013-12-23 17:25:07 -0800 6772) dout(20) << "find_object_context " << soid 
<< " snaps " << obc->obs.oi.snaps c3c1541c src/osd/ReplicatedPG.cc (Sage Weil 2013-12-23 17:25:07 -0800 6773) << dendl; 0ebda4a9 src/osd/ReplicatedPG.cc (Sage Weil 2009-05-27 21:43:41 -0700 6774) snapid_t first = obc->obs.oi.snaps[obc->obs.oi.snaps.size()-1]; 0ebda4a9 src/osd/ReplicatedPG.cc (Sage Weil 2009-05-27 21:43:41 -0700 6775) snapid_t last = obc->obs.oi.snaps[0]; 5492cbdf src/osd/ReplicatedPG.cc (Samuel Just 2011-07-13 11:02:56 -0700 6776) if (first <= oid.snap) { bc1782a1 src/osd/ReplicatedPG.cc (Sage Weil 2011-06-16 19:42:48 -0700 6777) dout(20) << "find_object_context " << soid << " [" << first << "," << last 5492cbdf src/osd/ReplicatedPG.cc (Samuel Just 2011-07-13 11:02:56 -0700 6778) << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl; c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6779) *pobc = obc; c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6780) return 0; c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6781) } else { bc1782a1 src/osd/ReplicatedPG.cc (Sage Weil 2011-06-16 19:42:48 -0700 6782) dout(20) << "find_object_context " << soid << " [" << first << "," << last 5492cbdf src/osd/ReplicatedPG.cc (Samuel Just 2011-07-13 11:02:56 -0700 6783) << "] does not contain " << oid.snap << " -- DNE" << dendl; c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6784) return -ENOENT; c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6785) } 7a1eaa98 src/osd/ReplicatedPG.cc (Sage Weil 2008-12-03 19:46:08 -0800 6786) } </code></pre>