Bug #7406
closed: Seg fault in find_object_context() in recent master rados run
0%
Description
On commit 7fe10f1271dd94d4c5014a35c6362a7772dc4508
/a/teuthology-2014-02-09_23:00:13-rados-master-testing-basic-plana/74133
osd.0 seg faulted
#0 0x00007f6c0ad37b7b in raise (sig=<optimized out>) at ../nptl/sysdeps/unix/sysv/linux/pt-raise.c:42
#1 0x000000000091bf6e in reraise_fatal (signum=11) at global/signal_handler.cc:59
#2 handle_fatal_signal (signum=11) at global/signal_handler.cc:105
#3 <signal handler called>
#4 ReplicatedPG::find_object_context (this=0x2b5d000, oid=..., pobc=0x7f6bf5e8b200, can_create=<optimized out>, pmissing=0x7f6bf5e8aea0) at osd/ReplicatedPG.cc:6775
#5 0x000000000088c724 in ReplicatedPG::do_op (this=0x2b5d000, op=...) at osd/ReplicatedPG.cc:1170
#6 0x0000000000833d22 in ReplicatedPG::do_request (this=0x2b5d000, op=..., handle=...) at osd/ReplicatedPG.cc:1027
#7 0x00000000006081ba in OSD::dequeue_op (this=0x2657000, pg=..., op=..., handle=...) at osd/OSD.cc:7239
#8 0x0000000000621608 in OSD::OpWQ::_process (this=0x2657e28, pg=..., handle=...) at osd/OSD.cc:7209
#9 0x00000000006634ec in ThreadPool::WorkQueueVal<std::pair<boost::intrusive_ptr<PG>, std::tr1::shared_ptr<OpRequest> >, boost::intrusive_ptr<PG> >::_void_process (
this=0x2657e28, handle=...) at ./common/WorkQueue.h:190
#10 0x00000000009e91b6 in ThreadPool::worker (this=0x2657470, wt=0x2692e40) at common/WorkQueue.cc:125
#11 0x00000000009eafc0 in ThreadPool::WorkThread::entry (this=<optimized out>) at common/WorkQueue.h:317
#12 0x00007f6c0ad2fe9a in start_thread (arg=0x7f6bf5e8c700) at pthread_create.c:308
#13 0x00007f6c094f43fd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#14 0x0000000000000000 in ?? ()
(gdb) f 4
#4 ReplicatedPG::find_object_context (this=0x2b5d000, oid=..., pobc=0x7f6bf5e8b200, can_create=<optimized out>, pmissing=0x7f6bf5e8aea0) at osd/ReplicatedPG.cc:6775
6775 in osd/ReplicatedPG.cc
(gdb) p obc
$13 = {<std::tr1::__shared_ptr<ObjectContext, (_gnu_cxx::_Lock_policy)2>> = {_M_ptr = 0x2c872c0, _M_refcount = {_M_pi = 0x2c86000}}, <No data fields>}
(gdb) p ((ObjectContext *)0x2c872c0)->obs.oi.snaps
$15 = {<std::_Vector_base<snapid_t, std::allocator<snapid_t> >> = {
_M_impl = {<std::allocator<snapid_t>> = {<_gnu_cxx::new_allocator<snapid_t>> = {<No data fields>}, <No data fields>}, _M_start = 0x0, _M_finish = 0x0,
_M_end_of_storage = 0x0}}, <No data fields>}
The crash involves lines 6774 and 6775. With snaps.size() == 0, I am not sure why it did not crash at line 6774 on the snaps[-1] reference but did crash at line 6775 on snaps[0], unless the compiler reordered these two loads during optimization:
snapid_t first = obc->obs.oi.snaps[obc->obs.oi.snaps.size()-1];
snapid_t last = obc->obs.oi.snaps[0];
Nothing in the last section of code in find_object_context() has changed recently:
c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6763) if (!obc->ssc) {
c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6764) obc->ssc = ssc;
c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6765) } else {
c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6766) assert(obc->ssc == ssc);
c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6767) put_snapset_context(ssc);
c6b73eb4 src/osd/ReplicatedPG.cc (Sage Weil 2013-12-30 12:52:20 -0800 6768) }
c3c1541c src/osd/ReplicatedPG.cc (Sage Weil 2013-12-23 17:25:07 -0800 6769) ssc = 0;
c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6770)
c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6771) // clone
c3c1541c src/osd/ReplicatedPG.cc (Sage Weil 2013-12-23 17:25:07 -0800 6772) dout(20) << "find_object_context " << soid << " snaps " << obc->obs.oi.snaps
c3c1541c src/osd/ReplicatedPG.cc (Sage Weil 2013-12-23 17:25:07 -0800 6773) << dendl;
0ebda4a9 src/osd/ReplicatedPG.cc (Sage Weil 2009-05-27 21:43:41 -0700 6774) snapid_t first = obc->obs.oi.snaps[obc->obs.oi.snaps.size()-1];
0ebda4a9 src/osd/ReplicatedPG.cc (Sage Weil 2009-05-27 21:43:41 -0700 6775) snapid_t last = obc->obs.oi.snaps[0];
5492cbdf src/osd/ReplicatedPG.cc (Samuel Just 2011-07-13 11:02:56 -0700 6776) if (first <= oid.snap) {
bc1782a1 src/osd/ReplicatedPG.cc (Sage Weil 2011-06-16 19:42:48 -0700 6777) dout(20) << "find_object_context " << soid << " [" << first << "," << last
5492cbdf src/osd/ReplicatedPG.cc (Samuel Just 2011-07-13 11:02:56 -0700 6778) << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6779) *pobc = obc;
c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6780) return 0;
c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6781) } else {
bc1782a1 src/osd/ReplicatedPG.cc (Sage Weil 2011-06-16 19:42:48 -0700 6782) dout(20) << "find_object_context " << soid << " [" << first << "," << last
5492cbdf src/osd/ReplicatedPG.cc (Samuel Just 2011-07-13 11:02:56 -0700 6783) << "] does not contain " << oid.snap << " -- DNE" << dendl;
c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6784) return -ENOENT;
c2012f3c src/osd/ReplicatedPG.cc (Sage Weil 2009-05-22 11:43:38 -0700 6785) }
7a1eaa98 src/osd/ReplicatedPG.cc (Sage Weil 2008-12-03 19:46:08 -0800 6786) }