Actions
Bug #5104
closedMDS crashed in Objecter::handle_osd_op_reply
% Done:
0%
Source:
other
Tags:
Backport:
Regression:
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(FS):
Labels (FS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
While trying to reproduce #4999, I collected this in an MDS.
I was running next branch (commit c80c6a032c) merged into wip-4999.
-80> 2013-05-17 14:59:29.707128 7ffff26e5700 10 monclient: tick -79> 2013-05-17 14:59:29.707158 7ffff26e5700 10 monclient: renew subs? (now: 2013-05-17 14:59:29.707158; renew after: 2013-05-17 15:00:29.705072) -- no -78> 2013-05-17 14:59:31.916597 7ffff15e2700 5 mds.-1.0 is_laggy 40.991524 > 15 since last acked beacon -77> 2013-05-17 14:59:31.916639 7ffff15e2700 5 mds.-1.0 initiating monitor reconnect; maybe we're not the slow one -76> 2013-05-17 14:59:31.916642 7ffff15e2700 10 monclient: _reopen_session -75> 2013-05-17 14:59:31.916745 7ffff15e2700 10 monclient: _pick_new_mon picked mon.cs28 con 0x1508580 addr 172.17.131.28:6789/0 -74> 2013-05-17 14:59:31.916776 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -73> 2013-05-17 14:59:31.916781 7ffff15e2700 10 monclient: renew_subs -72> 2013-05-17 14:59:31.916783 7ffff15e2700 5 mds.-1.0 tick bailing out since we seem laggy -71> 2013-05-17 14:59:31.917838 7ffff36e7700 5 mds.-1.0 ms_handle_connect on 172.17.131.28:6789/0 -70> 2013-05-17 14:59:36.916734 7ffff15e2700 5 mds.-1.0 is_laggy 45.991661 > 15 since last acked beacon -69> 2013-05-17 14:59:36.916772 7ffff15e2700 5 mds.-1.0 initiating monitor reconnect; maybe we're not the slow one -68> 2013-05-17 14:59:36.916775 7ffff15e2700 10 monclient: _reopen_session -67> 2013-05-17 14:59:36.916933 7ffff15e2700 10 monclient: _pick_new_mon picked mon.cs29 con 0x1508f20 addr 172.17.131.29:6789/0 -66> 2013-05-17 14:59:36.917007 7ffff15e2700 10 monclient: _send_mon_message to mon.cs29 at 172.17.131.29:6789/0 -65> 2013-05-17 14:59:36.917015 7ffff15e2700 10 monclient: renew_subs -64> 2013-05-17 14:59:36.917017 7ffff15e2700 5 mds.-1.0 tick bailing out since we seem laggy -63> 2013-05-17 14:59:36.918039 7ffff36e7700 5 mds.-1.0 ms_handle_connect on 172.17.131.29:6789/0 -62> 2013-05-17 14:59:39.707340 7ffff26e5700 10 monclient: tick -61> 2013-05-17 14:59:39.707360 7ffff26e5700 10 monclient: renew subs? (now: 2013-05-17 14:59:39.707360; renew after: 2013-05-17 15:00:29.705072) -- no -60> 2013-05-17 14:59:41.916897 7ffff15e2700 5 mds.-1.0 is_laggy 50.991824 > 15 since last acked beacon -59> 2013-05-17 14:59:41.916931 7ffff15e2700 5 mds.-1.0 initiating monitor reconnect; maybe we're not the slow one -58> 2013-05-17 14:59:41.916934 7ffff15e2700 10 monclient: _reopen_session -57> 2013-05-17 14:59:41.917025 7ffff15e2700 10 monclient: _pick_new_mon picked mon.cs30 con 0x1508c60 addr 172.17.131.30:6789/0 -56> 2013-05-17 14:59:41.917055 7ffff15e2700 10 monclient: _send_mon_message to mon.cs30 at 172.17.131.30:6789/0 -55> 2013-05-17 14:59:41.917082 7ffff15e2700 10 monclient: renew_subs -54> 2013-05-17 14:59:41.917090 7ffff15e2700 5 mds.-1.0 tick bailing out since we seem laggy -53> 2013-05-17 14:59:41.918188 7ffff36e7700 5 mds.-1.0 ms_handle_connect on 172.17.131.30:6789/0 -52> 2013-05-17 14:59:46.917037 7ffff15e2700 5 mds.-1.0 is_laggy 55.991963 > 15 since last acked beacon -51> 2013-05-17 14:59:46.917071 7ffff15e2700 5 mds.-1.0 initiating monitor reconnect; maybe we're not the slow one -50> 2013-05-17 14:59:46.917075 7ffff15e2700 10 monclient: _reopen_session -49> 2013-05-17 14:59:46.917176 7ffff15e2700 10 monclient: _pick_new_mon picked mon.cs28 con 0x1508dc0 addr 172.17.131.28:6789/0 -48> 2013-05-17 14:59:46.917210 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -47> 2013-05-17 14:59:46.917278 7ffff15e2700 10 monclient: renew_subs -46> 2013-05-17 14:59:46.917292 7ffff15e2700 5 mds.-1.0 tick bailing out since we seem laggy -45> 2013-05-17 14:59:46.918086 7ffff36e7700 5 mds.-1.0 ms_handle_connect on 172.17.131.28:6789/0 -44> 2013-05-17 14:59:46.918709 7ffff36e7700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -43> 2013-05-17 14:59:46.919550 7ffff36e7700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -42> 2013-05-17 14:59:46.919578 7ffff36e7700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2013-05-17 14:59:16.919577) -41> 2013-05-17 14:59:46.920056 7ffff36e7700 10 monclient: handle_subscribe_ack sent 2013-05-17 14:59:21.916552 renew after 2013-05-17 15:01:51.916552 -40> 2013-05-17 14:59:46.927575 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -39> 2013-05-17 14:59:46.928365 7ffff36e7700 0 mds.-1.0 handle_mds_beacon no longer laggy -38> 2013-05-17 14:59:49.707486 7ffff26e5700 10 monclient: tick -37> 2013-05-17 14:59:49.707570 7ffff26e5700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2013-05-17 14:59:19.707568) -36> 2013-05-17 14:59:49.707583 7ffff26e5700 10 monclient: renew subs? (now: 2013-05-17 14:59:49.707583; renew after: 2013-05-17 15:01:51.916552) -- no -35> 2013-05-17 14:59:50.927682 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -34> 2013-05-17 14:59:54.927866 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -33> 2013-05-17 14:59:58.928083 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -32> 2013-05-17 14:59:59.707741 7ffff26e5700 10 monclient: tick -31> 2013-05-17 14:59:59.707759 7ffff26e5700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2013-05-17 14:59:29.707758) -30> 2013-05-17 14:59:59.707770 7ffff26e5700 10 monclient: renew subs? (now: 2013-05-17 14:59:59.707770; renew after: 2013-05-17 15:01:51.916552) -- no -29> 2013-05-17 15:00:02.928247 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -28> 2013-05-17 15:00:06.928367 7ffff15e2700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -27> 2013-05-17 15:00:09.707938 7ffff26e5700 10 monclient: tick -26> 2013-05-17 15:00:09.707965 7ffff26e5700 10 monclient: _check_auth_rotating have uptodate secrets (they expire after 2013-05-17 14:59:39.707963) -25> 2013-05-17 15:00:09.707978 7ffff26e5700 10 monclient: renew subs? (now: 2013-05-17 15:00:09.707978; renew after: 2013-05-17 15:01:51.916552) -- no -24> 2013-05-17 15:00:10.544530 7ffff36e7700 5 mds.-1.0 handle_mds_map epoch 28 from mon.0 -23> 2013-05-17 15:00:10.544595 7ffff36e7700 1 mds.0.6 handle_mds_map i am now mds.0.6 -22> 2013-05-17 15:00:10.544602 7ffff36e7700 1 mds.0.6 handle_mds_map state change up:standby --> up:replay -21> 2013-05-17 15:00:10.552922 7ffff36e7700 1 mds.0.6 replay_start -20> 2013-05-17 15:00:10.555719 7ffff36e7700 1 mds.0.6 recovery set is -19> 2013-05-17 15:00:10.555734 7ffff36e7700 1 mds.0.6 need osdmap epoch 19, have 11 -18> 2013-05-17 15:00:10.555738 7ffff36e7700 1 mds.0.6 waiting for osdmap 19 (which blacklists prior instance) -17> 2013-05-17 15:00:10.555750 7ffff36e7700 10 monclient: renew_subs -16> 2013-05-17 15:00:10.555755 7ffff36e7700 10 monclient: _send_mon_message to mon.cs28 at 172.17.131.28:6789/0 -15> 2013-05-17 15:00:10.555781 7ffff36e7700 5 mds.0.6 handle_mds_failure for myself; not doing anything -14> 2013-05-17 15:00:10.561172 7ffff36e7700 2 mds.0.6 boot_start 1: opening inotable -13> 2013-05-17 15:00:10.561302 7ffff36e7700 2 mds.0.6 boot_start 1: opening sessionmap -12> 2013-05-17 15:00:10.561392 7ffff36e7700 2 mds.0.6 boot_start 1: opening anchor table -11> 2013-05-17 15:00:10.561464 7ffff36e7700 2 mds.0.6 boot_start 1: opening snap table -10> 2013-05-17 15:00:10.561553 7ffff36e7700 2 mds.0.6 boot_start 1: opening mds log -9> 2013-05-17 15:00:10.561567 7ffff36e7700 5 mds.0.log open discovering log bounds -8> 2013-05-17 15:00:10.561577 7ffff36e7700 1 mds.0.journaler(ro) recover start -7> 2013-05-17 15:00:10.561590 7ffff36e7700 1 mds.0.journaler(ro) read_head -6> 2013-05-17 15:00:10.561687 7ffff36e7700 10 monclient: handle_subscribe_ack sent 2013-05-17 15:00:10.555752 renew after 2013-05-17 15:02:40.555752 -5> 2013-05-17 15:00:10.562880 7ffff36e7700 5 mds.0.6 ms_handle_connect on 172.17.131.41:6801/48491 -4> 2013-05-17 15:00:10.563008 7ffff36e7700 5 mds.0.6 ms_handle_connect on 172.17.131.36:6800/46335 -3> 2013-05-17 15:00:10.563137 7ffff36e7700 5 mds.0.6 ms_handle_connect on 172.17.131.37:6820/50518 -2> 2013-05-17 15:00:10.563157 7ffff36e7700 5 mds.0.6 ms_handle_connect on 172.17.131.34:6822/58576 -1> 2013-05-17 15:00:10.563444 7ffff36e7700 5 mds.0.6 ms_handle_connect on 172.17.131.39:6806/48529 0> 2013-05-17 15:00:10.665398 7ffff36e7700 -1 *** Caught signal (Aborted) ** in thread 7ffff36e7700 ceph version 0.62-190-g2265d88 (2265d88dbc5bbafa097b8fc8fb8895493d572fef) 1: /usr/bin/ceph-mds() [0x85ede9] 2: (()+0xf4a0) [0x7ffff76584a0] 3: (gsignal()+0x35) [0x7ffff6bf2885] 4: (abort()+0x175) [0x7ffff6bf4065] 5: (__gnu_cxx::__verbose_terminate_handler()+0x12d) [0x3be84bea7d] NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. --- logging levels --- 0/ 5 none 0/ 1 lockdep 0/ 1 context 1/ 1 crush 1/ 5 mds 1/ 5 mds_balancer 1/ 5 mds_locker 1/ 5 mds_log 1/ 5 mds_log_expire 1/ 5 mds_migrator 0/ 1 buffer 0/ 1 timer 0/ 1 filer 0/ 1 striper 0/ 1 objecter 0/ 5 rados 0/ 5 rbd 0/ 5 journaler 0/ 5 objectcacher 0/ 5 client 0/ 5 osd 0/ 5 optracker 0/ 5 objclass 1/ 3 filestore 1/ 3 journal 0/ 0 ms 1/ 5 mon 0/10 monc 0/ 5 paxos 0/ 5 tp 1/ 5 auth 1/ 5 crypto 1/ 1 finisher 1/ 5 heartbeatmap 1/ 5 perfcounter 1/ 5 rgw 1/ 5 hadoop 1/ 5 javaclient 1/ 5 asok 1/ 1 throttle -2/-2 (syslog threshold) -1/-1 (stderr threshold) max_recent 10000 max_new 1000 log_file /var/ceph/log/mds.cs28.log --- end dump of recent events ---
gdb had this to say:
Core was generated by `/usr/bin/ceph-mds -i cs28 --pid-file /var/run/ceph/mds.cs28.pid -c /etc/ceph/ce'. Program terminated with signal 6, Aborted. #0 0x00007ffff765836b in raise (sig=6) at ../nptl/sysdeps/unix/sysv/linux/pt-raise.c:42 warning: Source file is more recent than executable. 42 sig); Missing separate debuginfos, use: debuginfo-install gperftools-libs-2.0-3.el6.2.x86_64 libgcc-4.4.6-3.el6.x86_64 libstdc++-4.4.6-3.el6.x86_64 libuuid-2.17.2-12.4.el6.x86_64 nspr-4.9-1.el6.x86_64 nss-3.13.3-8.el6.x86_64 nss-softokn-3.12.9-11.el6.x86_64 nss-softokn-freebl-3.12.9-11.el6.x86_64 nss-util-3.13.3-2.el6.x86_64 sqlite-3.6.20-1.el6.x86_64 (gdb) bt #0 0x00007ffff765836b in raise (sig=6) at ../nptl/sysdeps/unix/sysv/linux/pt-raise.c:42 #1 0x000000000085f217 in reraise_fatal (signum=6) at global/signal_handler.cc:58 #2 handle_fatal_signal (signum=6) at global/signal_handler.cc:104 #3 <signal handler called> #4 0x00007ffff6bf2885 in raise (sig=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:64 #5 0x00007ffff6bf4065 in abort () at abort.c:92 #6 0x0000003be84bea7d in __gnu_cxx::__verbose_terminate_handler() () from /usr/lib64/libstdc++.so.6 #7 0x0000003be84bcc06 in ?? () from /usr/lib64/libstdc++.so.6 #8 0x0000003be84bcc33 in std::terminate() () from /usr/lib64/libstdc++.so.6 #9 0x0000003be84bcd2e in __cxa_throw () from /usr/lib64/libstdc++.so.6 #10 0x00000000007f408e in copy_out (this=0x7ffff36e5ff0, len=8, dest=0x7ffff36e5e80 "\300\201W\001") at ./include/buffer.h:193 #11 ceph::buffer::list::iterator::copy (this=0x7ffff36e5ff0, len=8, dest=0x7ffff36e5e80 "\300\201W\001") at common/buffer.cc:526 #12 0x00000000006eda5d in decode_raw<__le64> (this=0x1500f58, p=...) at ./include/encoding.h:57 #13 decode (this=0x1500f58, p=...) at ./include/encoding.h:100 #14 SessionMap::decode (this=0x1500f58, p=...) at mds/SessionMap.cc:169 #15 0x00000000006ef751 in SessionMap::_load_finish (this=0x1500f58, r=<value optimized out>, bl=<value optimized out>) at mds/SessionMap.cc:81 #16 0x000000000071e4d8 in Objecter::handle_osd_op_reply (this=0x1568000, m=0x1451e00) at osdc/Objecter.cc:1482 #17 0x00000000004caf07 in MDS::handle_core_message (this=0x1500b00, m=0x1451e00) at mds/MDS.cc:1733 #18 0x00000000004cafbf in MDS::_dispatch (this=0x1500b00, m=0x1451e00) at mds/MDS.cc:1857 #19 0x00000000004ccaeb in MDS::ms_dispatch (this=0x1500b00, m=0x1451e00) at mds/MDS.cc:1668 #20 0x000000000083da11 in ms_deliver_dispatch (this=0x15000e8) at msg/Messenger.h:566 #21 DispatchQueue::entry (this=0x15000e8) at msg/DispatchQueue.cc:119 #22 0x000000000079a07d in DispatchQueue::DispatchThread::entry (this=<value optimized out>) at msg/DispatchQueue.h:104 #23 0x00007ffff76507f1 in start_thread (arg=0x7ffff36e7700) at pthread_create.c:301 #24 0x00007ffff6ca5ccd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115 (gdb) f 10 #10 0x00000000007f408e in copy_out (this=0x7ffff36e5ff0, len=8, dest=0x7ffff36e5e80 "\300\201W\001") at ./include/buffer.h:193 193 throw end_of_buffer(); (gdb) l 188 int raw_nref() const; 189 190 void copy_out(unsigned o, unsigned l, char *dest) const { 191 assert(_raw); 192 if (!((o <= _len) && (o+l <= _len))) 193 throw end_of_buffer(); 194 memcpy(dest, c_str()+o, l); 195 } 196 197 unsigned wasted(); (gdb) f 11 #11 ceph::buffer::list::iterator::copy (this=0x7ffff36e5ff0, len=8, dest=0x7ffff36e5e80 "\300\201W\001") at common/buffer.cc:526 526 p->copy_out(p_off, howmuch, dest); (gdb) l 521 throw end_of_buffer(); 522 assert(p->length() > 0); 523 524 unsigned howmuch = p->length() - p_off; 525 if (len < howmuch) howmuch = len; 526 p->copy_out(p_off, howmuch, dest); 527 dest += howmuch; 528 529 len -= howmuch; 530 advance(howmuch); (gdb) p p_off $1 = 0 (gdb) p dest $2 = 0x7ffff36e5e80 "\300\201W\001" (gdb) p *dest $3 = -64 '\300' (gdb) f 14 #14 SessionMap::decode (this=0x1500f58, p=...) at mds/SessionMap.cc:169 169 ::decode(pre, p); (gdb) l 164 165 void SessionMap::decode(bufferlist::iterator& p) 166 { 167 utime_t now = ceph_clock_now(g_ceph_context); 168 uint64_t pre; 169 ::decode(pre, p); 170 if (pre == (uint64_t)-1) { 171 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, p); 172 assert(struct_v >= 2); 173 (gdb) p p $4 = (ceph::buffer::list::iterator &) @0x7ffff36e5ff0: {bl = 0x14c7810, ls = 0x14c7810, off = 0, p = {_raw = , _off = 0, _len = 0}, p_off = 0} (gdb) f 15 #15 0x00000000006ef751 in SessionMap::_load_finish (this=0x1500f58, r=<value optimized out>, bl=<value optimized out>) at mds/SessionMap.cc:81 81 decode(blp); // note: this sets last_cap_renew = now() (gdb) l 76 77 void SessionMap::_load_finish(int r, bufferlist &bl) 78 { 79 bufferlist::iterator blp = bl.begin(); 80 dump(); 81 decode(blp); // note: this sets last_cap_renew = now() 82 dout(10) << "_load_finish v " << version 83 << ", " << session_map.size() << " sessions, " 84 << bl.length() << " bytes" 85 << dendl; (gdb) f 16 #16 0x000000000071e4d8 in Objecter::handle_osd_op_reply (this=0x1568000, m=0x1451e00) at osdc/Objecter.cc:1482 1482 onack->finish(rc); (gdb) l 1477 1478 ldout(cct, 5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << dendl; 1479 1480 // do callbacks 1481 if (onack) { 1482 onack->finish(rc); 1483 delete onack; 1484 } 1485 if (oncommit) { 1486 oncommit->finish(rc); (gdb) f 17 #17 0x00000000004caf07 in MDS::handle_core_message (this=0x1500b00, m=0x1451e00) at mds/MDS.cc:1733 1733 objecter->handle_osd_op_reply((class MOSDOpReply*)m); (gdb) l 1728 break; 1729 1730 // OSD 1731 case CEPH_MSG_OSD_OPREPLY: 1732 ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_OSD); 1733 objecter->handle_osd_op_reply((class MOSDOpReply*)m); 1734 break; 1735 case CEPH_MSG_OSD_MAP: 1736 ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD); 1737 objecter->handle_osd_map((MOSDMap*)m); (gdb)
Actions