Project

General

Profile

Bug #48386

Updated by Brad Hubbard over 3 years ago

<pre> 
    -40> 2020-11-01 10:02:17.190 7f1b60500700    1 mon.a@0(leader) e3 shutdown <========================**NOTE** 
    -39> 2020-11-01 10:02:17.215 7f1b584f0700    4 set_mon_vals no callback set 
    -38> 2020-11-01 10:02:17.216 7f1b584f0700    4 mgrc handle_mgr_map Got map version 522 
    -37> 2020-11-01 10:02:17.216 7f1b584f0700    4 mgrc handle_mgr_map Active mgr is now [v2:XXX.XXX.XXX.XXX:6800/1,v1:XXX.XXX.XXX.XXX:6801/1] 
    -36> 2020-11-01 10:02:17.219 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd36a60f0 0x559bd66d4000 log(last 21629202) v1 
    -35> 2020-11-01 10:02:17.219 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd5924960 0x559bd049af00 log(last 22) v1 
    -34> 2020-11-01 10:02:17.219 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd3782960 0x559bd66d41e0 log(last 23) v1 
    -33> 2020-11-01 10:02:17.219 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd3782780 0x559bd66d5e00 log(last 2464) v1 
    -32> 2020-11-01 10:02:17.219 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd37822d0 0x559bd66d5c20 log(last 24) v1 
    -31> 2020-11-01 10:02:17.219 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd373e870 0x559bd66d45a0 log(last 4568) v1 
    -30> 2020-11-01 10:02:17.219 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd373e4b0 0x559bd66d4960 log(last 25) v1 
    -29> 2020-11-01 10:02:17.221 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd373e1e0 0x559bd66d43c0 log(last 4569) v1 
    -28> 2020-11-01 10:02:17.221 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd372fe00 0x559bd66d4780 log(last 111639138) v1 
    -27> 2020-11-01 10:02:17.221 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd42ebb30 0x559bd66d52c0 log(last 111640138) v1 
    -26> 2020-11-01 10:02:17.221 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd428db30 0x559bd66d50e0 log(last 21630202) v1 
    -25> 2020-11-01 10:02:17.221 7f1b584f0700    2 mon.a@0(leader) e3 send_reply 0x559bd4773590 0x559bd66d4f00 log(last 21631202) v1 
    -24> 2020-11-01 10:02:17.222 7f1b584f0700    5 mon.a@0(leader).paxos(paxos active c 1332309..1332941) is_readable = 1 - now=2020-11-01 10:02:17.223960 lease_expire=2020-11-01 10:02:22.218641 has v0 lc 1332941 
    -23> 2020-11-01 10:02:17.222 7f1b584f0700    5 mon.a@0(leader).paxos(paxos active c 1332309..1332941) is_readable = 1 - now=2020-11-01 10:02:17.224018 lease_expire=2020-11-01 10:02:22.218641 has v0 lc 1332941 
    -22> 2020-11-01 10:02:17.222 7f1b584f0700    5 mon.a@0(leader).paxos(paxos active c 1332309..1332941) is_readable = 1 - now=2020-11-01 10:02:17.224084 lease_expire=2020-11-01 10:02:22.218641 has v0 lc 1332941 
    -21> 2020-11-01 10:02:17.222 7f1b584f0700    5 mon.a@0(leader).paxos(paxos active c 1332309..1332941) is_readable = 1 - now=2020-11-01 10:02:17.224113 lease_expire=2020-11-01 10:02:22.218641 has v0 lc 1332941 
    -20> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands add_bootstrap_peer_hint 
    -19> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands add_bootstrap_peer_hintv 
    -18> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands dump_historic_ops 
    -17> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands dump_historic_ops_by_duration 
    -16> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands dump_historic_slow_ops 
    -15> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands mon_status 
    -14> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands ops 
    -13> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands quorum enter 
    -12> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands quorum exit 
    -11> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands quorum_status 
    -10> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands sessions 
     -9> 2020-11-01 10:02:17.223 7f1b60500700    5 asok(0x559bd04fa000) unregister_commands sync_force 
     -8> 2020-11-01 10:02:17.224 7f1b60500700    2 mon.a@0(shutdown) e3 send_reply 0x559bd57013b0 0x559bd544c000 log(last 26) v1 
     -7> 2020-11-01 10:02:17.224 7f1b60500700    2 mon.a@0(shutdown) e3 send_reply 0x559bd3658a50 0x559bd544c1e0 log(last 27) v1 
     -6> 2020-11-01 10:02:17.224 7f1b60500700    2 mon.a@0(shutdown) e3 send_reply 0x559bd4318f00 0x559bd544c3c0 log(last 28) v1 
     -5> 2020-11-01 10:02:17.224 7f1b60500700    2 mon.a@0(shutdown) e3 send_reply 0x559bd4318690 0x559bd544c5a0 log(last 29) v1 
     -4> 2020-11-01 10:02:17.224 7f1b5b4f6700    1 Monitor::cpu_tp worker finish 
     -3> 2020-11-01 10:02:17.236 7f1b5bcf7700    1 Monitor::cpu_tp worker finish 
     -2> 2020-11-01 10:02:17.236 7f1b5acf5700    1 Monitor::cpu_tp worker finish 
     -1> 2020-11-01 10:02:17.236 7f1b5a4f4700    1 Monitor::cpu_tp worker finish 
      0> 2020-11-01 10:02:17.238 7f1b59cf3700 -1 *** Caught signal (Segmentation fault) ** 
  in thread 7f1b59cf3700 thread_name:ms_dispatch 

  ceph version 14.2.8-91.el8cp (75b4845da7d469665bd48d1a49badcc3677bf5cd) nautilus (stable) 
  1: (()+0x12dd0) [0x7f1b664dfdd0] 
  2: (PerfCounters::inc(int, unsigned long)+0x7) [0x7f1b699eb807] 
  3: (Paxos::restart()+0x2b9) [0x559bcdec7a89] 
  4: (Monitor::_reset()+0x268) [0x559bcddcacd8] 
  5: (Monitor::join_election()+0xdb) [0x559bcddcae6b] 
  6: (Elector::bump_epoch(unsigned int)+0x158) [0x559bcde63e68] 
  7: (Elector::handle_propose(boost::intrusive_ptr<MonOpRequest>)+0x988) [0x559bcde64fc8] 
  8: (Elector::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x96e) [0x559bcde66fbe] 
  9: (Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x11c6) [0x559bcdde60d6] 
  10: (Monitor::_ms_dispatch(Message*)+0xa23) [0x559bcdde6e53] 
 </pre> 

 <pre> 
    1383 void Paxos::restart() 
    1384 { 
    1385     dout(10) << "restart -- canceling timeouts" << dendl; 
    1386     cancel_events(); 
    1387     new_value.clear(); 
    1388  
    1389     if (is_writing() || is_writing_previous()) { 
    1390       dout(10) << __func__ << " flushing" << dendl; 
    1391       mon->lock.Unlock(); 
    1392       mon->store->flush(); 
    1393       mon->lock.Lock(); 
    1394       dout(10) << __func__ << " flushed" << dendl; 
    1395     } 
    1396     state = STATE_RECOVERING; 
    1397  
    1398     // discard pending transaction 
    1399     pending_proposal.reset(); 
    1400  
    1401     reset_pending_committing_finishers(); 
    1402     finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN); 
    1403  
    1404     logger->inc(l_paxos_restart); 
    1405 } 
 </pre> 

 <pre> 
     972 void Monitor::shutdown() 
     973 { 
 ... 
     1058     // clean up 
     1059     paxos->shutdown(); 
 </pre> 

 <pre> 
    1313 void Paxos::shutdown() 
    1314 { 
 ... 
    1333 1031     if (logger) { 
    1334 1032       g_ceph_context->get_perfcounters_collection()->remove(logger); cct->get_perfcounters_collection()->remove(logger); 
    1335     1033       delete logger; 
    1336 1034       logger = NULL; 
    1035     } 
 </pre> 

 So we've already deleted 'logger' but we have raced because the epoch has changed 
 and we are resetting the monitor (and restarting Paxos) at the same time. 

Back