Project

General

Profile

Backport #21079

Updated by Abhishek Lekshmanan over 6 years ago

https://github.com/ceph/ceph/pull/17198 There is a bug in the function OSDMonitor::reweight_by_utilization:

 <pre><code class="cpp"> 
	     for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p = 
	    pgm.pg_stat.begin(); 
	  p != pgm.pg_stat.end(); 
	  ++p) { 
       if (pools && pools->count(p->first.pool()) == 0) 
	 continue; 
       for (vector<int>::const_iterator q = p->second.acting.begin(); 
	    q != p->second.acting.end(); 
	    ++q) { 
	 if (*q >= (int)pgs_by_osd.size()) 
	   pgs_by_osd.resize(*q); 
	 if (pgs_by_osd[*q] == 0) { 
           if (osdmap.crush->get_item_weightf(*q) <= 0) { 
             //skip if we currently can not identify item 
             continue; 
           } 
	   weight_sum += osdmap.crush->get_item_weightf(*q); 
	   ++num_osds; 
	 } 
 </code></pre> 

 If the acting set contains the item CRUSH_ITEM_NONE, *q will be 0x7fffffff.

 pgs_by_osd.resize(*q) will then try to allocate a huge amount of memory, and the monitor aborts with std::bad_alloc:

 tcmalloc: *large alloc 8589934592 bytes == (nil)* @    0x7f5cc3a0b36c 0x7f5cc3a2abd8 0x7f5cc4c87eb7 0x7f5cc4ca8165 0x7f5cc4ca8974 0x7f5cc4ca9024 0x7f5cc4c5b428 0x7f5cc4c16c3d 0x7f5cc4c1a2cb 0x7f5cc4c1b479 0x7f5cc4c1cd35 0x7f5cc4c1ab22 0x7f5cc4c1b479 0x7f5cc4c41013 0x7f5cc4fbe502 0x7f5cc4e72dcd 0x7f5cc4ef8a25 0x7f5cc2189dc5 0x7f5cc169821d (nil) 
 terminate called after throwing an instance of 'std::bad_alloc' 
   what():    std::bad_alloc 
 *** Caught signal (Aborted) ** 
  in thread 7f5cb5c25700 thread_name:ms_dispatch 
  ceph version 11.2.0 (f223e27eeb35991352ebc1f67423d4ebc252adb7) 
  1: (()+0x6fe81a) [0x7f5cc500181a] 
  2: (()+0xf100) [0x7f5cc2191100] 
  3: (gsignal()+0x37) [0x7f5cc15d75f7] 
  4: (abort()+0x148) [0x7f5cc15d8ce8] 
  5: (__gnu_cxx::__verbose_terminate_handler()+0x165) [0x7f5cc1edb9b5] 
  6: (()+0x5e926) [0x7f5cc1ed9926] 
  7: (()+0x5e953) [0x7f5cc1ed9953] 
  8: (()+0x5eb73) [0x7f5cc1ed9b73] 
  9: (()+0x18437) [0x7f5cc3a0b437] 
  10: (tc_new()+0x18) [0x7f5cc3a2abd8] 
  11: (OSDMonitor::reweight_by_utilization(int, double, int, bool, std::set<long, std::less<long>, std::allocator<long> > const*, bool, bool, std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >*, std::string*, ceph::Formatter*)+0x4f7) [0x7f5cc4c87eb7] 
  12: (OSDMonitor::prepare_command_impl(std::shared_ptr<MonOpRequest>, std::map<std::string, boost::variant<std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > >, std::less<std::string>, std::allocator<std::pair<std::string const, std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > > > >&)+0x12d95) [0x7f5cc4ca8165] 
  13: (OSDMonitor::prepare_command(std::shared_ptr<MonOpRequest>)+0x414) [0x7f5cc4ca8974] 
  14: (OSDMonitor::prepare_update(std::shared_ptr<MonOpRequest>)+0x394) [0x7f5cc4ca9024] 
  15: (PaxosService::dispatch(std::shared_ptr<MonOpRequest>)+0xe38) [0x7f5cc4c5b428] 
  16: (Monitor::handle_command(std::shared_ptr<MonOpRequest>)+0x1d2d) [0x7f5cc4c16c3d] 
  17: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0x31b) [0x7f5cc4c1a2cb] 
  18: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 
  19: (Monitor::handle_forward(std::shared_ptr<MonOpRequest>)+0xb65) [0x7f5cc4c1cd35] 
  20: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0xb72) [0x7f5cc4c1ab22] 
  21: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 
  22: (Monitor::ms_dispatch(Message*)+0x23) [0x7f5cc4c41013] 
  23: (DispatchQueue::entry()+0x6b2) [0x7f5cc4fbe502] 
  24: (DispatchQueue::DispatchThread::entry()+0xd) [0x7f5cc4e72dcd] 
  25: (Thread::entry_wrapper()+0x75) [0x7f5cc4ef8a25] 
  26: (()+0x7dc5) [0x7f5cc2189dc5] 
  27: (clone()+0x6d) [0x7f5cc169821d] 
 2017-08-10 14:53:39.286961 7f5cb5c25700 -1 *** Caught signal (Aborted) ** 
  in thread 7f5cb5c25700 thread_name:ms_dispatch 

  ceph version 11.2.0 (f223e27eeb35991352ebc1f67423d4ebc252adb7) 
  1: (()+0x6fe81a) [0x7f5cc500181a] 
  2: (()+0xf100) [0x7f5cc2191100] 
  3: (gsignal()+0x37) [0x7f5cc15d75f7] 
  4: (abort()+0x148) [0x7f5cc15d8ce8] 
  5: (__gnu_cxx::__verbose_terminate_handler()+0x165) [0x7f5cc1edb9b5] 
  6: (()+0x5e926) [0x7f5cc1ed9926] 
  7: (()+0x5e953) [0x7f5cc1ed9953] 
  8: (()+0x5eb73) [0x7f5cc1ed9b73] 
  9: (()+0x18437) [0x7f5cc3a0b437] 
  10: (tc_new()+0x18) [0x7f5cc3a2abd8] 
  11: (OSDMonitor::reweight_by_utilization(int, double, int, bool, std::set<long, std::less<long>, std::allocator<long> > const*, bool, bool, std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >*, std::string*, ceph::Formatter*)+0x4f7) [0x7f5cc4c87eb7] 
  12: (OSDMonitor::prepare_command_impl(std::shared_ptr<MonOpRequest>, std::map<std::string, boost::variant<std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > >, std::less<std::string>, std::allocator<std::pair<std::string const, std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > > > >&)+0x12d95) [0x7f5cc4ca8165] 
  13: (OSDMonitor::prepare_command(std::shared_ptr<MonOpRequest>)+0x414) [0x7f5cc4ca8974] 
  14: (OSDMonitor::prepare_update(std::shared_ptr<MonOpRequest>)+0x394) [0x7f5cc4ca9024] 
  15: (PaxosService::dispatch(std::shared_ptr<MonOpRequest>)+0xe38) [0x7f5cc4c5b428] 
  16: (Monitor::handle_command(std::shared_ptr<MonOpRequest>)+0x1d2d) [0x7f5cc4c16c3d] 
  17: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0x31b) [0x7f5cc4c1a2cb] 
  18: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 
  19: (Monitor::handle_forward(std::shared_ptr<MonOpRequest>)+0xb65) [0x7f5cc4c1cd35] 
  20: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0xb72) [0x7f5cc4c1ab22] 
  21: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 
  22: (Monitor::ms_dispatch(Message*)+0x23) [0x7f5cc4c41013] 
  23: (DispatchQueue::entry()+0x6b2) [0x7f5cc4fbe502] 
  24: (DispatchQueue::DispatchThread::entry()+0xd) [0x7f5cc4e72dcd] 
  25: (Thread::entry_wrapper()+0x75) [0x7f5cc4ef8a25] 
  26: (()+0x7dc5) [0x7f5cc2189dc5] 
  27: (clone()+0x6d) [0x7f5cc169821d] 

Back