Backport #21079
Updated by Abhishek Lekshmanan over 6 years ago
https://github.com/ceph/ceph/pull/17198

There is a bug in the function OSDMonitor::reweight_by_utilization:

<pre><code class="cpp">
for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p = pgm.pg_stat.begin();
     p != pgm.pg_stat.end();
     ++p) {
  if (pools && pools->count(p->first.pool()) == 0)
    continue;
  for (vector<int>::const_iterator q = p->second.acting.begin();
       q != p->second.acting.end();
       ++q) {
    if (*q >= (int)pgs_by_osd.size())
      pgs_by_osd.resize(*q);
    if (pgs_by_osd[*q] == 0) {
      if (osdmap.crush->get_item_weightf(*q) <= 0) {
        //skip if we currently can not identify item
        continue;
      }
      weight_sum += osdmap.crush->get_item_weightf(*q);
      ++num_osds;
    }
</code></pre>

If the acting set contains the item CRUSH_ITEM_NONE, *q will be 0x7fffffff, so pgs_by_osd.resize(*q) tries to allocate a huge amount of memory (about 8 GiB, as the tcmalloc message below shows). The allocation fails with std::bad_alloc and the monitor aborts:

tcmalloc: *large alloc 8589934592 bytes == (nil)* @ 0x7f5cc3a0b36c 0x7f5cc3a2abd8 0x7f5cc4c87eb7 0x7f5cc4ca8165 0x7f5cc4ca8974 0x7f5cc4ca9024 0x7f5cc4c5b428 0x7f5cc4c16c3d 0x7f5cc4c1a2cb 0x7f5cc4c1b479 0x7f5cc4c1cd35 0x7f5cc4c1ab22 0x7f5cc4c1b479 0x7f5cc4c41013 0x7f5cc4fbe502 0x7f5cc4e72dcd 0x7f5cc4ef8a25 0x7f5cc2189dc5 0x7f5cc169821d (nil) terminate called after throwing an instance of 'std::bad_alloc' what(): std::bad_alloc *** Caught signal (Aborted) ** in thread 7f5cb5c25700 thread_name:ms_dispatch ceph version 11.2.0 (f223e27eeb35991352ebc1f67423d4ebc252adb7) 1: (()+0x6fe81a) [0x7f5cc500181a] 2: (()+0xf100) [0x7f5cc2191100] 3: (gsignal()+0x37) [0x7f5cc15d75f7] 4: (abort()+0x148) [0x7f5cc15d8ce8] 5: (__gnu_cxx::__verbose_terminate_handler()+0x165) [0x7f5cc1edb9b5] 6: (()+0x5e926) [0x7f5cc1ed9926] 7: (()+0x5e953) [0x7f5cc1ed9953] 8: (()+0x5eb73) [0x7f5cc1ed9b73] 9: (()+0x18437) [0x7f5cc3a0b437] 10: (tc_new()+0x18) [0x7f5cc3a2abd8] 11: (OSDMonitor::reweight_by_utilization(int, double, int, bool, std::set<long, std::less<long>, std::allocator<long> > const*, bool, bool, std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >*, std::string*, ceph::Formatter*)+0x4f7) 
[0x7f5cc4c87eb7] 12: (OSDMonitor::prepare_command_impl(std::shared_ptr<MonOpRequest>, std::map<std::string, boost::variant<std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > >, std::less<std::string>, std::allocator<std::pair<std::string const, std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > > > >&)+0x12d95) [0x7f5cc4ca8165] 13: (OSDMonitor::prepare_command(std::shared_ptr<MonOpRequest>)+0x414) [0x7f5cc4ca8974] 14: (OSDMonitor::prepare_update(std::shared_ptr<MonOpRequest>)+0x394) [0x7f5cc4ca9024] 15: (PaxosService::dispatch(std::shared_ptr<MonOpRequest>)+0xe38) [0x7f5cc4c5b428] 16: (Monitor::handle_command(std::shared_ptr<MonOpRequest>)+0x1d2d) [0x7f5cc4c16c3d] 17: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0x31b) [0x7f5cc4c1a2cb] 18: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 19: (Monitor::handle_forward(std::shared_ptr<MonOpRequest>)+0xb65) [0x7f5cc4c1cd35] 20: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0xb72) [0x7f5cc4c1ab22] 21: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 22: (Monitor::ms_dispatch(Message*)+0x23) [0x7f5cc4c41013] 23: (DispatchQueue::entry()+0x6b2) [0x7f5cc4fbe502] 24: (DispatchQueue::DispatchThread::entry()+0xd) [0x7f5cc4e72dcd] 25: (Thread::entry_wrapper()+0x75) [0x7f5cc4ef8a25] 26: (()+0x7dc5) [0x7f5cc2189dc5] 27: (clone()+0x6d) [0x7f5cc169821d] 2017-08-10 14:53:39.286961 7f5cb5c25700 -1 *** Caught signal (Aborted) ** in thread 7f5cb5c25700 thread_name:ms_dispatch ceph version 11.2.0 (f223e27eeb35991352ebc1f67423d4ebc252adb7) 1: (()+0x6fe81a) [0x7f5cc500181a] 2: (()+0xf100) [0x7f5cc2191100] 3: (gsignal()+0x37) [0x7f5cc15d75f7] 4: (abort()+0x148) [0x7f5cc15d8ce8] 5: (__gnu_cxx::__verbose_terminate_handler()+0x165) [0x7f5cc1edb9b5] 6: (()+0x5e926) [0x7f5cc1ed9926] 7: (()+0x5e953) [0x7f5cc1ed9953] 8: (()+0x5eb73) [0x7f5cc1ed9b73] 9: 
(()+0x18437) [0x7f5cc3a0b437] 10: (tc_new()+0x18) [0x7f5cc3a2abd8] 11: (OSDMonitor::reweight_by_utilization(int, double, int, bool, std::set<long, std::less<long>, std::allocator<long> > const*, bool, bool, std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >*, std::string*, ceph::Formatter*)+0x4f7) [0x7f5cc4c87eb7] 12: (OSDMonitor::prepare_command_impl(std::shared_ptr<MonOpRequest>, std::map<std::string, boost::variant<std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > >, std::less<std::string>, std::allocator<std::pair<std::string const, std::string<bool, long, double, std::vector<std::string, std::allocator<std::string> >, std::vector<long, std::allocator<long> > > > > >&)+0x12d95) [0x7f5cc4ca8165] 13: (OSDMonitor::prepare_command(std::shared_ptr<MonOpRequest>)+0x414) [0x7f5cc4ca8974] 14: (OSDMonitor::prepare_update(std::shared_ptr<MonOpRequest>)+0x394) [0x7f5cc4ca9024] 15: (PaxosService::dispatch(std::shared_ptr<MonOpRequest>)+0xe38) [0x7f5cc4c5b428] 16: (Monitor::handle_command(std::shared_ptr<MonOpRequest>)+0x1d2d) [0x7f5cc4c16c3d] 17: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0x31b) [0x7f5cc4c1a2cb] 18: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 19: (Monitor::handle_forward(std::shared_ptr<MonOpRequest>)+0xb65) [0x7f5cc4c1cd35] 20: (Monitor::dispatch_op(std::shared_ptr<MonOpRequest>)+0xb72) [0x7f5cc4c1ab22] 21: (Monitor::_ms_dispatch(Message*)+0x519) [0x7f5cc4c1b479] 22: (Monitor::ms_dispatch(Message*)+0x23) [0x7f5cc4c41013] 23: (DispatchQueue::entry()+0x6b2) [0x7f5cc4fbe502] 24: (DispatchQueue::DispatchThread::entry()+0xd) [0x7f5cc4e72dcd] 25: (Thread::entry_wrapper()+0x75) [0x7f5cc4ef8a25] 26: (()+0x7dc5) [0x7f5cc2189dc5] 27: (clone()+0x6d) [0x7f5cc169821d]