Bug #63257
openSnapmapper filestore failure in upgrade/quincy-p2p
0%
Description
Description: upgrade:quincy-p2p/quincy-p2p-stress-split/{0-cluster/{openstack start} 1-ceph-install/quincy 1.1.short_pg_log 2-partial-upgrade/firsthalf 3-thrash/default 4-workload/{fsx radosbench rbd-cls rbd-import-export rbd_api readwrite snaps-few-objects} 5-finish-upgrade 6-final-workload/{rbd-python snaps-many-objects} objectstore/filestore-xfs supported-all-distro/ubuntu_latest thrashosds-health}
/a/yuriw-2023-10-15_01:48:01-upgrade:quincy-p2p-quincy-release-testing-default-smithi/7427978
{
"crash_id": "2023-10-15T04:44:19.928918Z_448f914f-aef4-42a6-9943-2aa415926732",
"timestamp": "2023-10-15T04:44:19.928918Z",
"process_name": "ceph-osd",
"entity_name": "osd.0",
"ceph_version": "17.2.6-1415-g9d6d32bb",
"utsname_hostname": "smithi033",
"utsname_sysname": "Linux",
"utsname_release": "6.6.0-rc5-g47672119149b",
"utsname_version": "#1 SMP PREEMPT_DYNAMIC Tue Oct 10 20:03:56 UTC 2023",
"utsname_machine": "x86_64",
"os_name": "Ubuntu",
"os_id": "ubuntu",
"os_version_id": "20.04",
"os_version": "20.04.4 LTS (Focal Fossa)",
"backtrace": [
"/lib/x86_64-linux-gnu/libpthread.so.0(+0x14420) [0x7f7198f81420]",
"(rocksdb::Status::Status(rocksdb::Status const&)+0x3f) [0x55cb854bf3ef]",
"(rocksdb::BlockIter<rocksdb::Slice>::status() const+0x2e) [0x55cb857bb0ac]",
"(rocksdb::BlockBasedTableIterator::status() const+0x132) [0x55cb85970fb8]",
"(rocksdb::IteratorWrapperBase<rocksdb::Slice>::status() const+0x72) [0x55cb855b1eac]",
"ceph-osd(+0x1b474a6) [0x55cb856684a6]",
"(rocksdb::IteratorWrapperBase<rocksdb::Slice>::Next()+0xb4) [0x55cb855b2064]",
"(rocksdb::MergingIterator::Next()+0xc2) [0x55cb857f5324]",
"(rocksdb::MergingIterator::NextAndGetResult(rocksdb::IterateResult*)+0x2c) [0x55cb857f547c]",
"(rocksdb::IteratorWrapperBase<rocksdb::Slice>::Next()+0x75) [0x55cb855b2025]",
"(rocksdb::DBIter::FindNextUserEntryInternal(bool, rocksdb::Slice const*)+0x12eb) [0x55cb855a8ef1]",
"(rocksdb::DBIter::FindNextUserEntry(bool, rocksdb::Slice const*)+0x83) [0x55cb855a7bbb]",
"(rocksdb::DBIter::Seek(rocksdb::Slice const&)+0x3d2) [0x55cb855ad31c]",
"(rocksdb::ArenaWrappedDBIter::Seek(rocksdb::Slice const&)+0x2b) [0x55cb858b4139]",
"(RocksDBStore::RocksDBWholeSpaceIteratorImpl::lower_bound(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0xb7) [0x55cb8545fb27]",
"(DBObjectMap::DBObjectMapIteratorImpl::lower_bound(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x6a) [0x55cb84eeb1ba]",
"(DBObjectMap::scan(std::shared_ptr<DBObjectMap::_Header>, std::set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >*, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > >*)+0x228) [0x55cb84ee2708]",
"(DBObjectMap::get_values(ghobject_t const&, std::set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > >*)+0xd0) [0x55cb84ee3f60]",
"(FileStore::omap_get_values(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, ghobject_t const&, std::set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > >*)+0x16d) [0x55cb84d2abed]",
"(MapCacher::MapCacher<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list>::get_keys(std::set<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, ceph::buffer::v15_2_0::list> > >*)+0x4a8) [0x55cb84ae4248]",
"(SnapMapper::get_snaps_common(hobject_t const&) const+0x145) [0x55cb84ad7d45]",
"(SnapMapper::get_snaps(hobject_t const&, SnapMapper::object_snaps*) const+0x38) [0x55cb84ad87b8]",
"(SnapMapper::_remove_oid(hobject_t const&, MapCacher::Transaction<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list>*)+0x169) [0x55cb84adac79]",
"(SnapMapper::remove_oid(hobject_t const&, MapCacher::Transaction<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, ceph::buffer::v15_2_0::list>*)+0x59) [0x55cb84adb1b9]",
"(PG::do_delete_work(ceph::os::Transaction&, ghobject_t)+0x5cc) [0x55cb8490090c]",
"(PeeringState::Deleting::react(PeeringState::DeleteSome const&)+0x18b) [0x55cb84b5f2fb]",
"(boost::statechart::simple_state<PeeringState::Deleting, PeeringState::ToDelete, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x119) [0x55cb84bcbee9]",
"(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x74) [0x55cb84907304]",
"(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PeeringCtx&)+0x1aa) [0x55cb848f30ba]",
"(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x34d) [0x55cb848671ed]",
"(OSD::dequeue_delete(OSDShard*, PG*, unsigned int, ThreadPool::TPHandle&)+0xc9) [0x55cb84867519]"
]
Updated by Laura Flores 7 months ago
$ git log --pretty=oneline --no-merges tags/v17.2.6..ci/quincy-release src/osd/SnapMapper.cc
53f1440a09098b67897c2be59d2482f77f7fceff osd/scrub: modify SnapMapper.cc to use ceph::buffer::list
f46fec6e79c7fb237ec632cf6301e121c893f76f osd/scrub: improve SnapMapper's API used by the scrubber
Possibly related to https://github.com/ceph/ceph/pull/52256?
Updated by Laura Flores 7 months ago
Seeing if it's easily reproducible: http://pulpito.front.sepia.ceph.com/lflores-2023-10-19_19:56:28-upgrade:quincy-p2p-quincy-release-distro-default-smithi/
Updated by Laura Flores 7 months ago
More tests for additional data:
http://pulpito.front.sepia.ceph.com/lflores-2023-10-19_23:47:28-upgrade:quincy-p2p-quincy-release-distro-default-smithi/
Build w/o PR https://github.com/ceph/ceph/pull/52256:
https://shaman.ceph.com/builds/ceph/wip-quincy-release-without-pr-52256/
This bug reproduced 7/20 times.
Updated by Laura Flores 7 months ago
Scheduled the same filestore tests on a build w/o PR https://github.com/ceph/ceph/pull/52256: http://pulpito.front.sepia.ceph.com/lflores-2023-10-20_16:09:09-upgrade:quincy-p2p-wip-quincy-release-without-pr-52256-distro-default-smithi/
And I scheduled tests on quincy-release with bluestore instead of filestore to see if the problem exists there too:
http://pulpito.front.sepia.ceph.com/lflores-2023-10-20_16:07:04-upgrade:quincy-p2p-quincy-release-distro-default-smithi/
Updated by Laura Flores 7 months ago
Scheduling also on 17.2.6 for comparison:
./teuthology/virtualenv/bin/teuthology-suite -v -m smithi -c quincy -S d7ff0d10654d2280e08f1ab989c7cdf3064446a5 -r yuriw-2023-10-15_01:48:01-upgrade:quincy-p2p-quincy-release-testing-default-smithi -p 59 -N 20 --filter-all "filestore-xfs"
[lflores@fedora ceph]$ git show v17.2.6
tag v17.2.6
Tagger: Ceph Release Team <ceph-maintainers@ceph.io>
Date: Wed Apr 5 15:09:51 2023 +0000
v17.2.6
commit d7ff0d10654d2280e08f1ab989c7cdf3064446a5 (HEAD, tag: v17.2.6, ceph-releases/quincy-release)
Author: Ceph Release Team <ceph-maintainers@ceph.io>
Date: Wed Apr 5 15:09:51 2023 +0000
17.2.6
Signed-off-by: Ceph Release Team <ceph-maintainers@ceph.io>
Updated by Laura Flores 7 months ago
The conclusion from Prashant's and my digging is that this failure has likely existed since before 17.2.6, since two older teuthology jobs also exhibit the crash:
Crash from 17.2.0: /a/lflores-2023-01-19_22:48:47-upgrade:quincy-p2p-quincy-distro-default-smithi/7130422
Crash from 17.2.5: /a/teuthology-2023-04-09_01:35:03-upgrade:quincy-p2p-quincy-distro-default-smithi/7236911
We also suspect that this failure is specific to filestore, as the same tests run with objectstore=bluestore are passing:
http://pulpito.front.sepia.ceph.com/lflores-2023-10-20_16:07:04-upgrade:quincy-p2p-quincy-release-distro-default-smithi/
Moreover, Prashant noted that omap ops on filestore OSDs take more time than on bluestore, which would explain why bluestore appears "unaffected" (the underlying problem may still exist for bluestore, but is masked by its faster omap ops).
Updated by Laura Flores 7 months ago
Set osd_op_thread_timeout=900 and osd_op_thread_suicide_timeout=1800 to see if this increases odds of reproducing the crash.
Used this command to schedule the run:
teuthology-suite -v -m smithi -c quincy-release -r yuriw-2023-10-15_01:48:01-upgrade:quincy-p2p-quincy-release-testing-default-smithi -p 60 -N 20 --filter-all "filestore-xfs" --suite-repo https://github.com/ljflores/ceph.git --suite-branch wip-quincy-increase-timeouts
This is the suite_branch I used: https://github.com/ljflores/ceph/commits/wip-quincy-increase-timeouts
Here is the link to the scheduled run: http://pulpito.front.sepia.ceph.com/lflores-2023-10-23_17:44:43-upgrade:quincy-p2p-quincy-release-distro-default-smithi/
Updated by Matan Breizman 7 months ago
Laura Flores wrote:
Set osd_op_thread_timeout=900 and osd_op_thread_suicide_timeout=1800 to see if this increases odds of reproducing the crash.
Used this command to schedule the run:
[...]This is the suite_branch I used: https://github.com/ljflores/ceph/commits/wip-quincy-increase-timeouts
Here is the link to the scheduled run: http://pulpito.front.sepia.ceph.com/lflores-2023-10-23_17:44:43-upgrade:quincy-p2p-quincy-release-distro-default-smithi/
Increasing the timeout is causing other teuthology timeouts to appear ('reached maximum tries (501) after waiting for 3000 seconds'). Also, if the OSD crash is avoided, there won't be an OSD backtrace to debug.
I scheduled a run with filestore debug logging enabled and without the increased timeouts.
https://github.com/Matan-B/ceph/tree/wip-matanb-quincy-debug-timeouts