Managed to trigger this using the following job:
# Job description used to reproduce the problem.
# NOTE(review): the pasted job had lost all indentation; the nesting below is
# reconstructed from the key order. Keys and values are unchanged.
machine_type: mira
nuke-on-error: false
interactive-on-error: true

# Per-task setting overrides.
overrides:
  ceph:
    conf:
      global:
        ms inject socket failures: 5000
      mon:
        debug mon: 20
        debug ms: 1
        debug paxos: 20
    fs: xfs
    log-whitelist:
    - slow request
    # NOTE(review): assumed to belong to overrides.ceph (it follows the
    # log-whitelist list in the flattened paste) -- confirm.
    branch: master
  install:
    ceph:
      branch: master
  workunit:
    branch: master

# One inner list per target machine: five machines, seven monitors (mon.a-g).
roles:
- - mon.a
  - mon.c
  - osd.0
  - osd.1
  - osd.2
- - mon.b
  - mds.a
  - osd.3
  - osd.4
  - osd.5
  - client.0
- - mon.d
  - mon.e
  - osd.6
  - osd.7
- - osd.8
  - osd.9
  - osd.10
  - mon.f
- - osd.11
  - osd.12
  - osd.13
  - mon.g

# Tasks, listed in order.
tasks:
- chef: null
- clock.check: null
- install: null
- ceph: null
- mon_thrash:
    revive_delay: 20
    thrash_delay: 1
    store-thrash: true
    thrash-many: true
    store-thrash-probability: 100
- rados:
    clients:
    - client.0
    objects: 50
    op_weights:
      delete: 50
      read: 100
      rollback: 50
      snap_create: 50
      snap_remove: 50
      write: 100
    ops: 4000
From gdb, these appear to be the relevant backtraces (trimmed for clarity, full backtrace attached):
Thread 16 (Thread 0x7f94cab3e700 (LWP 18696)):
#0 _M_data (this=0x56f3538) at /usr/include/c++/4.6/bits/basic_string.h:288
No locals.
#1 _M_rep (this=0x56f3538) at /usr/include/c++/4.6/bits/basic_string.h:296
No locals.
#2 size (this=0x56f3538) at /usr/include/c++/4.6/bits/basic_string.h:711
No locals.
#3 compare (__str=..., this=0x56f3538) at /usr/include/c++/4.6/bits/basic_string.h:2175
#4 operator>=<char, std::char_traits<char>, std::allocator<char> > (__rhs=..., __lhs=...)
#5 LevelDBStore::compact_range_async (this=0x2fde500, start=..., end=...) at os/LevelDBStore.cc:243
l = {mutex = @0x2fde530}
p = {_M_node = 0x56f3520}
#6 0x000000000049423a in compact_range_async (end=..., start=..., prefix=..., this=<optimized out>) at ./os/LevelDBStore.h:94
#7 MonitorDBStore::apply_transaction (this=0x2fcc030, t=...) at mon/MonitorDBStore.h:235
#8 0x00000000004e8a04 in Paxos::reapply_all_versions (this=0x2fe8500) at mon/Paxos.cc:75
first = 2005
last = 2492
__func__ = "reapply_all_versions"
tx = {ops = {<std::_List_base<MonitorDBStore::Op, std::allocator<MonitorDBStore::Op> >> = {
_M_impl = {<std::allocator<std::_List_node<MonitorDBStore::Op> >> = {<__gnu_cxx::new_allocator<std::_List_node<MonitorDBStore::Op> >> = {<No data
fields>}, <No data fields>}, _M_node = {_M_next = 0x3489e00, _M_prev = 0x5584300}}}, <No data fields>}}
#9 0x00000000004c0149 in Monitor::handle_sync_finish_reply (this=0x3072900, m=0x34b9080) at mon/Monitor.cc:1718
__func__ = "handle_sync_finish_reply"
other = {name = {_type = 1 '\001', _num = 3, static TYPE_MON = 1, static TYPE_MDS = 2, static TYPE_OSD = 4,
static TYPE_CLIENT = 8, static NEW = -1}, addr = {type = 0, nonce = 0, {addr = {ss_family = 2, __ss_align = 0,
__ss_padding = '\000' <repeats 111 times>}, addr4 = {sin_family = 2, sin_port = 34074, sin_addr = {
s_addr = 2122700298}, sin_zero = "\000\000\000\000\000\000\000"}, addr6 = {sin6_family = 2, sin6_port = 34074,
sin6_flowinfo = 2122700298, sin6_addr = {__in6_u = {__u6_addr8 = '\000' <repeats 15 times>, __u6_addr16 = {0, 0, 0, 0,
0, 0, 0, 0}, __u6_addr32 = {0, 0, 0, 0}}}, sin6_scope_id = 0}}}}
__PRETTY_FUNCTION__ = "void Monitor::handle_sync_finish_reply(MMonSync*)"
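and the following thread, which is blocked in Mutex::Lock on the same mutex (0x2fde530) that Thread 16's frame #5 references through its local `l`: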
Thread 2 (Thread 0x7f94c8c2e700 (LWP 18719)):
#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:132
No locals.
#1 0x00007f94cf8a4065 in _L_lock_858 () from /lib/x86_64-linux-gnu/libpthread.so.0
#2 0x00007f94cf8a3eba in __pthread_mutex_lock (mutex=0x2fde540) at pthread_mutex_lock.c:61
#3 0x0000000000625343 in Mutex::Lock (this=0x2fde530, no_lockdep=<optimized out>) at common/Mutex.cc:89
#4 0x000000000058474d in LevelDBStore::compact_thread_entry (this=0x2fde500) at os/LevelDBStore.cc:208
#5 0x00000000004929dd in LevelDBStore::CompactThread::entry (this=<optimized out>) at ./os/LevelDBStore.h:63
#6 0x00007f94cf8a1e9a in start_thread (arg=0x7f94c8c2e700) at pthread_create.c:308