Bug #45761
Updated by Brad Hubbard almost 4 years ago
/a/yuriw-2020-05-28_02:23:45-rados-wip-yuri-master_5.27.20-distro-basic-smithi/5097794 <pre> 2020-05-28T06:13:04.288 INFO:tasks.mon_thrash.mon_thrasher:thrashing mon.g 2020-05-28T06:13:04.289 INFO:tasks.mon_thrash.mon_thrasher:killing mon.g 2020-05-28T06:13:04.289 DEBUG:tasks.ceph.mon.g:waiting for process to exit 2020-05-28T06:13:04.289 INFO:teuthology.orchestra.run:waiting for 300 2020-05-28T06:13:04.349 INFO:tasks.ceph.mon.g:Stopped 2020-05-28T06:13:04.349 INFO:tasks.mon_thrash.mon_thrasher:thrashing mon.f 2020-05-28T06:13:04.350 INFO:tasks.mon_thrash.mon_thrasher:thrashing mon.f store 2020-05-28T06:13:04.350 INFO:teuthology.orchestra.run.smithi078:> sudo adjust-ulimits ceph-coverage /home/ubuntu/cephtest/archive/coverage timeout 120 ceph --cluster ceph --log-early tell mon.f sync_force --yes-i-really-mean-it 2020-05-28T06:13:20.481 INFO:teuthology.orchestra.run.smithi078.stderr:Error ENXIO: mon unavailable 2020-05-28T06:13:20.485 DEBUG:teuthology.orchestra.run:got remote process result: 6 2020-05-28T06:13:20.486 ERROR:tasks.mon_thrash.mon_thrasher:exception: Traceback (most recent call last): File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.27.20/qa/tasks/mon_thrash.py", line 232, in do_thrash self._do_thrash() File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.27.20/qa/tasks/mon_thrash.py", line 285, in _do_thrash self.thrash_store(mon) File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.27.20/qa/tasks/mon_thrash.py", line 175, in thrash_store '--yes-i-really-mean-it') File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.27.20/qa/tasks/ceph_manager.py", line 1363, in raw_cluster_cmd stdout=BytesIO(), File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/teuthology/orchestra/remote.py", line 206, in run r = self._runner(client=self.ssh, name=self.shortname, **kwargs) File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/teuthology/orchestra/run.py", line 473, in run r.wait() File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/teuthology/orchestra/run.py", line 162, in wait self._raise_for_status() File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/teuthology/orchestra/run.py", line 184, in _raise_for_status node=self.hostname, label=self.label teuthology.exceptions.CommandFailedError: Command failed on smithi078 with status 6: 'sudo adjust-ulimits ceph-coverage /home/ubuntu/cephtest/archive/coverage timeout 120 ceph --cluster ceph --log-early tell mon.f sync_force --yes-i-really-mean-it' 2020-05-28T06:13:24.555 INFO:tasks.workunit.client.0.smithi139.stderr:noup is unset 2020-05-28T06:13:24.568 INFO:tasks.workunit.client.0.smithi139.stderr:+ (( ++i )) 2020-05-28T06:13:24.568 INFO:tasks.workunit.client.0.smithi139.stderr:+ (( i < num )) 2020-05-28T06:13:24.568 INFO:tasks.workunit.client.0.smithi139.stderr:+ ceph osd set noup 2020-05-28T06:13:25.825 INFO:tasks.daemonwatchdog.daemon_watchdog:MonitorThrasher failed 2020-05-28T06:13:25.825 INFO:tasks.daemonwatchdog.daemon_watchdog:BARK! unmounting mounts and killing all daemons 2020-05-28T06:13:25.825 ERROR:tasks.daemonwatchdog.daemon_watchdog:exception: Traceback (most recent call last): File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.27.20/qa/tasks/daemonwatchdog.py", line 37, in _run self.watch() File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.27.20/qa/tasks/daemonwatchdog.py", line 113, in watch self.bark() File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.27.20/qa/tasks/daemonwatchdog.py", line 53, in bark for mount in self.ctx.mounts.values(): File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/teuthology/config.py", line 254, in __getattr__ raise AttributeError(name) AttributeError: mounts </pre> Several of the MONs are never restarted so we end with. <pre> 2020-05-28T06:47:14.331 INFO:teuthology.run:Summary data: description: rados/monthrash/{ceph clusters/9-mons msgr-failures/mon-delay msgr/async-v1only objectstore/bluestore-low-osd-mem-target rados supported-random-distro$/{ubuntu_latest} thrashers/force-sync-many workloads/rados_mon_osdmap_prune} duration: 2537.953857898712 failure_reason: failed to reach quorum size 9 before timeout expired </pre> /a/yuriw-2020-05-22_19:55:53-rados-wip-yuri-master_5.22.20-distro-basic-smithi/5083290 This looks like the same issue but with a different error. <pre> 2020-05-22T23:38:28.429 INFO:tasks.mon_thrash.mon_thrasher:monitors to thrash: ['d', 'a', 'h'] 2020-05-22T23:38:28.429 INFO:tasks.mon_thrash.mon_thrasher:monitors to freeze: [] 2020-05-22T23:38:28.430 INFO:tasks.mon_thrash.mon_thrasher:thrashing mon.d 2020-05-22T23:38:28.430 INFO:tasks.mon_thrash.mon_thrasher:killing mon.d 2020-05-22T23:38:28.430 DEBUG:tasks.ceph.mon.d:waiting for process to exit 2020-05-22T23:38:28.430 INFO:teuthology.orchestra.run:waiting for 300 2020-05-22T23:38:28.440 INFO:tasks.ceph.mon.d:Stopped 2020-05-22T23:38:28.441 INFO:tasks.mon_thrash.mon_thrasher:thrashing mon.a 2020-05-22T23:38:28.441 INFO:tasks.mon_thrash.mon_thrasher:thrashing mon.a store 2020-05-22T23:38:28.441 INFO:teuthology.orchestra.run.smithi204:> sudo adjust-ulimits ceph-coverage /home/ubuntu/cephtest/archive/coverage timeout 120 ceph --cluster ceph --log-early tell mon.a sync_force --yes-i-really-mean-it 2020-05-22T23:38:45.598 INFO:teuthology.orchestra.run.smithi204.stderr:Error ENXIO: problem getting command descriptions from mon.a 2020-05-22T23:38:45.610 DEBUG:teuthology.orchestra.run:got remote process result: 6 2020-05-22T23:38:45.610 ERROR:tasks.mon_thrash.mon_thrasher:exception: Traceback (most recent call last): File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.22.20/qa/tasks/mon_thrash.py", line 232, in do_thrash self._do_thrash() File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.22.20/qa/tasks/mon_thrash.py", line 285, in _do_thrash self.thrash_store(mon) File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.22.20/qa/tasks/mon_thrash.py", line 175, in thrash_store '--yes-i-really-mean-it') File "/home/teuthworker/src/github.com_ceph_ceph-c_wip-yuri-master_5.22.20/qa/tasks/ceph_manager.py", line 1363, in raw_cluster_cmd stdout=BytesIO(), File "/home/teuthworker/src/git.ceph.com_git_teuthology_py2/teuthology/orchestra/remote.py", line 206, in run r = self._runner(client=self.ssh, name=self.shortname, **kwargs) File "/home/teuthworker/src/git.ceph.com_git_teuthology_py2/teuthology/orchestra/run.py", line 473, in run r.wait() File "/home/teuthworker/src/git.ceph.com_git_teuthology_py2/teuthology/orchestra/run.py", line 162, in wait self._raise_for_status() File "/home/teuthworker/src/git.ceph.com_git_teuthology_py2/teuthology/orchestra/run.py", line 184, in _raise_for_status node=self.hostname, label=self.label CommandFailedError: Command failed on smithi204 with status 6: 'sudo adjust-ulimits ceph-coverage /home/ubuntu/cephtest/archive/coverage timeout 120 ceph --cluster ceph --log-early tell mon.a sync_force --yes-i-really-mean-it' </pre>