Actions
Bug #23294
[open] OSD booted with noup never got marked in; pgs stuck peering while osd up, but out
Status:
New
Priority:
Normal
Assignee:
-
Category:
Peering
Target version:
-
% Done:
0%
Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
http://pulpito.ceph.com/joshd-2018-03-09_22:47:53-rados-master-distro-basic-smithi/2273020/
This test restarts osd.0 while noup is set, then unsets noup, but osd.0 only goes up, not in, and pgs are stuck peering:
sudo ceph pg dump | grep peering dumped all 2.10 438 0 0 0 0 1794048 138 138 peering 2018-03-09 22:57:10.215720 21'438 101:541 [2,1] 2 [2,1] 2 0'0 2018-03-09 22:56:22.094590 0'0 2018-03-09 22:56:22.094590 0 2.0 489 0 0 0 0 2002944 189 189 peering 2018-03-09 22:57:10.198859 21'489 101:592 [3,1] 3 [3,1] 3 0'0 2018-03-09 22:56:22.094590 0'0 2018-03-09 22:56:22.094590 0 2.2 493 0 0 0 0 2019328 193 193 peering 2018-03-09 22:57:10.204127 21'493 101:596 [3,1] 3 [3,1] 3 0'0 2018-03-09 22:56:22.094590 0'0 2018-03-09 22:56:22.094590 0 2.28 496 0 0 0 0 2031616 196 196 peering 2018-03-09 22:57:10.209308 21'496 101:599 [3,1] 3 [3,1] 3 0'0 2018-03-09 22:56:22.094590 0'0 2018-03-09 22:56:22.094590 sudo ceph pg 2.0 query { "state": "peering", "snap_trimq": "[]", "snap_trimq_len": 0, "epoch": 102, "up": [ 3, 1 ], "acting": [ 3, 1 ], "actingbackfill": [ "1", "3" ], "info": { "pgid": "2.0", "last_update": "21'489", "last_complete": "21'489", "log_tail": "15'300", "last_user_version": 489, "last_backfill": "MAX", "last_backfill_bitwise": 0, "purged_snaps": [], "history": { "epoch_created": 13, "epoch_pool_created": 13, "last_epoch_started": 21, "last_interval_started": 20, "last_epoch_clean": 14, "last_interval_clean": 13, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 23, "same_interval_since": 23, "same_primary_since": 13, "last_scrub": "0'0", "last_scrub_stamp": "2018-03-09 22:56:22.094590", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590", "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590" }, "stats": { "version": "21'489", "reported_seq": "593", "reported_epoch": "102", "state": "peering", "last_fresh": "2018-03-09 22:58:28.012227", "last_change": "2018-03-09 22:57:10.198859", "last_active": "2018-03-09 22:57:10.198383", "last_peered": "2018-03-09 22:57:05.275937", "last_clean": "2018-03-09 22:56:59.531392", "last_became_active": "2018-03-09 22:57:01.750668", "last_became_peered": "2018-03-09 22:57:01.750668", 
"last_unstale": "2018-03-09 22:58:28.012227", "last_undegraded": "2018-03-09 22:58:28.012227", "last_fullsized": "2018-03-09 22:58:28.012227", "mapping_epoch": 23, "log_start": "15'300", "ondisk_log_start": "15'300", "created": 13, "last_epoch_clean": 14, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "2018-03-09 22:56:22.094590", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590", "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590", "log_size": 189, "ondisk_log_size": 189, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "snaptrimq_len": 0, "stat_sum": { "num_bytes": 2002944, "num_objects": 489, "num_object_clones": 0, "num_object_copies": 978, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 489, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 978, "num_write_kb": 1956, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0, "num_large_omap_objects": 0 }, "up": [ 3, 1 ], "acting": [ 3, 1 ], "blocked_by": [ 1 ], "up_primary": 3, "acting_primary": 3, "purged_snaps": [] }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 21, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, "peer_info": [ { "peer": "1", "pgid": "2.0", "last_update": "15'464", "last_complete": "15'464", "log_tail": 
"15'300", "last_user_version": 464, "last_backfill": "MAX", "last_backfill_bitwise": 0, "purged_snaps": [], "history": { "epoch_created": 13, "epoch_pool_created": 13, "last_epoch_started": 14, "last_interval_started": 13, "last_epoch_clean": 14, "last_interval_clean": 13, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 23, "same_interval_since": 23, "same_primary_since": 13, "last_scrub": "0'0", "last_scrub_stamp": "2018-03-09 22:56:22.094590", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590", "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590" }, "stats": { "version": "15'463", "reported_seq": "473", "reported_epoch": "15", "state": "active+clean", "last_fresh": "2018-03-09 22:56:53.151346", "last_change": "2018-03-09 22:56:23.195100", "last_active": "2018-03-09 22:56:53.151346", "last_peered": "2018-03-09 22:56:53.151346", "last_clean": "2018-03-09 22:56:53.151346", "last_became_active": "2018-03-09 22:56:23.194656", "last_became_peered": "2018-03-09 22:56:23.194656", "last_unstale": "2018-03-09 22:56:53.151346", "last_undegraded": "2018-03-09 22:56:53.151346", "last_fullsized": "2018-03-09 22:56:53.151346", "mapping_epoch": 23, "log_start": "15'300", "ondisk_log_start": "15'300", "created": 13, "last_epoch_clean": 14, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "2018-03-09 22:56:22.094590", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590", "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590", "log_size": 163, "ondisk_log_size": 163, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "snaptrimq_len": 0, "stat_sum": { "num_bytes": 1900544, "num_objects": 464, "num_object_clones": 0, "num_object_copies": 926, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, 
"num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 464, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 928, "num_write_kb": 1856, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0, "num_large_omap_objects": 0 }, "up": [ 3, 1 ], "acting": [ 3, 1 ], "blocked_by": [], "up_primary": 3, "acting_primary": 3, "purged_snaps": [] }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 14, "hit_set_history": { "current_last_update": "0'0", "history": [] } } ], "recovery_state": [ { "name": "Started/Primary/Peering/GetMissing", "enter_time": "2018-03-09 22:57:10.256069", "peer_missing_requested": [ { "osd": "1", "got_missing": { "missing": [], "may_include_deletes": true } } ] }, { "name": "Started/Primary/Peering", "enter_time": "2018-03-09 22:57:10.198781", "past_intervals": [ { "first": "13", "last": "22", "all_participants": [ { "osd": 1 }, { "osd": 3 } ], "intervals": [ { "first": "20", "last": "22", "acting": "3" } ] } ], "probing_osds": [ "1", "3" ], "down_osds_we_would_probe": [], "peering_blocked_by": [], "peering_blocked_by_detail": [ { "detail": "peering_blocked_by_history_les_bound" } ] }, { "name": "Started", "enter_time": "2018-03-09 22:57:10.198708" } ], "agent_state": {} } sudo ceph osd tree ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF -1 0.35156 root default -3 0.35156 host smithi011 0 ssd 0.08789 osd.0 up 0 1.00000 1 ssd 0.08789 osd.1 up 1.00000 1.00000 2 ssd 0.08789 osd.2 up 1.00000 1.00000 3 ssd 0.08789 osd.3 up 1.00000 1.00000
This seems to be a race, but when it occurs the job is marked dead, so reproducing it via teuthology requires actively logging in to the test node while the test is still running. In this case I manually killed a command to make it gather logs.
Updated by Greg Farnum about 6 years ago
- Subject changed from "pgs stuck peering while osd up, but out" to "OSD booted with noup never got marked in; pgs stuck peering while osd up, but out"
- Priority changed from Urgent to Normal
Actions