Backport #17666
Updated by Loïc Dachary over 7 years ago
https://github.com/ceph/ceph/pull/11606 Hi, we've encountered a weird issue where some OSDs start scrubbing the same PG over and over while the other PGs on the OSD are simply never scrubbed, for example: <pre> 2016-06-24 23:27:47.503326 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:27:50.476240 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:27:50.511307 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:27:51.475678 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:27:51.513781 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:27:52.476481 osd.73 [INF] 46.1a4 deep-scrub starts 2016-06-24 23:27:53.109623 osd.73 [INF] 46.1a4 deep-scrub ok 2016-06-24 23:27:54.476875 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:27:54.507103 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:27:59.476587 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:27:59.510383 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:28:06.476843 osd.73 [INF] 46.1a4 deep-scrub starts 2016-06-24 23:28:07.190836 osd.73 [INF] 46.1a4 deep-scrub ok 2016-06-24 23:28:07.477200 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:28:07.508380 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:28:08.477347 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:28:08.512599 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:28:17.478138 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:28:17.511248 osd.73 [INF] 46.1a4 scrub ok 2016-06-24 23:28:20.478949 osd.73 [INF] 46.1a4 scrub starts 2016-06-24 23:28:20.511489 osd.73 [INF] 46.1a4 scrub ok </pre> We can't see anything unusual with ceph pg query, the scrub timestamps are all current, the output of ceph pg query is attached at the end of this post Other PGs on the same OSD also have a significantly older timestamp (beyond the max time between scrubs), yet they are completely ignored. Restarting the affected OSDs fixes the problem. Unfortunately, it is not reproducible and this only happened twice in three months. It seems to be related to the creation of new pools/PGs as it appeared twice within 24 hours of the creation of a new pool. The PGs being scrubbed all belong to the new pool. We first encountered it with 9.2.2 a few month ago and today with 10.2.2. pg info <pre> { "state": "active+clean", "snap_trimq": "[]", "epoch": 151167, "up": [ 73, 23, 138 ], "acting": [ 73, 23, 138 ], "actingbackfill": [ "23", "73", "138" ], "info": { "pgid": "46.1a4", "last_update": "151143'51", "last_complete": "151143'51", "log_tail": "0'0", "last_user_version": 51, "last_backfill": "MAX", "last_backfill_bitwise": 0, "purged_snaps": "[1~3]", "history": { "epoch_created": 150853, "last_epoch_started": 150955, "last_epoch_clean": 150955, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 150854, "same_interval_since": 150954, "same_primary_since": 150828, "last_scrub": "151143'51", "last_scrub_stamp": "2016-06-24 23:39:02.606989", "last_deep_scrub": "151143'51", "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989" }, "stats": { "version": "151143'51", "reported_seq": "22519", "reported_epoch": "151167", "state": "active+clean", "last_fresh": "2016-06-24 23:39:02.607169", "last_change": "2016-06-24 23:39:02.607169", "last_active": "2016-06-24 23:39:02.607169", "last_peered": "2016-06-24 23:39:02.607169", "last_clean": "2016-06-24 23:39:02.607169", "last_became_active": "2016-06-23 06:26:51.096878", "last_became_peered": "2016-06-23 06:26:51.096878", "last_unstale": "2016-06-24 23:39:02.607169", "last_undegraded": "2016-06-24 23:39:02.607169", "last_fullsized": "2016-06-24 23:39:02.607169", "mapping_epoch": 150853, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 150853, "last_epoch_clean": 150955, "parent": "0.0", "parent_split_bits": 9, "last_scrub": "151143'51", "last_scrub_stamp": "2016-06-24 23:39:02.606989", "last_deep_scrub": "151143'51", "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989", "log_size": 51, "ondisk_log_size": 51, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "stat_sum": { "num_bytes": 167772160, "num_objects": 40, "num_object_clones": 0, "num_object_copies": 120, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 40, "num_whiteouts": 0, "num_read": 203, "num_read_kb": 11872, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0 }, "up": [ 73, 23, 138 ], "acting": [ 73, 23, 138 ], "blocked_by": [], "up_primary": 73, "acting_primary": 73 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 150955, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, "peer_info": [ { "peer": "23", "pgid": "46.1a4", "last_update": "151143'51", "last_complete": "151143'51", "log_tail": "0'0", "last_user_version": 0, "last_backfill": "MAX", "last_backfill_bitwise": 0, "purged_snaps": "[]", "history": { "epoch_created": 150853, "last_epoch_started": 150955, "last_epoch_clean": 150955, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 150854, "same_interval_since": 150954, "same_primary_since": 150828, "last_scrub": "151143'51", "last_scrub_stamp": "2016-06-24 23:39:02.606989", "last_deep_scrub": "151143'51", "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989" }, "stats": { "version": "0'0", "reported_seq": "16", "reported_epoch": "150906", "state": "peering", "last_fresh": "2016-06-23 06:22:48.093937", "last_change": "2016-06-23 06:22:48.088003", "last_active": "2016-06-23 06:18:33.658154", "last_peered": "2016-06-23 06:18:33.658154", "last_clean": "2016-06-23 06:18:33.658154", "last_became_active": "2016-06-23 06:18:33.658063", "last_became_peered": "2016-06-23 06:18:33.658063", "last_unstale": "2016-06-23 06:22:48.093937", "last_undegraded": "2016-06-23 06:22:48.093937", "last_fullsized": "2016-06-23 06:22:48.093937", "mapping_epoch": 150853, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 150853, "last_epoch_clean": 150855, "parent": "0.0", "parent_split_bits": 9, "last_scrub": "0'0", "last_scrub_stamp": "2016-06-23 00:28:36.323398", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2016-06-23 00:28:36.323398", "last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": true, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0 }, "up": [ 73, 23, 138 ], "acting": [ 73, 23, 138 ], "blocked_by": [], "up_primary": 73, "acting_primary": 73 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 150955, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, { "peer": "138", "pgid": "46.1a4", "last_update": "151143'51", "last_complete": "151143'51", "log_tail": "0'0", "last_user_version": 0, "last_backfill": "MAX", "last_backfill_bitwise": 0, "purged_snaps": "[]", "history": { "epoch_created": 150853, "last_epoch_started": 150955, "last_epoch_clean": 150955, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 150854, "same_interval_since": 150954, "same_primary_since": 150828, "last_scrub": "151143'51", "last_scrub_stamp": "2016-06-24 23:39:02.606989", "last_deep_scrub": "151143'51", "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989" }, "stats": { "version": "0'0", "reported_seq": "16", "reported_epoch": "150906", "state": "peering", "last_fresh": "2016-06-23 06:22:48.093937", "last_change": "2016-06-23 06:22:48.088003", "last_active": "2016-06-23 06:18:33.658154", "last_peered": "2016-06-23 06:18:33.658154", "last_clean": "2016-06-23 06:18:33.658154", "last_became_active": "2016-06-23 06:18:33.658063", "last_became_peered": "2016-06-23 06:18:33.658063", "last_unstale": "2016-06-23 06:22:48.093937", "last_undegraded": "2016-06-23 06:22:48.093937", "last_fullsized": "2016-06-23 06:22:48.093937", "mapping_epoch": 150853, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 150853, "last_epoch_clean": 150855, "parent": "0.0", "parent_split_bits": 9, "last_scrub": "0'0", "last_scrub_stamp": "2016-06-23 00:28:36.323398", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2016-06-23 00:28:36.323398", "last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": true, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0 }, "up": [ 73, 23, 138 ], "acting": [ 73, 23, 138 ], "blocked_by": [], "up_primary": 73, "acting_primary": 73 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 150955, "hit_set_history": { "current_last_update": "0'0", "history": [] } } ], "recovery_state": [ { "name": "Started\/Primary\/Active", "enter_time": "2016-06-23 06:26:50.813267", "might_have_unfound": [], "recovery_progress": { "backfill_targets": [], "waiting_on_backfill": [], "last_backfill_started": "MIN", "backfill_info": { "begin": "MIN", "end": "MIN", "objects": [] }, "peer_backfill_info": [], "backfills_in_flight": [], "recovering": [], "pg_backend": { "pull_from_peer": [], "pushing": [] } }, "scrub": { "scrubber.epoch_start": "150954", "scrubber.active": 0, "scrubber.state": "INACTIVE", "scrubber.start": "MIN", "scrubber.end": "MIN", "scrubber.subset_last_update": "0'0", "scrubber.deep": false, "scrubber.seed": 0, "scrubber.waiting_on": 0, "scrubber.waiting_on_whom": [] } }, { "name": "Started", "enter_time": "2016-06-23 06:26:49.718199" } ], "agent_state": {} } </pre>