Project

General

Profile

Backport #17666

Updated by Loïc Dachary over 7 years ago

https://github.com/ceph/ceph/pull/11606 Hi, 

 we've encountered a weird issue where some OSDs start scrubbing the same PG over and over while the other PGs on the OSD are simply never scrubbed, for example: 

 <pre> 
 2016-06-24 23:27:47.503326 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:27:50.476240 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:27:50.511307 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:27:51.475678 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:27:51.513781 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:27:52.476481 osd.73 [INF] 46.1a4 deep-scrub starts 
 2016-06-24 23:27:53.109623 osd.73 [INF] 46.1a4 deep-scrub ok 
 2016-06-24 23:27:54.476875 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:27:54.507103 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:27:59.476587 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:27:59.510383 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:28:06.476843 osd.73 [INF] 46.1a4 deep-scrub starts 
 2016-06-24 23:28:07.190836 osd.73 [INF] 46.1a4 deep-scrub ok 
 2016-06-24 23:28:07.477200 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:28:07.508380 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:28:08.477347 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:28:08.512599 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:28:17.478138 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:28:17.511248 osd.73 [INF] 46.1a4 scrub ok 
 2016-06-24 23:28:20.478949 osd.73 [INF] 46.1a4 scrub starts 
 2016-06-24 23:28:20.511489 osd.73 [INF] 46.1a4 scrub ok 
 </pre> 

 We can't see anything unusual with ceph pg query, the scrub timestamps are all current, the output of ceph pg query is attached at the end of this post 
 Other PGs on the same OSD also have a significantly older timestamp (beyond the max time between scrubs), yet they are completely ignored. 


 Restarting the affected OSDs fixes the problem. 

 Unfortunately, it is not reproducible and this only happened twice in three months. 
 It seems to be related to the creation of new pools/PGs as it appeared twice within 24 hours of the creation of a new pool. 
 The PGs being scrubbed all belong to the new pool. 

 We first encountered it with 9.2.2 a few month ago and today with 10.2.2. 


 pg info 
 <pre> 
 { 
     "state": "active+clean", 
     "snap_trimq": "[]", 
     "epoch": 151167, 
     "up": [ 
         73, 
         23, 
         138 
     ], 
     "acting": [ 
         73, 
         23, 
         138 
     ], 
     "actingbackfill": [ 
         "23", 
         "73", 
         "138" 
     ], 
     "info": { 
         "pgid": "46.1a4", 
         "last_update": "151143'51", 
         "last_complete": "151143'51", 
         "log_tail": "0'0", 
         "last_user_version": 51, 
         "last_backfill": "MAX", 
         "last_backfill_bitwise": 0, 
         "purged_snaps": "[1~3]", 
         "history": { 
             "epoch_created": 150853, 
             "last_epoch_started": 150955, 
             "last_epoch_clean": 150955, 
             "last_epoch_split": 0, 
             "last_epoch_marked_full": 0, 
             "same_up_since": 150854, 
             "same_interval_since": 150954, 
             "same_primary_since": 150828, 
             "last_scrub": "151143'51", 
             "last_scrub_stamp": "2016-06-24 23:39:02.606989", 
             "last_deep_scrub": "151143'51", 
             "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", 
             "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989" 
         }, 
         "stats": { 
             "version": "151143'51", 
             "reported_seq": "22519", 
             "reported_epoch": "151167", 
             "state": "active+clean", 
             "last_fresh": "2016-06-24 23:39:02.607169", 
             "last_change": "2016-06-24 23:39:02.607169", 
             "last_active": "2016-06-24 23:39:02.607169", 
             "last_peered": "2016-06-24 23:39:02.607169", 
             "last_clean": "2016-06-24 23:39:02.607169", 
             "last_became_active": "2016-06-23 06:26:51.096878", 
             "last_became_peered": "2016-06-23 06:26:51.096878", 
             "last_unstale": "2016-06-24 23:39:02.607169", 
             "last_undegraded": "2016-06-24 23:39:02.607169", 
             "last_fullsized": "2016-06-24 23:39:02.607169", 
             "mapping_epoch": 150853, 
             "log_start": "0'0", 
             "ondisk_log_start": "0'0", 
             "created": 150853, 
             "last_epoch_clean": 150955, 
             "parent": "0.0", 
             "parent_split_bits": 9, 
             "last_scrub": "151143'51", 
             "last_scrub_stamp": "2016-06-24 23:39:02.606989", 
             "last_deep_scrub": "151143'51", 
             "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", 
             "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989", 
             "log_size": 51, 
             "ondisk_log_size": 51, 
             "stats_invalid": false, 
             "dirty_stats_invalid": false, 
             "omap_stats_invalid": false, 
             "hitset_stats_invalid": false, 
             "hitset_bytes_stats_invalid": false, 
             "pin_stats_invalid": false, 
             "stat_sum": { 
                 "num_bytes": 167772160, 
                 "num_objects": 40, 
                 "num_object_clones": 0, 
                 "num_object_copies": 120, 
                 "num_objects_missing_on_primary": 0, 
                 "num_objects_missing": 0, 
                 "num_objects_degraded": 0, 
                 "num_objects_misplaced": 0, 
                 "num_objects_unfound": 0, 
                 "num_objects_dirty": 40, 
                 "num_whiteouts": 0, 
                 "num_read": 203, 
                 "num_read_kb": 11872, 
                 "num_write": 0, 
                 "num_write_kb": 0, 
                 "num_scrub_errors": 0, 
                 "num_shallow_scrub_errors": 0, 
                 "num_deep_scrub_errors": 0, 
                 "num_objects_recovered": 0, 
                 "num_bytes_recovered": 0, 
                 "num_keys_recovered": 0, 
                 "num_objects_omap": 0, 
                 "num_objects_hit_set_archive": 0, 
                 "num_bytes_hit_set_archive": 0, 
                 "num_flush": 0, 
                 "num_flush_kb": 0, 
                 "num_evict": 0, 
                 "num_evict_kb": 0, 
                 "num_promote": 0, 
                 "num_flush_mode_high": 0, 
                 "num_flush_mode_low": 0, 
                 "num_evict_mode_some": 0, 
                 "num_evict_mode_full": 0, 
                 "num_objects_pinned": 0 
             }, 
             "up": [ 
                 73, 
                 23, 
                 138 
             ], 
             "acting": [ 
                 73, 
                 23, 
                 138 
             ], 
             "blocked_by": [], 
             "up_primary": 73, 
             "acting_primary": 73 
         }, 
         "empty": 0, 
         "dne": 0, 
         "incomplete": 0, 
         "last_epoch_started": 150955, 
         "hit_set_history": { 
             "current_last_update": "0'0", 
             "history": [] 
         } 
     }, 
     "peer_info": [ 
         { 
             "peer": "23", 
             "pgid": "46.1a4", 
             "last_update": "151143'51", 
             "last_complete": "151143'51", 
             "log_tail": "0'0", 
             "last_user_version": 0, 
             "last_backfill": "MAX", 
             "last_backfill_bitwise": 0, 
             "purged_snaps": "[]", 
             "history": { 
                 "epoch_created": 150853, 
                 "last_epoch_started": 150955, 
                 "last_epoch_clean": 150955, 
                 "last_epoch_split": 0, 
                 "last_epoch_marked_full": 0, 
                 "same_up_since": 150854, 
                 "same_interval_since": 150954, 
                 "same_primary_since": 150828, 
                 "last_scrub": "151143'51", 
                 "last_scrub_stamp": "2016-06-24 23:39:02.606989", 
                 "last_deep_scrub": "151143'51", 
                 "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", 
                 "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989" 
             }, 
             "stats": { 
                 "version": "0'0", 
                 "reported_seq": "16", 
                 "reported_epoch": "150906", 
                 "state": "peering", 
                 "last_fresh": "2016-06-23 06:22:48.093937", 
                 "last_change": "2016-06-23 06:22:48.088003", 
                 "last_active": "2016-06-23 06:18:33.658154", 
                 "last_peered": "2016-06-23 06:18:33.658154", 
                 "last_clean": "2016-06-23 06:18:33.658154", 
                 "last_became_active": "2016-06-23 06:18:33.658063", 
                 "last_became_peered": "2016-06-23 06:18:33.658063", 
                 "last_unstale": "2016-06-23 06:22:48.093937", 
                 "last_undegraded": "2016-06-23 06:22:48.093937", 
                 "last_fullsized": "2016-06-23 06:22:48.093937", 
                 "mapping_epoch": 150853, 
                 "log_start": "0'0", 
                 "ondisk_log_start": "0'0", 
                 "created": 150853, 
                 "last_epoch_clean": 150855, 
                 "parent": "0.0", 
                 "parent_split_bits": 9, 
                 "last_scrub": "0'0", 
                 "last_scrub_stamp": "2016-06-23 00:28:36.323398", 
                 "last_deep_scrub": "0'0", 
                 "last_deep_scrub_stamp": "2016-06-23 00:28:36.323398", 
                 "last_clean_scrub_stamp": "0.000000", 
                 "log_size": 0, 
                 "ondisk_log_size": 0, 
                 "stats_invalid": true, 
                 "dirty_stats_invalid": false, 
                 "omap_stats_invalid": false, 
                 "hitset_stats_invalid": false, 
                 "hitset_bytes_stats_invalid": false, 
                 "pin_stats_invalid": false, 
                 "stat_sum": { 
                     "num_bytes": 0, 
                     "num_objects": 0, 
                     "num_object_clones": 0, 
                     "num_object_copies": 0, 
                     "num_objects_missing_on_primary": 0, 
                     "num_objects_missing": 0, 
                     "num_objects_degraded": 0, 
                     "num_objects_misplaced": 0, 
                     "num_objects_unfound": 0, 
                     "num_objects_dirty": 0, 
                     "num_whiteouts": 0, 
                     "num_read": 0, 
                     "num_read_kb": 0, 
                     "num_write": 0, 
                     "num_write_kb": 0, 
                     "num_scrub_errors": 0, 
                     "num_shallow_scrub_errors": 0, 
                     "num_deep_scrub_errors": 0, 
                     "num_objects_recovered": 0, 
                     "num_bytes_recovered": 0, 
                     "num_keys_recovered": 0, 
                     "num_objects_omap": 0, 
                     "num_objects_hit_set_archive": 0, 
                     "num_bytes_hit_set_archive": 0, 
                     "num_flush": 0, 
                     "num_flush_kb": 0, 
                     "num_evict": 0, 
                     "num_evict_kb": 0, 
                     "num_promote": 0, 
                     "num_flush_mode_high": 0, 
                     "num_flush_mode_low": 0, 
                     "num_evict_mode_some": 0, 
                     "num_evict_mode_full": 0, 
                     "num_objects_pinned": 0 
                 }, 
                 "up": [ 
                     73, 
                     23, 
                     138 
                 ], 
                 "acting": [ 
                     73, 
                     23, 
                     138 
                 ], 
                 "blocked_by": [], 
                 "up_primary": 73, 
                 "acting_primary": 73 
             }, 
             "empty": 0, 
             "dne": 0, 
             "incomplete": 0, 
             "last_epoch_started": 150955, 
             "hit_set_history": { 
                 "current_last_update": "0'0", 
                 "history": [] 
             } 
         }, 
         { 
             "peer": "138", 
             "pgid": "46.1a4", 
             "last_update": "151143'51", 
             "last_complete": "151143'51", 
             "log_tail": "0'0", 
             "last_user_version": 0, 
             "last_backfill": "MAX", 
             "last_backfill_bitwise": 0, 
             "purged_snaps": "[]", 
             "history": { 
                 "epoch_created": 150853, 
                 "last_epoch_started": 150955, 
                 "last_epoch_clean": 150955, 
                 "last_epoch_split": 0, 
                 "last_epoch_marked_full": 0, 
                 "same_up_since": 150854, 
                 "same_interval_since": 150954, 
                 "same_primary_since": 150828, 
                 "last_scrub": "151143'51", 
                 "last_scrub_stamp": "2016-06-24 23:39:02.606989", 
                 "last_deep_scrub": "151143'51", 
                 "last_deep_scrub_stamp": "2016-06-24 23:38:45.246075", 
                 "last_clean_scrub_stamp": "2016-06-24 23:39:02.606989" 
             }, 
             "stats": { 
                 "version": "0'0", 
                 "reported_seq": "16", 
                 "reported_epoch": "150906", 
                 "state": "peering", 
                 "last_fresh": "2016-06-23 06:22:48.093937", 
                 "last_change": "2016-06-23 06:22:48.088003", 
                 "last_active": "2016-06-23 06:18:33.658154", 
                 "last_peered": "2016-06-23 06:18:33.658154", 
                 "last_clean": "2016-06-23 06:18:33.658154", 
                 "last_became_active": "2016-06-23 06:18:33.658063", 
                 "last_became_peered": "2016-06-23 06:18:33.658063", 
                 "last_unstale": "2016-06-23 06:22:48.093937", 
                 "last_undegraded": "2016-06-23 06:22:48.093937", 
                 "last_fullsized": "2016-06-23 06:22:48.093937", 
                 "mapping_epoch": 150853, 
                 "log_start": "0'0", 
                 "ondisk_log_start": "0'0", 
                 "created": 150853, 
                 "last_epoch_clean": 150855, 
                 "parent": "0.0", 
                 "parent_split_bits": 9, 
                 "last_scrub": "0'0", 
                 "last_scrub_stamp": "2016-06-23 00:28:36.323398", 
                 "last_deep_scrub": "0'0", 
                 "last_deep_scrub_stamp": "2016-06-23 00:28:36.323398", 
                 "last_clean_scrub_stamp": "0.000000", 
                 "log_size": 0, 
                 "ondisk_log_size": 0, 
                 "stats_invalid": true, 
                 "dirty_stats_invalid": false, 
                 "omap_stats_invalid": false, 
                 "hitset_stats_invalid": false, 
                 "hitset_bytes_stats_invalid": false, 
                 "pin_stats_invalid": false, 
                 "stat_sum": { 
                     "num_bytes": 0, 
                     "num_objects": 0, 
                     "num_object_clones": 0, 
                     "num_object_copies": 0, 
                     "num_objects_missing_on_primary": 0, 
                     "num_objects_missing": 0, 
                     "num_objects_degraded": 0, 
                     "num_objects_misplaced": 0, 
                     "num_objects_unfound": 0, 
                     "num_objects_dirty": 0, 
                     "num_whiteouts": 0, 
                     "num_read": 0, 
                     "num_read_kb": 0, 
                     "num_write": 0, 
                     "num_write_kb": 0, 
                     "num_scrub_errors": 0, 
                     "num_shallow_scrub_errors": 0, 
                     "num_deep_scrub_errors": 0, 
                     "num_objects_recovered": 0, 
                     "num_bytes_recovered": 0, 
                     "num_keys_recovered": 0, 
                     "num_objects_omap": 0, 
                     "num_objects_hit_set_archive": 0, 
                     "num_bytes_hit_set_archive": 0, 
                     "num_flush": 0, 
                     "num_flush_kb": 0, 
                     "num_evict": 0, 
                     "num_evict_kb": 0, 
                     "num_promote": 0, 
                     "num_flush_mode_high": 0, 
                     "num_flush_mode_low": 0, 
                     "num_evict_mode_some": 0, 
                     "num_evict_mode_full": 0, 
                     "num_objects_pinned": 0 
                 }, 
                 "up": [ 
                     73, 
                     23, 
                     138 
                 ], 
                 "acting": [ 
                     73, 
                     23, 
                     138 
                 ], 
                 "blocked_by": [], 
                 "up_primary": 73, 
                 "acting_primary": 73 
             }, 
             "empty": 0, 
             "dne": 0, 
             "incomplete": 0, 
             "last_epoch_started": 150955, 
             "hit_set_history": { 
                 "current_last_update": "0'0", 
                 "history": [] 
             } 
         } 
     ], 
     "recovery_state": [ 
         { 
             "name": "Started\/Primary\/Active", 
             "enter_time": "2016-06-23 06:26:50.813267", 
             "might_have_unfound": [], 
             "recovery_progress": { 
                 "backfill_targets": [], 
                 "waiting_on_backfill": [], 
                 "last_backfill_started": "MIN", 
                 "backfill_info": { 
                     "begin": "MIN", 
                     "end": "MIN", 
                     "objects": [] 
                 }, 
                 "peer_backfill_info": [], 
                 "backfills_in_flight": [], 
                 "recovering": [], 
                 "pg_backend": { 
                     "pull_from_peer": [], 
                     "pushing": [] 
                 } 
             }, 
             "scrub": { 
                 "scrubber.epoch_start": "150954", 
                 "scrubber.active": 0, 
                 "scrubber.state": "INACTIVE", 
                 "scrubber.start": "MIN", 
                 "scrubber.end": "MIN", 
                 "scrubber.subset_last_update": "0'0", 
                 "scrubber.deep": false, 
                 "scrubber.seed": 0, 
                 "scrubber.waiting_on": 0, 
                 "scrubber.waiting_on_whom": [] 
             } 
         }, 
         { 
             "name": "Started", 
             "enter_time": "2016-06-23 06:26:49.718199" 
         } 
     ], 
     "agent_state": {} 
 } 
 </pre> 


Back