Project

General

Profile

Bug #40073

PG scrub stamps reset to 0.000000

Added by Greg Farnum almost 2 years ago. Updated over 1 year ago.

Status:
Resolved
Priority:
Normal
Assignee:
Category:
Scrub/Repair
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
nautilus
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

From Ceph-users, https://www.spinics.net/lists/ceph-users/msg52869.html

After upgrading from 14.2.0 to 14.2.1, I've noticed PGs are frequently resetting their scrub and deep scrub time stamps to 0.000000. It's extra strange because the peers show timestamps for deep scrubs.

  1. First entry from a pg list at 7pm
    $ grep 11.2f2 ~/pgs-active.7pm
    11.2f2 691 0 0 0 2897477632 0 0 2091 active+clean 3h 7378'12291 8048:36261 [1,6,37]p1 [1,6,37]p1 2019-05-14 21:01:29.172460 2019-05-14 21:01:29.172460
  1. Next Entry 3 minutes later
    $ ceph pg ls active |grep 11.2f2
    11.2f2 695 0 0 0 2914713600 0 0 2091 active+clean 6s 7378'12291 8049:36330 [1,6,37]p1 [1,6,37]p1 0.000000 0.000000

Not sure if it's related, but this only happens to PG's who's primary OSD is one where osd_numa_node has been set.


Related issues

Duplicated by Ceph - Bug #40066: PGs not scrubbed since 0.000000 Duplicate 05/29/2019
Copied to RADOS - Backport #40667: nautilus: PG scrub stamps reset to 0.000000 Resolved

History

#1 Updated by Greg Farnum almost 2 years ago

A full pg query:

## PG Query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 8049,
    "up": [
        1,
        6,
        37
    ],
    "acting": [
        1,
        6,
        37
    ],
    "acting_recovery_backfill": [
        "1",
        "6",
        "37" 
    ],
    "info": {
        "pgid": "11.2f2",
        "last_update": "7378'12291",
        "last_complete": "7378'12291",
        "log_tail": "1087'10200",
        "last_user_version": 12291,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 1,
        "purged_snaps": [],
        "history": {
            "epoch_created": 1549,
            "epoch_pool_created": 216,
            "last_epoch_started": 6148,
            "last_interval_started": 6147,
            "last_epoch_clean": 6148,
            "last_interval_clean": 6147,
            "last_epoch_split": 6147,
            "last_epoch_marked_full": 0,
            "same_up_since": 6126,
            "same_interval_since": 6147,
            "same_primary_since": 6126,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
        },
        "stats": {
            "version": "7378'12291",
            "reported_seq": "36700",
            "reported_epoch": "8049",
            "state": "active+clean",
            "last_fresh": "2019-05-15 23:08:17.014609",
            "last_change": "2019-05-15 23:08:17.014609",
            "last_active": "2019-05-15 23:08:17.014609",
            "last_peered": "2019-05-15 23:08:17.014609",
            "last_clean": "2019-05-15 23:08:17.014609",
            "last_became_active": "2019-05-15 19:25:01.484322",
            "last_became_peered": "2019-05-15 19:25:01.484322",
            "last_unstale": "2019-05-15 23:08:17.014609",
            "last_undegraded": "2019-05-15 23:08:17.014609",
            "last_fullsized": "2019-05-15 23:08:17.014609",
            "mapping_epoch": 6126,
            "log_start": "1087'10200",
            "ondisk_log_start": "1087'10200",
            "created": 1549,
            "last_epoch_clean": 6148,
            "parent": "0.0",
            "parent_split_bits": 10,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575",
            "log_size": 2091,
            "ondisk_log_size": 2091,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "manifest_stats_invalid": true,
            "snaptrimq_len": 0,
            "stat_sum": {
                "num_bytes": 2914713600,
                "num_objects": 695,
                "num_object_clones": 0,
                "num_object_copies": 2085,
                "num_objects_missing_on_primary": 0,
                "num_objects_missing": 0,
                "num_objects_degraded": 0,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 0,
                "num_objects_dirty": 695,
                "num_whiteouts": 0,
                "num_read": 0,
                "num_read_kb": 0,
                "num_write": 0,
                "num_write_kb": 0,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 0,
                "num_bytes_recovered": 0,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0,
                "num_large_omap_objects": 0,
                "num_objects_manifest": 0,
                "num_omap_bytes": 0,
                "num_omap_keys": 0,
                "num_objects_repaired": 0
            },
            "up": [
                1,
                6,
                37
            ],
            "acting": [
                1,
                6,
                37
            ],
            "blocked_by": [],
            "up_primary": 1,
            "acting_primary": 1,
            "purged_snaps": []
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 6148,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "6",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12264",
                "reported_seq": "34823",
                "reported_epoch": "6125",
                "state": "active+undersized+degraded",
                "last_fresh": "2019-05-15 19:16:05.309556",
                "last_change": "2019-05-15 19:15:59.342872",
                "last_active": "2019-05-15 19:16:05.309556",
                "last_peered": "2019-05-15 19:16:05.309556",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-15 19:15:59.342872",
                "last_became_peered": "2019-05-15 19:15:59.342872",
                "last_unstale": "2019-05-15 19:16:05.309556",
                "last_undegraded": "2019-05-15 19:15:59.341684",
                "last_fullsized": "2019-05-15 19:15:59.340270",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6115,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2064,
                "ondisk_log_size": 2064,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1992,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 664,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 129,
                    "num_bytes_recovered": 540901376,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        },
        {
            "peer": "37",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12263",
                "reported_seq": "34275",
                "reported_epoch": "6103",
                "state": "active+clean",
                "last_fresh": "2019-05-15 17:23:41.956141",
                "last_change": "2019-05-14 21:01:29.172506",
                "last_active": "2019-05-15 17:23:41.956141",
                "last_peered": "2019-05-15 17:23:41.956141",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-14 20:16:44.641437",
                "last_became_peered": "2019-05-14 20:16:44.641437",
                "last_unstale": "2019-05-15 17:23:41.956141",
                "last_undegraded": "2019-05-15 17:23:41.956141",
                "last_fullsized": "2019-05-15 17:23:41.956141",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6010,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2063,
                "ondisk_log_size": 2063,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1993,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 0,
                    "num_bytes_recovered": 0,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2019-05-15 19:25:01.479877",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "6147",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2019-05-15 19:25:00.557042" 
        }
    ],
    "agent_state": {}
}

#2 Updated by Nathan Cutler almost 2 years ago

The similarity to #40066 is so striking I just had to mention it and create a "Relates to" link.

#3 Updated by Nathan Cutler almost 2 years ago

  • Related to Bug #40066: PGs not scrubbed since 0.000000 added

#4 Updated by David Zafman almost 2 years ago

  • Assignee set to David Zafman

When auto repair is enabled a bug causes a regular scrub to reset time stamps which is only intended to happen when there were errors. This internally forces a deep-scrub in order to eventually auto repair.

#5 Updated by David Zafman almost 2 years ago

  • Duplicated by Bug #40066: PGs not scrubbed since 0.000000 added

#6 Updated by David Zafman almost 2 years ago

  • Related to deleted (Bug #40066: PGs not scrubbed since 0.000000)

#7 Updated by David Zafman almost 2 years ago

  • Backport set to nautilus

#8 Updated by David Zafman almost 2 years ago

  • Pull request ID set to 28334

#9 Updated by David Zafman almost 2 years ago

  • Status changed from New to In Progress

#10 Updated by David Zafman almost 2 years ago

  • Status changed from In Progress to Pending Backport

#11 Updated by David Zafman almost 2 years ago

  • Copied to Backport #40667: nautilus: PG scrub stamps reset to 0.000000 added

#12 Updated by David Zafman over 1 year ago

  • Status changed from Pending Backport to Resolved

Also available in: Atom PDF