Project

General

Profile

Bug #40073

PG scrub stamps reset to 0.000000

Added by Greg Farnum about 1 year ago. Updated 9 months ago.

Status:
Resolved
Priority:
Normal
Assignee:
Category:
Scrub/Repair
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
nautilus
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:
Crash signature:

Description

From Ceph-users, https://www.spinics.net/lists/ceph-users/msg52869.html

After upgrading from 14.2.0 to 14.2.1, I've noticed PGs are frequently resetting their scrub and deep scrub time stamps to 0.000000. It's extra strange because the peers show timestamps for deep scrubs.

  1. First entry from a pg list at 7pm
    $ grep 11.2f2 ~/pgs-active.7pm
    11.2f2 691 0 0 0 2897477632 0 0 2091 active+clean 3h 7378'12291 8048:36261 [1,6,37]p1 [1,6,37]p1 2019-05-14 21:01:29.172460 2019-05-14 21:01:29.172460
  1. Next Entry 3 minutes later
    $ ceph pg ls active |grep 11.2f2
    11.2f2 695 0 0 0 2914713600 0 0 2091 active+clean 6s 7378'12291 8049:36330 [1,6,37]p1 [1,6,37]p1 0.000000 0.000000

Not sure if it's related, but this only happens to PG's who's primary OSD is one where osd_numa_node has been set.


Related issues

Duplicated by Ceph - Bug #40066: PGs not scrubbed since 0.000000 Duplicate 05/29/2019
Copied to RADOS - Backport #40667: nautilus: PG scrub stamps reset to 0.000000 Resolved

History

#1 Updated by Greg Farnum about 1 year ago

A full pg query:

## PG Query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 8049,
    "up": [
        1,
        6,
        37
    ],
    "acting": [
        1,
        6,
        37
    ],
    "acting_recovery_backfill": [
        "1",
        "6",
        "37" 
    ],
    "info": {
        "pgid": "11.2f2",
        "last_update": "7378'12291",
        "last_complete": "7378'12291",
        "log_tail": "1087'10200",
        "last_user_version": 12291,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 1,
        "purged_snaps": [],
        "history": {
            "epoch_created": 1549,
            "epoch_pool_created": 216,
            "last_epoch_started": 6148,
            "last_interval_started": 6147,
            "last_epoch_clean": 6148,
            "last_interval_clean": 6147,
            "last_epoch_split": 6147,
            "last_epoch_marked_full": 0,
            "same_up_since": 6126,
            "same_interval_since": 6147,
            "same_primary_since": 6126,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
        },
        "stats": {
            "version": "7378'12291",
            "reported_seq": "36700",
            "reported_epoch": "8049",
            "state": "active+clean",
            "last_fresh": "2019-05-15 23:08:17.014609",
            "last_change": "2019-05-15 23:08:17.014609",
            "last_active": "2019-05-15 23:08:17.014609",
            "last_peered": "2019-05-15 23:08:17.014609",
            "last_clean": "2019-05-15 23:08:17.014609",
            "last_became_active": "2019-05-15 19:25:01.484322",
            "last_became_peered": "2019-05-15 19:25:01.484322",
            "last_unstale": "2019-05-15 23:08:17.014609",
            "last_undegraded": "2019-05-15 23:08:17.014609",
            "last_fullsized": "2019-05-15 23:08:17.014609",
            "mapping_epoch": 6126,
            "log_start": "1087'10200",
            "ondisk_log_start": "1087'10200",
            "created": 1549,
            "last_epoch_clean": 6148,
            "parent": "0.0",
            "parent_split_bits": 10,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575",
            "log_size": 2091,
            "ondisk_log_size": 2091,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "manifest_stats_invalid": true,
            "snaptrimq_len": 0,
            "stat_sum": {
                "num_bytes": 2914713600,
                "num_objects": 695,
                "num_object_clones": 0,
                "num_object_copies": 2085,
                "num_objects_missing_on_primary": 0,
                "num_objects_missing": 0,
                "num_objects_degraded": 0,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 0,
                "num_objects_dirty": 695,
                "num_whiteouts": 0,
                "num_read": 0,
                "num_read_kb": 0,
                "num_write": 0,
                "num_write_kb": 0,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 0,
                "num_bytes_recovered": 0,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0,
                "num_large_omap_objects": 0,
                "num_objects_manifest": 0,
                "num_omap_bytes": 0,
                "num_omap_keys": 0,
                "num_objects_repaired": 0
            },
            "up": [
                1,
                6,
                37
            ],
            "acting": [
                1,
                6,
                37
            ],
            "blocked_by": [],
            "up_primary": 1,
            "acting_primary": 1,
            "purged_snaps": []
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 6148,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "6",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12264",
                "reported_seq": "34823",
                "reported_epoch": "6125",
                "state": "active+undersized+degraded",
                "last_fresh": "2019-05-15 19:16:05.309556",
                "last_change": "2019-05-15 19:15:59.342872",
                "last_active": "2019-05-15 19:16:05.309556",
                "last_peered": "2019-05-15 19:16:05.309556",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-15 19:15:59.342872",
                "last_became_peered": "2019-05-15 19:15:59.342872",
                "last_unstale": "2019-05-15 19:16:05.309556",
                "last_undegraded": "2019-05-15 19:15:59.341684",
                "last_fullsized": "2019-05-15 19:15:59.340270",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6115,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2064,
                "ondisk_log_size": 2064,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1992,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 664,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 129,
                    "num_bytes_recovered": 540901376,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        },
        {
            "peer": "37",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12263",
                "reported_seq": "34275",
                "reported_epoch": "6103",
                "state": "active+clean",
                "last_fresh": "2019-05-15 17:23:41.956141",
                "last_change": "2019-05-14 21:01:29.172506",
                "last_active": "2019-05-15 17:23:41.956141",
                "last_peered": "2019-05-15 17:23:41.956141",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-14 20:16:44.641437",
                "last_became_peered": "2019-05-14 20:16:44.641437",
                "last_unstale": "2019-05-15 17:23:41.956141",
                "last_undegraded": "2019-05-15 17:23:41.956141",
                "last_fullsized": "2019-05-15 17:23:41.956141",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6010,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2063,
                "ondisk_log_size": 2063,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1993,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 0,
                    "num_bytes_recovered": 0,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2019-05-15 19:25:01.479877",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "6147",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2019-05-15 19:25:00.557042" 
        }
    ],
    "agent_state": {}
}

#2 Updated by Nathan Cutler about 1 year ago

The similarity to #40066 is so striking I just had to mention it and create a "Relates to" link.

#3 Updated by Nathan Cutler about 1 year ago

  • Related to Bug #40066: PGs not scrubbed since 0.000000 added

#4 Updated by David Zafman about 1 year ago

  • Assignee set to David Zafman

When auto repair is enabled a bug causes a regular scrub to reset time stamps which is only intended to happen when there were errors. This internally forces a deep-scrub in order to eventually auto repair.

#5 Updated by David Zafman about 1 year ago

  • Duplicated by Bug #40066: PGs not scrubbed since 0.000000 added

#6 Updated by David Zafman about 1 year ago

  • Related to deleted (Bug #40066: PGs not scrubbed since 0.000000)

#7 Updated by David Zafman about 1 year ago

  • Backport set to nautilus

#8 Updated by David Zafman about 1 year ago

  • Pull request ID set to 28334

#9 Updated by David Zafman about 1 year ago

  • Status changed from New to In Progress

#10 Updated by David Zafman 11 months ago

  • Status changed from In Progress to Pending Backport

#11 Updated by David Zafman 11 months ago

  • Copied to Backport #40667: nautilus: PG scrub stamps reset to 0.000000 added

#12 Updated by David Zafman 9 months ago

  • Status changed from Pending Backport to Resolved

Also available in: Atom PDF