Project

General

Profile

Bug #40073

PG scrub stamps reset to 0.000000

Added by Greg Farnum about 2 months ago. Updated 12 days ago.

Status:
Pending Backport
Priority:
Normal
Assignee:
Category:
Scrub/Repair
Target version:
-
Start date:
05/29/2019
Due date:
% Done:

0%

Source:
Tags:
Backport:
nautilus
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:

Description

From Ceph-users, https://www.spinics.net/lists/ceph-users/msg52869.html

After upgrading from 14.2.0 to 14.2.1, I've noticed PGs are frequently resetting their scrub and deep scrub time stamps to 0.000000. It's extra strange because the peers show timestamps for deep scrubs.

  1. First entry from a pg list at 7pm
    $ grep 11.2f2 ~/pgs-active.7pm
    11.2f2 691 0 0 0 2897477632 0 0 2091 active+clean 3h 7378'12291 8048:36261 [1,6,37]p1 [1,6,37]p1 2019-05-14 21:01:29.172460 2019-05-14 21:01:29.172460
  1. Next Entry 3 minutes later
    $ ceph pg ls active |grep 11.2f2
    11.2f2 695 0 0 0 2914713600 0 0 2091 active+clean 6s 7378'12291 8049:36330 [1,6,37]p1 [1,6,37]p1 0.000000 0.000000

Not sure if it's related, but this only happens to PG's who's primary OSD is one where osd_numa_node has been set.


Related issues

Duplicated by Ceph - Bug #40066: PGs not scrubbed since 0.000000 Duplicate 05/29/2019
Copied to RADOS - Backport #40667: nautilus: PG scrub stamps reset to 0.000000 In Progress

History

#1 Updated by Greg Farnum about 2 months ago

A full pg query:

## PG Query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 8049,
    "up": [
        1,
        6,
        37
    ],
    "acting": [
        1,
        6,
        37
    ],
    "acting_recovery_backfill": [
        "1",
        "6",
        "37" 
    ],
    "info": {
        "pgid": "11.2f2",
        "last_update": "7378'12291",
        "last_complete": "7378'12291",
        "log_tail": "1087'10200",
        "last_user_version": 12291,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 1,
        "purged_snaps": [],
        "history": {
            "epoch_created": 1549,
            "epoch_pool_created": 216,
            "last_epoch_started": 6148,
            "last_interval_started": 6147,
            "last_epoch_clean": 6148,
            "last_interval_clean": 6147,
            "last_epoch_split": 6147,
            "last_epoch_marked_full": 0,
            "same_up_since": 6126,
            "same_interval_since": 6147,
            "same_primary_since": 6126,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
        },
        "stats": {
            "version": "7378'12291",
            "reported_seq": "36700",
            "reported_epoch": "8049",
            "state": "active+clean",
            "last_fresh": "2019-05-15 23:08:17.014609",
            "last_change": "2019-05-15 23:08:17.014609",
            "last_active": "2019-05-15 23:08:17.014609",
            "last_peered": "2019-05-15 23:08:17.014609",
            "last_clean": "2019-05-15 23:08:17.014609",
            "last_became_active": "2019-05-15 19:25:01.484322",
            "last_became_peered": "2019-05-15 19:25:01.484322",
            "last_unstale": "2019-05-15 23:08:17.014609",
            "last_undegraded": "2019-05-15 23:08:17.014609",
            "last_fullsized": "2019-05-15 23:08:17.014609",
            "mapping_epoch": 6126,
            "log_start": "1087'10200",
            "ondisk_log_start": "1087'10200",
            "created": 1549,
            "last_epoch_clean": 6148,
            "parent": "0.0",
            "parent_split_bits": 10,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575",
            "log_size": 2091,
            "ondisk_log_size": 2091,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "manifest_stats_invalid": true,
            "snaptrimq_len": 0,
            "stat_sum": {
                "num_bytes": 2914713600,
                "num_objects": 695,
                "num_object_clones": 0,
                "num_object_copies": 2085,
                "num_objects_missing_on_primary": 0,
                "num_objects_missing": 0,
                "num_objects_degraded": 0,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 0,
                "num_objects_dirty": 695,
                "num_whiteouts": 0,
                "num_read": 0,
                "num_read_kb": 0,
                "num_write": 0,
                "num_write_kb": 0,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 0,
                "num_bytes_recovered": 0,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0,
                "num_large_omap_objects": 0,
                "num_objects_manifest": 0,
                "num_omap_bytes": 0,
                "num_omap_keys": 0,
                "num_objects_repaired": 0
            },
            "up": [
                1,
                6,
                37
            ],
            "acting": [
                1,
                6,
                37
            ],
            "blocked_by": [],
            "up_primary": 1,
            "acting_primary": 1,
            "purged_snaps": []
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 6148,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "6",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12264",
                "reported_seq": "34823",
                "reported_epoch": "6125",
                "state": "active+undersized+degraded",
                "last_fresh": "2019-05-15 19:16:05.309556",
                "last_change": "2019-05-15 19:15:59.342872",
                "last_active": "2019-05-15 19:16:05.309556",
                "last_peered": "2019-05-15 19:16:05.309556",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-15 19:15:59.342872",
                "last_became_peered": "2019-05-15 19:15:59.342872",
                "last_unstale": "2019-05-15 19:16:05.309556",
                "last_undegraded": "2019-05-15 19:15:59.341684",
                "last_fullsized": "2019-05-15 19:15:59.340270",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6115,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2064,
                "ondisk_log_size": 2064,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1992,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 664,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 129,
                    "num_bytes_recovered": 540901376,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        },
        {
            "peer": "37",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12263",
                "reported_seq": "34275",
                "reported_epoch": "6103",
                "state": "active+clean",
                "last_fresh": "2019-05-15 17:23:41.956141",
                "last_change": "2019-05-14 21:01:29.172506",
                "last_active": "2019-05-15 17:23:41.956141",
                "last_peered": "2019-05-15 17:23:41.956141",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-14 20:16:44.641437",
                "last_became_peered": "2019-05-14 20:16:44.641437",
                "last_unstale": "2019-05-15 17:23:41.956141",
                "last_undegraded": "2019-05-15 17:23:41.956141",
                "last_fullsized": "2019-05-15 17:23:41.956141",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6010,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2063,
                "ondisk_log_size": 2063,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1993,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 0,
                    "num_bytes_recovered": 0,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2019-05-15 19:25:01.479877",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "6147",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2019-05-15 19:25:00.557042" 
        }
    ],
    "agent_state": {}
}

#2 Updated by Nathan Cutler about 2 months ago

The similarity to #40066 is so striking I just had to mention it and create a "Relates to" link.

#3 Updated by Nathan Cutler about 2 months ago

  • Related to Bug #40066: PGs not scrubbed since 0.000000 added

#4 Updated by David Zafman about 2 months ago

  • Assignee set to David Zafman

When auto repair is enabled a bug causes a regular scrub to reset time stamps which is only intended to happen when there were errors. This internally forces a deep-scrub in order to eventually auto repair.

#5 Updated by David Zafman about 2 months ago

  • Duplicated by Bug #40066: PGs not scrubbed since 0.000000 added

#6 Updated by David Zafman about 2 months ago

  • Related to deleted (Bug #40066: PGs not scrubbed since 0.000000)

#7 Updated by David Zafman about 2 months ago

  • Backport set to nautilus

#8 Updated by David Zafman about 2 months ago

  • Pull request ID set to 28334

#9 Updated by David Zafman about 2 months ago

  • Status changed from New to In Progress

#10 Updated by David Zafman 12 days ago

  • Status changed from In Progress to Pending Backport

#11 Updated by David Zafman 12 days ago

  • Copied to Backport #40667: nautilus: PG scrub stamps reset to 0.000000 added

Also available in: Atom PDF