Project

General

Profile

Bug #40073

PG scrub stamps reset to 0.000000

Added by Greg Farnum 4 months ago. Updated about 1 month ago.

Status:
Resolved
Priority:
Normal
Assignee:
Category:
Scrub/Repair
Target version:
-
Start date:
05/29/2019
Due date:
% Done:

0%

Source:
Tags:
Backport:
nautilus
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:

Description

From Ceph-users, https://www.spinics.net/lists/ceph-users/msg52869.html

After upgrading from 14.2.0 to 14.2.1, I've noticed PGs are frequently resetting their scrub and deep scrub time stamps to 0.000000. It's extra strange because the peers show timestamps for deep scrubs.

  1. First entry from a pg list at 7pm
    $ grep 11.2f2 ~/pgs-active.7pm
    11.2f2 691 0 0 0 2897477632 0 0 2091 active+clean 3h 7378'12291 8048:36261 [1,6,37]p1 [1,6,37]p1 2019-05-14 21:01:29.172460 2019-05-14 21:01:29.172460
  1. Next Entry 3 minutes later
    $ ceph pg ls active |grep 11.2f2
    11.2f2 695 0 0 0 2914713600 0 0 2091 active+clean 6s 7378'12291 8049:36330 [1,6,37]p1 [1,6,37]p1 0.000000 0.000000

Not sure if it's related, but this only happens to PG's who's primary OSD is one where osd_numa_node has been set.


Related issues

Duplicated by Ceph - Bug #40066: PGs not scrubbed since 0.000000 Duplicate 05/29/2019
Copied to RADOS - Backport #40667: nautilus: PG scrub stamps reset to 0.000000 Resolved

History

#1 Updated by Greg Farnum 4 months ago

A full pg query:

## PG Query
{
    "state": "active+clean",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 8049,
    "up": [
        1,
        6,
        37
    ],
    "acting": [
        1,
        6,
        37
    ],
    "acting_recovery_backfill": [
        "1",
        "6",
        "37" 
    ],
    "info": {
        "pgid": "11.2f2",
        "last_update": "7378'12291",
        "last_complete": "7378'12291",
        "log_tail": "1087'10200",
        "last_user_version": 12291,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 1,
        "purged_snaps": [],
        "history": {
            "epoch_created": 1549,
            "epoch_pool_created": 216,
            "last_epoch_started": 6148,
            "last_interval_started": 6147,
            "last_epoch_clean": 6148,
            "last_interval_clean": 6147,
            "last_epoch_split": 6147,
            "last_epoch_marked_full": 0,
            "same_up_since": 6126,
            "same_interval_since": 6147,
            "same_primary_since": 6126,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
        },
        "stats": {
            "version": "7378'12291",
            "reported_seq": "36700",
            "reported_epoch": "8049",
            "state": "active+clean",
            "last_fresh": "2019-05-15 23:08:17.014609",
            "last_change": "2019-05-15 23:08:17.014609",
            "last_active": "2019-05-15 23:08:17.014609",
            "last_peered": "2019-05-15 23:08:17.014609",
            "last_clean": "2019-05-15 23:08:17.014609",
            "last_became_active": "2019-05-15 19:25:01.484322",
            "last_became_peered": "2019-05-15 19:25:01.484322",
            "last_unstale": "2019-05-15 23:08:17.014609",
            "last_undegraded": "2019-05-15 23:08:17.014609",
            "last_fullsized": "2019-05-15 23:08:17.014609",
            "mapping_epoch": 6126,
            "log_start": "1087'10200",
            "ondisk_log_start": "1087'10200",
            "created": 1549,
            "last_epoch_clean": 6148,
            "parent": "0.0",
            "parent_split_bits": 10,
            "last_scrub": "7378'12291",
            "last_scrub_stamp": "0.000000",
            "last_deep_scrub": "6103'12186",
            "last_deep_scrub_stamp": "0.000000",
            "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575",
            "log_size": 2091,
            "ondisk_log_size": 2091,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "manifest_stats_invalid": true,
            "snaptrimq_len": 0,
            "stat_sum": {
                "num_bytes": 2914713600,
                "num_objects": 695,
                "num_object_clones": 0,
                "num_object_copies": 2085,
                "num_objects_missing_on_primary": 0,
                "num_objects_missing": 0,
                "num_objects_degraded": 0,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 0,
                "num_objects_dirty": 695,
                "num_whiteouts": 0,
                "num_read": 0,
                "num_read_kb": 0,
                "num_write": 0,
                "num_write_kb": 0,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 0,
                "num_bytes_recovered": 0,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0,
                "num_large_omap_objects": 0,
                "num_objects_manifest": 0,
                "num_omap_bytes": 0,
                "num_omap_keys": 0,
                "num_objects_repaired": 0
            },
            "up": [
                1,
                6,
                37
            ],
            "acting": [
                1,
                6,
                37
            ],
            "blocked_by": [],
            "up_primary": 1,
            "acting_primary": 1,
            "purged_snaps": []
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 6148,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "6",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12264",
                "reported_seq": "34823",
                "reported_epoch": "6125",
                "state": "active+undersized+degraded",
                "last_fresh": "2019-05-15 19:16:05.309556",
                "last_change": "2019-05-15 19:15:59.342872",
                "last_active": "2019-05-15 19:16:05.309556",
                "last_peered": "2019-05-15 19:16:05.309556",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-15 19:15:59.342872",
                "last_became_peered": "2019-05-15 19:15:59.342872",
                "last_unstale": "2019-05-15 19:16:05.309556",
                "last_undegraded": "2019-05-15 19:15:59.341684",
                "last_fullsized": "2019-05-15 19:15:59.340270",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6115,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2064,
                "ondisk_log_size": 2064,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1992,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 664,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 129,
                    "num_bytes_recovered": 540901376,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        },
        {
            "peer": "37",
            "pgid": "11.2f2",
            "last_update": "7378'12291",
            "last_complete": "7378'12291",
            "log_tail": "1087'10200",
            "last_user_version": 12264,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1549,
                "epoch_pool_created": 216,
                "last_epoch_started": 6148,
                "last_interval_started": 6147,
                "last_epoch_clean": 6148,
                "last_interval_clean": 6147,
                "last_epoch_split": 6147,
                "last_epoch_marked_full": 0,
                "same_up_since": 6126,
                "same_interval_since": 6147,
                "same_primary_since": 6126,
                "last_scrub": "7378'12291",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-15 23:08:17.014575" 
            },
            "stats": {
                "version": "6103'12263",
                "reported_seq": "34275",
                "reported_epoch": "6103",
                "state": "active+clean",
                "last_fresh": "2019-05-15 17:23:41.956141",
                "last_change": "2019-05-14 21:01:29.172506",
                "last_active": "2019-05-15 17:23:41.956141",
                "last_peered": "2019-05-15 17:23:41.956141",
                "last_clean": "2019-05-15 17:23:41.956141",
                "last_became_active": "2019-05-14 20:16:44.641437",
                "last_became_peered": "2019-05-14 20:16:44.641437",
                "last_unstale": "2019-05-15 17:23:41.956141",
                "last_undegraded": "2019-05-15 17:23:41.956141",
                "last_fullsized": "2019-05-15 17:23:41.956141",
                "mapping_epoch": 6126,
                "log_start": "1087'10200",
                "ondisk_log_start": "1087'10200",
                "created": 1549,
                "last_epoch_clean": 6010,
                "parent": "0.0",
                "parent_split_bits": 10,
                "last_scrub": "6103'12186",
                "last_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_deep_scrub": "6103'12186",
                "last_deep_scrub_stamp": "2019-05-14 21:01:29.172460",
                "last_clean_scrub_stamp": "2019-05-14 21:01:29.172460",
                "log_size": 2063,
                "ondisk_log_size": 2063,
                "stats_invalid": true,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "manifest_stats_invalid": true,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 2784231424,
                    "num_objects": 664,
                    "num_object_clones": 0,
                    "num_object_copies": 1993,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 664,
                    "num_whiteouts": 0,
                    "num_read": 93,
                    "num_read_kb": 350209,
                    "num_write": 153,
                    "num_write_kb": 604160,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 0,
                    "num_bytes_recovered": 0,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0,
                    "num_objects_manifest": 0,
                    "num_omap_bytes": 0,
                    "num_omap_keys": 0,
                    "num_objects_repaired": 0
                },
                "up": [
                    1,
                    6,
                    37
                ],
                "acting": [
                    1,
                    6,
                    37
                ],
                "blocked_by": [],
                "up_primary": 1,
                "acting_primary": 1,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 6148,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2019-05-15 19:25:01.479877",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "6147",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2019-05-15 19:25:00.557042" 
        }
    ],
    "agent_state": {}
}

#2 Updated by Nathan Cutler 4 months ago

The similarity to #40066 is so striking I just had to mention it and create a "Relates to" link.

#3 Updated by Nathan Cutler 4 months ago

  • Related to Bug #40066: PGs not scrubbed since 0.000000 added

#4 Updated by David Zafman 4 months ago

  • Assignee set to David Zafman

When auto repair is enabled a bug causes a regular scrub to reset time stamps which is only intended to happen when there were errors. This internally forces a deep-scrub in order to eventually auto repair.

#5 Updated by David Zafman 4 months ago

  • Duplicated by Bug #40066: PGs not scrubbed since 0.000000 added

#6 Updated by David Zafman 4 months ago

  • Related to deleted (Bug #40066: PGs not scrubbed since 0.000000)

#7 Updated by David Zafman 4 months ago

  • Backport set to nautilus

#8 Updated by David Zafman 4 months ago

  • Pull request ID set to 28334

#9 Updated by David Zafman 4 months ago

  • Status changed from New to In Progress

#10 Updated by David Zafman 3 months ago

  • Status changed from In Progress to Pending Backport

#11 Updated by David Zafman 3 months ago

  • Copied to Backport #40667: nautilus: PG scrub stamps reset to 0.000000 added

#12 Updated by David Zafman about 1 month ago

  • Status changed from Pending Backport to Resolved

Also available in: Atom PDF