Project

General

Profile

Actions

Bug #21887

closed

degraded calculation is off during backfill

Added by Sage Weil over 6 years ago. Updated over 6 years ago.

Status:
Duplicate
Priority:
High
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
luminous
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

The PG is active+remapped+backfill_wait. There are 2 backfill targets, and 3 acting which are all up to date. There should not be any degraded objects here... but the PG reports 8.

The full dump:

{
    "state": "active+remapped+backfill_wait",
    "snap_trimq": "[]",
    "epoch": 854065,
    "up": [
        10,
        13,
        28
    ],
    "acting": [
        10,
        7,
        75
    ],
    "backfill_targets": [
        "13",
        "28" 
    ],
    "actingbackfill": [
        "7",
        "10",
        "13",
        "28",
        "75" 
    ],
    "info": {
        "pgid": "0.3c6",
        "last_update": "854065'4050203",
        "last_complete": "854065'4050203",
        "log_tail": "850240'4048688",
        "last_user_version": 4050203,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 1,
        "purged_snaps": [],
        "history": {
            "epoch_created": 1,
            "epoch_pool_created": 1,
            "last_epoch_started": 853926,
            "last_interval_started": 853925,
            "last_epoch_clean": 853782,
            "last_interval_clean": 853780,
            "last_epoch_split": 0,
            "last_epoch_marked_full": 819533,
            "same_up_since": 853924,
            "same_interval_since": 853925,
            "same_primary_since": 853611,
            "last_scrub": "851336'4048912",
            "last_scrub_stamp": "2017-10-20 20:46:48.495908",
            "last_deep_scrub": "842641'4043774",
            "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
            "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" 
        },
        "stats": {
            "version": "854065'4050203",
            "reported_seq": "5305940",
            "reported_epoch": "854065",
            "state": "active+remapped+backfill_wait",
            "last_fresh": "2017-10-21 15:57:36.945099",
            "last_change": "2017-10-21 15:34:22.132320",
            "last_active": "2017-10-21 15:57:36.945099",
            "last_peered": "2017-10-21 15:57:36.945099",
            "last_clean": "2017-10-21 15:32:37.582274",
            "last_became_active": "2017-10-21 15:32:40.428219",
            "last_became_peered": "2017-10-21 15:32:40.428219",
            "last_unstale": "2017-10-21 15:57:36.945099",
            "last_undegraded": "2017-10-21 15:57:36.945099",
            "last_fullsized": "2017-10-21 15:57:36.945099",
            "mapping_epoch": 853925,
            "log_start": "850240'4048688",
            "ondisk_log_start": "850240'4048688",
            "created": 1,
            "last_epoch_clean": 853782,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "851336'4048912",
            "last_scrub_stamp": "2017-10-20 20:46:48.495908",
            "last_deep_scrub": "842641'4043774",
            "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
            "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908",
            "log_size": 1515,
            "ondisk_log_size": 1515,
            "stats_invalid": false,
            "dirty_stats_invalid": true,
            "omap_stats_invalid": true,
            "hitset_stats_invalid": true,
            "hitset_bytes_stats_invalid": true,
            "pin_stats_invalid": true,
            "stat_sum": {
                "num_bytes": 12460904141,
                "num_objects": 11087,
                "num_object_clones": 0,
                "num_object_copies": 33261,
                "num_objects_missing_on_primary": 0,
                "num_objects_missing": 0,
                "num_objects_degraded": 10,
                "num_objects_misplaced": 22168,
                "num_objects_unfound": 0,
                "num_objects_dirty": 9504,
                "num_whiteouts": 0,
                "num_read": 121131,
                "num_read_kb": 28091002,
                "num_write": 3771682,
                "num_write_kb": 379407465,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 366236,
                "num_bytes_recovered": 307583669875,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0
            },
            "up": [
                10,
                13,
                28
            ],
            "acting": [
                10,
                7,
                75
            ],
            "blocked_by": [],
            "up_primary": 10,
            "acting_primary": 10
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 853926,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "7",
            "pgid": "0.3c6",
            "last_update": "854065'4050203",
            "last_complete": "854065'4050203",
            "log_tail": "850240'4048688",
            "last_user_version": 4050191,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1,
                "epoch_pool_created": 1,
                "last_epoch_started": 853926,
                "last_interval_started": 853925,
                "last_epoch_clean": 853782,
                "last_interval_clean": 853780,
                "last_epoch_split": 0,
                "last_epoch_marked_full": 819533,
                "same_up_since": 853924,
                "same_interval_since": 853925,
                "same_primary_since": 853611,
                "last_scrub": "851336'4048912",
                "last_scrub_stamp": "2017-10-20 20:46:48.495908",
                "last_deep_scrub": "842641'4043774",
                "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
                "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" 
            },
            "stats": {
                "version": "853899'4050190",
                "reported_seq": "5305749",
                "reported_epoch": "853919",
                "state": "active+clean",
                "last_fresh": "2017-10-21 15:27:43.404739",
                "last_change": "2017-10-21 15:01:38.568621",
                "last_active": "2017-10-21 15:27:43.404739",
                "last_peered": "2017-10-21 15:27:43.404739",
                "last_clean": "2017-10-21 15:27:43.404739",
                "last_became_active": "2017-10-21 15:01:38.568333",
                "last_became_peered": "2017-10-21 15:01:38.568333",
                "last_unstale": "2017-10-21 15:27:43.404739",
                "last_undegraded": "2017-10-21 15:27:43.404739",
                "last_fullsized": "2017-10-21 15:27:43.404739",
                "mapping_epoch": 853925,
                "log_start": "850240'4048688",
                "ondisk_log_start": "850240'4048688",
                "created": 1,
                "last_epoch_clean": 853782,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "851336'4048912",
                "last_scrub_stamp": "2017-10-20 20:46:48.495908",
                "last_deep_scrub": "842641'4043774",
                "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
                "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908",
                "log_size": 1502,
                "ondisk_log_size": 1502,
                "stats_invalid": false,
                "dirty_stats_invalid": true,
                "omap_stats_invalid": true,
                "hitset_stats_invalid": true,
                "hitset_bytes_stats_invalid": true,
                "pin_stats_invalid": true,
                "stat_sum": {
                    "num_bytes": 12439926674,
                    "num_objects": 11079,
                    "num_object_clones": 0,
                    "num_object_copies": 33234,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 9496,
                    "num_whiteouts": 0,
                    "num_read": 121131,
                    "num_read_kb": 28091002,
                    "num_write": 3771666,
                    "num_write_kb": 379386979,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 366232,
                    "num_bytes_recovered": 307575227822,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0
                },
                "up": [
                    10,
                    13,
                    28
                ],
                "acting": [
                    10,
                    7,
                    75
                ],
                "blocked_by": [],
                "up_primary": 10,
                "acting_primary": 10
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 853926,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        },
        {
            "peer": "13",
            "pgid": "0.3c6",
            "last_update": "854065'4050203",
            "last_complete": "854065'4050203",
            "log_tail": "850240'4048691",
            "last_user_version": 0,
            "last_backfill": "0:63c00162:::10000dcbbc8.00000000:head",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1,
                "epoch_pool_created": 1,
                "last_epoch_started": 853926,
                "last_interval_started": 853925,
                "last_epoch_clean": 853782,
                "last_interval_clean": 853780,
                "last_epoch_split": 0,
                "last_epoch_marked_full": 819533,
                "same_up_since": 853924,
                "same_interval_since": 853925,
                "same_primary_since": 853611,
                "last_scrub": "851336'4048912",
                "last_scrub_stamp": "2017-10-20 20:46:48.495908",
                "last_deep_scrub": "842641'4043774",
                "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
                "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" 
            },
            "stats": {
                "version": "0'0",
                "reported_seq": "0",
                "reported_epoch": "0",
                "state": "unknown",
                "last_fresh": "0.000000",
                "last_change": "0.000000",
                "last_active": "0.000000",
                "last_peered": "0.000000",
                "last_clean": "0.000000",
                "last_became_active": "0.000000",
                "last_became_peered": "0.000000",
                "last_unstale": "0.000000",
                "last_undegraded": "0.000000",
                "last_fullsized": "0.000000",
                "mapping_epoch": 0,
                "log_start": "0'0",
                "ondisk_log_start": "0'0",
                "created": 0,
                "last_epoch_clean": 0,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "0'0",
                "last_scrub_stamp": "0.000000",
                "last_deep_scrub": "0'0",
                "last_deep_scrub_stamp": "0.000000",
                "last_clean_scrub_stamp": "0.000000",
                "log_size": 0,
                "ondisk_log_size": 0,
                "stats_invalid": false,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "stat_sum": {
                    "num_bytes": 4247749,
                    "num_objects": 3,
                    "num_object_clones": 0,
                    "num_object_copies": 0,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 1,
                    "num_whiteouts": 0,
                    "num_read": 0,
                    "num_read_kb": 0,
                    "num_write": 0,
                    "num_write_kb": 0,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 0,
                    "num_bytes_recovered": 0,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0
                },
                "up": [],
                "acting": [],
                "blocked_by": [],
                "up_primary": -1,
                "acting_primary": -1
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 1,
            "last_epoch_started": 853926,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        },
        {
            "peer": "28",
            "pgid": "0.3c6",
            "last_update": "854065'4050203",
            "last_complete": "854065'4050203",
            "log_tail": "850240'4048691",
            "last_user_version": 0,
            "last_backfill": "0:63c00162:::10000dcbbc8.00000000:head",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1,
                "epoch_pool_created": 1,
                "last_epoch_started": 853926,
                "last_interval_started": 853925,
                "last_epoch_clean": 853782,
                "last_interval_clean": 853780,
                "last_epoch_split": 0,
                "last_epoch_marked_full": 819533,
                "same_up_since": 853924,
                "same_interval_since": 853925,
                "same_primary_since": 853611,
                "last_scrub": "851336'4048912",
                "last_scrub_stamp": "2017-10-20 20:46:48.495908",
                "last_deep_scrub": "842641'4043774",
                "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
                "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" 
            },
            "stats": {
                "version": "0'0",
                "reported_seq": "0",
                "reported_epoch": "0",
                "state": "unknown",
                "last_fresh": "0.000000",
                "last_change": "0.000000",
                "last_active": "0.000000",
                "last_peered": "0.000000",
                "last_clean": "0.000000",
                "last_became_active": "0.000000",
                "last_became_peered": "0.000000",
                "last_unstale": "0.000000",
                "last_undegraded": "0.000000",
                "last_fullsized": "0.000000",
                "mapping_epoch": 0,
                "log_start": "0'0",
                "ondisk_log_start": "0'0",
                "created": 0,
                "last_epoch_clean": 0,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "0'0",
                "last_scrub_stamp": "0.000000",
                "last_deep_scrub": "0'0",
                "last_deep_scrub_stamp": "0.000000",
                "last_clean_scrub_stamp": "0.000000",
                "log_size": 0,
                "ondisk_log_size": 0,
                "stats_invalid": false,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "stat_sum": {
                    "num_bytes": 4247749,
                    "num_objects": 3,
                    "num_object_clones": 0,
                    "num_object_copies": 0,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 1,
                    "num_whiteouts": 0,
                    "num_read": 0,
                    "num_read_kb": 0,
                    "num_write": 0,
                    "num_write_kb": 0,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 0,
                    "num_bytes_recovered": 0,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0
                },
                "up": [],
                "acting": [],
                "blocked_by": [],
                "up_primary": -1,
                "acting_primary": -1
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 1,
            "last_epoch_started": 853926,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        },
        {
            "peer": "75",
            "pgid": "0.3c6",
            "last_update": "854065'4050203",
            "last_complete": "854065'4050203",
            "log_tail": "850240'4048688",
            "last_user_version": 4050191,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [],
            "history": {
                "epoch_created": 1,
                "epoch_pool_created": 1,
                "last_epoch_started": 853926,
                "last_interval_started": 853925,
                "last_epoch_clean": 853782,
                "last_interval_clean": 853780,
                "last_epoch_split": 0,
                "last_epoch_marked_full": 819533,
                "same_up_since": 853924,
                "same_interval_since": 853925,
                "same_primary_since": 853611,
                "last_scrub": "851336'4048912",
                "last_scrub_stamp": "2017-10-20 20:46:48.495908",
                "last_deep_scrub": "842641'4043774",
                "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
                "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" 
            },
            "stats": {
                "version": "853899'4050190",
                "reported_seq": "5305749",
                "reported_epoch": "853919",
                "state": "active+clean",
                "last_fresh": "2017-10-21 15:27:43.404739",
                "last_change": "2017-10-21 15:01:38.568621",
                "last_active": "2017-10-21 15:27:43.404739",
                "last_peered": "2017-10-21 15:27:43.404739",
                "last_clean": "2017-10-21 15:27:43.404739",
                "last_became_active": "2017-10-21 15:01:38.568333",
                "last_became_peered": "2017-10-21 15:01:38.568333",
                "last_unstale": "2017-10-21 15:27:43.404739",
                "last_undegraded": "2017-10-21 15:27:43.404739",
                "last_fullsized": "2017-10-21 15:27:43.404739",
                "mapping_epoch": 853925,
                "log_start": "850240'4048688",
                "ondisk_log_start": "850240'4048688",
                "created": 1,
                "last_epoch_clean": 853782,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "851336'4048912",
                "last_scrub_stamp": "2017-10-20 20:46:48.495908",
                "last_deep_scrub": "842641'4043774",
                "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677",
                "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908",
                "log_size": 1502,
                "ondisk_log_size": 1502,
                "stats_invalid": false,
                "dirty_stats_invalid": true,
                "omap_stats_invalid": true,
                "hitset_stats_invalid": true,
                "hitset_bytes_stats_invalid": true,
                "pin_stats_invalid": true,
                "stat_sum": {
                    "num_bytes": 12439926674,
                    "num_objects": 11079,
                    "num_object_clones": 0,
                    "num_object_copies": 33234,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 9496,
                    "num_whiteouts": 0,
                    "num_read": 121131,
                    "num_read_kb": 28091002,
                    "num_write": 3771666,
                    "num_write_kb": 379386979,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 366232,
                    "num_bytes_recovered": 307575227822,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0
                },
                "up": [
                    10,
                    13,
                    28
                ],
                "acting": [
                    10,
                    7,
                    75
                ],
                "blocked_by": [],
                "up_primary": 10,
                "acting_primary": 10
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 853926,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
...

I think the problem is here:

        // If this peer has more objects then it should, ignore them
        backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);

and here:
    // Include computed backfilled objects on up nodes
    object_copies += backfilled;

because these backfill targets are beyond the pg size, so we don't actually care what progress they've made wrt degraded... only misplaced.

This isn't just cosmetic/confusing... it will prevent the balancer from doing work when it could.


Related issues 1 (0 open, 1 closed)

Related to Ceph - Bug #21803: objects degraded higher than 100% (Resolved, David Zafman, 10/13/2017)

Actions
Actions #1

Updated by Sage Weil over 6 years ago

  • Backport set to luminous

We should backport the fix to luminous. It is confusing/scary that the 'degraded' health warning comes up during a rebalance with no failures.

Actions #2

Updated by Sage Weil over 6 years ago

  • Status changed from 12 to Duplicate
Actions #3

Updated by Sage Weil over 6 years ago

  • Related to Bug #21803: objects degraded higher than 100% added
Actions

Also available in: Atom PDF