Actions
Bug #21887
Closed: degraded calculation is off during backfill
Status:
Duplicate
Priority:
High
Assignee:
-
Category:
-
Target version:
-
% Done:
0%
Source:
Tags:
Backport:
luminous
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
The PG is active+remapped+backfill_wait. There are 2 backfill targets, and 3 acting which are all up to date. There should not be any degraded objects here... but the PG reports 8.
The full dump:
{ "state": "active+remapped+backfill_wait", "snap_trimq": "[]", "epoch": 854065, "up": [ 10, 13, 28 ], "acting": [ 10, 7, 75 ], "backfill_targets": [ "13", "28" ], "actingbackfill": [ "7", "10", "13", "28", "75" ], "info": { "pgid": "0.3c6", "last_update": "854065'4050203", "last_complete": "854065'4050203", "log_tail": "850240'4048688", "last_user_version": 4050203, "last_backfill": "MAX", "last_backfill_bitwise": 1, "purged_snaps": [], "history": { "epoch_created": 1, "epoch_pool_created": 1, "last_epoch_started": 853926, "last_interval_started": 853925, "last_epoch_clean": 853782, "last_interval_clean": 853780, "last_epoch_split": 0, "last_epoch_marked_full": 819533, "same_up_since": 853924, "same_interval_since": 853925, "same_primary_since": 853611, "last_scrub": "851336'4048912", "last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" }, "stats": { "version": "854065'4050203", "reported_seq": "5305940", "reported_epoch": "854065", "state": "active+remapped+backfill_wait", "last_fresh": "2017-10-21 15:57:36.945099", "last_change": "2017-10-21 15:34:22.132320", "last_active": "2017-10-21 15:57:36.945099", "last_peered": "2017-10-21 15:57:36.945099", "last_clean": "2017-10-21 15:32:37.582274", "last_became_active": "2017-10-21 15:32:40.428219", "last_became_peered": "2017-10-21 15:32:40.428219", "last_unstale": "2017-10-21 15:57:36.945099", "last_undegraded": "2017-10-21 15:57:36.945099", "last_fullsized": "2017-10-21 15:57:36.945099", "mapping_epoch": 853925, "log_start": "850240'4048688", "ondisk_log_start": "850240'4048688", "created": 1, "last_epoch_clean": 853782, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "851336'4048912", "last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 
20:46:48.495908", "log_size": 1515, "ondisk_log_size": 1515, "stats_invalid": false, "dirty_stats_invalid": true, "omap_stats_invalid": true, "hitset_stats_invalid": true, "hitset_bytes_stats_invalid": true, "pin_stats_invalid": true, "stat_sum": { "num_bytes": 12460904141, "num_objects": 11087, "num_object_clones": 0, "num_object_copies": 33261, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 10, "num_objects_misplaced": 22168, "num_objects_unfound": 0, "num_objects_dirty": 9504, "num_whiteouts": 0, "num_read": 121131, "num_read_kb": 28091002, "num_write": 3771682, "num_write_kb": 379407465, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 366236, "num_bytes_recovered": 307583669875, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [ 10, 13, 28 ], "acting": [ 10, 7, 75 ], "blocked_by": [], "up_primary": 10, "acting_primary": 10 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 853926, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, "peer_info": [ { "peer": "7", "pgid": "0.3c6", "last_update": "854065'4050203", "last_complete": "854065'4050203", "log_tail": "850240'4048688", "last_user_version": 4050191, "last_backfill": "MAX", "last_backfill_bitwise": 1, "purged_snaps": [], "history": { "epoch_created": 1, "epoch_pool_created": 1, "last_epoch_started": 853926, "last_interval_started": 853925, "last_epoch_clean": 853782, "last_interval_clean": 853780, "last_epoch_split": 0, "last_epoch_marked_full": 819533, "same_up_since": 853924, "same_interval_since": 853925, "same_primary_since": 853611, "last_scrub": "851336'4048912", 
"last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" }, "stats": { "version": "853899'4050190", "reported_seq": "5305749", "reported_epoch": "853919", "state": "active+clean", "last_fresh": "2017-10-21 15:27:43.404739", "last_change": "2017-10-21 15:01:38.568621", "last_active": "2017-10-21 15:27:43.404739", "last_peered": "2017-10-21 15:27:43.404739", "last_clean": "2017-10-21 15:27:43.404739", "last_became_active": "2017-10-21 15:01:38.568333", "last_became_peered": "2017-10-21 15:01:38.568333", "last_unstale": "2017-10-21 15:27:43.404739", "last_undegraded": "2017-10-21 15:27:43.404739", "last_fullsized": "2017-10-21 15:27:43.404739", "mapping_epoch": 853925, "log_start": "850240'4048688", "ondisk_log_start": "850240'4048688", "created": 1, "last_epoch_clean": 853782, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "851336'4048912", "last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908", "log_size": 1502, "ondisk_log_size": 1502, "stats_invalid": false, "dirty_stats_invalid": true, "omap_stats_invalid": true, "hitset_stats_invalid": true, "hitset_bytes_stats_invalid": true, "pin_stats_invalid": true, "stat_sum": { "num_bytes": 12439926674, "num_objects": 11079, "num_object_clones": 0, "num_object_copies": 33234, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 9496, "num_whiteouts": 0, "num_read": 121131, "num_read_kb": 28091002, "num_write": 3771666, "num_write_kb": 379386979, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 366232, "num_bytes_recovered": 307575227822, "num_keys_recovered": 0, 
"num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [ 10, 13, 28 ], "acting": [ 10, 7, 75 ], "blocked_by": [], "up_primary": 10, "acting_primary": 10 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 853926, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, { "peer": "13", "pgid": "0.3c6", "last_update": "854065'4050203", "last_complete": "854065'4050203", "log_tail": "850240'4048691", "last_user_version": 0, "last_backfill": "0:63c00162:::10000dcbbc8.00000000:head", "last_backfill_bitwise": 1, "purged_snaps": [], "history": { "epoch_created": 1, "epoch_pool_created": 1, "last_epoch_started": 853926, "last_interval_started": 853925, "last_epoch_clean": 853782, "last_interval_clean": 853780, "last_epoch_split": 0, "last_epoch_marked_full": 819533, "same_up_since": 853924, "same_interval_since": 853925, "same_primary_since": 853611, "last_scrub": "851336'4048912", "last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" }, "stats": { "version": "0'0", "reported_seq": "0", "reported_epoch": "0", "state": "unknown", "last_fresh": "0.000000", "last_change": "0.000000", "last_active": "0.000000", "last_peered": "0.000000", "last_clean": "0.000000", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "0.000000", "last_undegraded": "0.000000", "last_fullsized": "0.000000", "mapping_epoch": 0, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 0, "last_epoch_clean": 0, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "0.000000", "last_deep_scrub": 
"0'0", "last_deep_scrub_stamp": "0.000000", "last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "stat_sum": { "num_bytes": 4247749, "num_objects": 3, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 1, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [], "acting": [], "blocked_by": [], "up_primary": -1, "acting_primary": -1 }, "empty": 0, "dne": 0, "incomplete": 1, "last_epoch_started": 853926, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, { "peer": "28", "pgid": "0.3c6", "last_update": "854065'4050203", "last_complete": "854065'4050203", "log_tail": "850240'4048691", "last_user_version": 0, "last_backfill": "0:63c00162:::10000dcbbc8.00000000:head", "last_backfill_bitwise": 1, "purged_snaps": [], "history": { "epoch_created": 1, "epoch_pool_created": 1, "last_epoch_started": 853926, "last_interval_started": 853925, "last_epoch_clean": 853782, "last_interval_clean": 853780, "last_epoch_split": 0, "last_epoch_marked_full": 819533, "same_up_since": 853924, "same_interval_since": 853925, "same_primary_since": 853611, "last_scrub": "851336'4048912", 
"last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" }, "stats": { "version": "0'0", "reported_seq": "0", "reported_epoch": "0", "state": "unknown", "last_fresh": "0.000000", "last_change": "0.000000", "last_active": "0.000000", "last_peered": "0.000000", "last_clean": "0.000000", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "0.000000", "last_undegraded": "0.000000", "last_fullsized": "0.000000", "mapping_epoch": 0, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 0, "last_epoch_clean": 0, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "0.000000", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "0.000000", "last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "stat_sum": { "num_bytes": 4247749, "num_objects": 3, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 1, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [], "acting": [], "blocked_by": [], 
"up_primary": -1, "acting_primary": -1 }, "empty": 0, "dne": 0, "incomplete": 1, "last_epoch_started": 853926, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, { "peer": "75", "pgid": "0.3c6", "last_update": "854065'4050203", "last_complete": "854065'4050203", "log_tail": "850240'4048688", "last_user_version": 4050191, "last_backfill": "MAX", "last_backfill_bitwise": 1, "purged_snaps": [], "history": { "epoch_created": 1, "epoch_pool_created": 1, "last_epoch_started": 853926, "last_interval_started": 853925, "last_epoch_clean": 853782, "last_interval_clean": 853780, "last_epoch_split": 0, "last_epoch_marked_full": 819533, "same_up_since": 853924, "same_interval_since": 853925, "same_primary_since": 853611, "last_scrub": "851336'4048912", "last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908" }, "stats": { "version": "853899'4050190", "reported_seq": "5305749", "reported_epoch": "853919", "state": "active+clean", "last_fresh": "2017-10-21 15:27:43.404739", "last_change": "2017-10-21 15:01:38.568621", "last_active": "2017-10-21 15:27:43.404739", "last_peered": "2017-10-21 15:27:43.404739", "last_clean": "2017-10-21 15:27:43.404739", "last_became_active": "2017-10-21 15:01:38.568333", "last_became_peered": "2017-10-21 15:01:38.568333", "last_unstale": "2017-10-21 15:27:43.404739", "last_undegraded": "2017-10-21 15:27:43.404739", "last_fullsized": "2017-10-21 15:27:43.404739", "mapping_epoch": 853925, "log_start": "850240'4048688", "ondisk_log_start": "850240'4048688", "created": 1, "last_epoch_clean": 853782, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "851336'4048912", "last_scrub_stamp": "2017-10-20 20:46:48.495908", "last_deep_scrub": "842641'4043774", "last_deep_scrub_stamp": "2017-10-17 14:17:24.849677", "last_clean_scrub_stamp": "2017-10-20 20:46:48.495908", "log_size": 1502, 
"ondisk_log_size": 1502, "stats_invalid": false, "dirty_stats_invalid": true, "omap_stats_invalid": true, "hitset_stats_invalid": true, "hitset_bytes_stats_invalid": true, "pin_stats_invalid": true, "stat_sum": { "num_bytes": 12439926674, "num_objects": 11079, "num_object_clones": 0, "num_object_copies": 33234, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 9496, "num_whiteouts": 0, "num_read": 121131, "num_read_kb": 28091002, "num_write": 3771666, "num_write_kb": 379386979, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 366232, "num_bytes_recovered": 307575227822, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [ 10, 13, 28 ], "acting": [ 10, 7, 75 ], "blocked_by": [], "up_primary": 10, "acting_primary": 10 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 853926, "hit_set_history": { "current_last_update": "0'0", "history": [] } } ], ...
I think the problem is here:
// If this peer has more objects then it should, ignore them
backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
and here
// Include computed backfilled objects on up nodes
object_copies += backfilled;
because these backfill targets are beyond the PG size, so we don't actually care what progress they've made with respect to degraded objects... only misplaced ones.
This isn't just cosmetic/confusing... it will prevent the balancer from doing work when it could.
Updated by Sage Weil over 6 years ago
- Backport set to luminous
We should backport the fix to luminous. It is confusing/scary that the 'degraded' health warning comes up during a rebalance with no failures.
Updated by Sage Weil over 6 years ago
- Related to Bug #21803: objects degraded higher than 100% added
Actions