Project

General

Profile

Actions

Bug #52255

open

The pgs are degraded, but all the osds are up and there is no recovery or backfilling

Added by Ke Xiao over 2 years ago. Updated over 2 years ago.

Status:
Need More Info
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
rbd
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

I removed a server yesterday, but 6 pgs are stuck in the degraded state and no longer change.
The copy size of the pool is 3.
The health detail output shows:

HEALTH_WARN Degraded data redundancy: 7812/4856175 objects degraded (0.161%), 6 pgs degraded, 6 pgs undersized
PG_DEGRADED Degraded data redundancy: 7812/4856175 objects degraded (0.161%), 6 pgs degraded, 6 pgs undersized
    pg 3.1f is stuck undersized for 5227.725424, current state active+undersized+degraded, last acting [33,8]
    pg 3.3b is stuck undersized for 44976.167641, current state active+undersized+degraded, last acting [36,21]
    pg 3.3c is stuck undersized for 44976.174640, current state active+undersized+degraded, last acting [36,7]
    pg 3.4a is stuck undersized for 44976.171052, current state active+undersized+degraded, last acting [10,34]
    pg 3.66 is stuck undersized for 44976.168330, current state active+undersized+degraded, last acting [19,16]
    pg 3.7f is stuck undersized for 66349.043653, current state active+undersized+degraded, last acting [21,18]

The command `ceph pg dump_stuck` shows:

PG_STAT STATE                      UP      UP_PRIMARY ACTING  ACTING_PRIMARY
3.4a    active+undersized+degraded [10,34]         10 [10,34]             10
3.3c    active+undersized+degraded  [36,7]         36  [36,7]             36
3.3b    active+undersized+degraded [36,21]         36 [36,21]             36
3.1f    active+undersized+degraded  [33,8]         33  [33,8]             33
3.66    active+undersized+degraded [19,16]         19 [19,16]             19
3.7f    active+undersized+degraded [21,18]         21 [21,18]             21

The info for one of the pgs looks like this:

# ceph pg 3.4a query
{
    "state": "active+undersized+degraded",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 3087,
    "up": [
        10,
        34
    ],
    "acting": [
        10,
        34
    ],
    "actingbackfill": [
        "10",
        "34" 
    ],
    "info": {
        "pgid": "3.4a",
        "last_update": "3087'17416148",
        "last_complete": "3087'17416148",
        "log_tail": "2070'17406078",
        "last_user_version": 17416148,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 0,
        "purged_snaps": [
            {
                "start": "1",
                "length": "d" 
            },
            {
                "start": "13",
                "length": "2" 
            }
        ],
        "history": {
            "epoch_created": 144,
            "epoch_pool_created": 144,
            "last_epoch_started": 2075,
            "last_interval_started": 2074,
            "last_epoch_clean": 1378,
            "last_interval_clean": 1359,
            "last_epoch_split": 206,
            "last_epoch_marked_full": 0,
            "same_up_since": 2074,
            "same_interval_since": 2074,
            "same_primary_since": 582,
            "last_scrub": "1346'17384156",
            "last_scrub_stamp": "2021-08-12 01:00:46.854454",
            "last_deep_scrub": "1346'17363981",
            "last_deep_scrub_stamp": "2021-08-11 00:21:39.115330",
            "last_clean_scrub_stamp": "2021-08-12 01:00:46.854454" 
        },
        "stats": {
            "version": "3087'17416148",
            "reported_seq": "27965921",
            "reported_epoch": "3087",
            "state": "active+undersized+degraded",
            "last_fresh": "2021-08-13 11:10:19.963929",
            "last_change": "2021-08-12 22:37:42.523713",
            "last_active": "2021-08-13 11:10:19.963929",
            "last_peered": "2021-08-13 11:10:19.963929",
            "last_clean": "2021-08-12 22:37:39.959234",
            "last_became_active": "2021-08-12 22:37:42.523713",
            "last_became_peered": "2021-08-12 22:37:42.523713",
            "last_unstale": "2021-08-13 11:10:19.963929",
            "last_undegraded": "2021-08-12 22:37:42.511001",
            "last_fullsized": "2021-08-12 22:37:42.509855",
            "mapping_epoch": 2074,
            "log_start": "2070'17406078",
            "ondisk_log_start": "2070'17406078",
            "created": 144,
            "last_epoch_clean": 1378,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "1346'17384156",
            "last_scrub_stamp": "2021-08-12 01:00:46.854454",
            "last_deep_scrub": "1346'17363981",
            "last_deep_scrub_stamp": "2021-08-11 00:21:39.115330",
            "last_clean_scrub_stamp": "2021-08-12 01:00:46.854454",
            "log_size": 10070,
            "ondisk_log_size": 10070,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "snaptrimq_len": 0,
            "stat_sum": {
                "num_bytes": 5377932020,
                "num_objects": 1288,
                "num_object_clones": 1,
                "num_object_copies": 3864,
                "num_objects_missing_on_primary": 0,
                "num_objects_missing": 0,
                "num_objects_degraded": 1288,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 0,
                "num_objects_dirty": 1288,
                "num_whiteouts": 0,
                "num_read": 28230422,
                "num_read_kb": 111166288,
                "num_write": 17408569,
                "num_write_kb": 578323545,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 1279,
                "num_bytes_recovered": 5330412276,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0,
                "num_large_omap_objects": 0
            },
            "up": [
                10,
                34
            ],
            "acting": [
                10,
                34
            ],
            "blocked_by": [],
            "up_primary": 10,
            "acting_primary": 10
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 2075,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "34",
            "pgid": "3.4a",
            "last_update": "3087'17416148",
            "last_complete": "3087'17416148",
            "log_tail": "2070'17404578",
            "last_user_version": 17406149,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 1,
            "purged_snaps": [
                {
                    "start": "1",
                    "length": "d" 
                },
                {
                    "start": "13",
                    "length": "2" 
                }
            ],
            "history": {
                "epoch_created": 144,
                "epoch_pool_created": 144,
                "last_epoch_started": 2075,
                "last_interval_started": 2074,
                "last_epoch_clean": 1378,
                "last_interval_clean": 1359,
                "last_epoch_split": 206,
                "last_epoch_marked_full": 0,
                "same_up_since": 2074,
                "same_interval_since": 2074,
                "same_primary_since": 582,
                "last_scrub": "1346'17384156",
                "last_scrub_stamp": "2021-08-12 01:00:46.854454",
                "last_deep_scrub": "1346'17363981",
                "last_deep_scrub_stamp": "2021-08-11 00:21:39.115330",
                "last_clean_scrub_stamp": "2021-08-12 01:00:46.854454" 
            },
            "stats": {
                "version": "2073'17406148",
                "reported_seq": "27948804",
                "reported_epoch": "2073",
                "state": "active+clean",
                "last_fresh": "2021-08-12 22:37:39.944287",
                "last_change": "2021-08-12 16:47:34.937642",
                "last_active": "2021-08-12 22:37:39.944287",
                "last_peered": "2021-08-12 22:37:39.944287",
                "last_clean": "2021-08-12 22:37:39.944287",
                "last_became_active": "2021-08-12 16:43:30.452372",
                "last_became_peered": "2021-08-12 16:43:30.452372",
                "last_unstale": "2021-08-12 22:37:39.944287",
                "last_undegraded": "2021-08-12 22:37:39.944287",
                "last_fullsized": "2021-08-12 22:37:39.944287",
                "mapping_epoch": 2074,
                "log_start": "2070'17404578",
                "ondisk_log_start": "2070'17404578",
                "created": 144,
                "last_epoch_clean": 1378,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "1346'17384156",
                "last_scrub_stamp": "2021-08-12 01:00:46.854454",
                "last_deep_scrub": "1346'17363981",
                "last_deep_scrub_stamp": "2021-08-11 00:21:39.115330",
                "last_clean_scrub_stamp": "2021-08-12 01:00:46.854454",
                "log_size": 1570,
                "ondisk_log_size": 1570,
                "stats_invalid": false,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 5377932020,
                    "num_objects": 1288,
                    "num_object_clones": 1,
                    "num_object_copies": 3864,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 1288,
                    "num_whiteouts": 0,
                    "num_read": 28216747,
                    "num_read_kb": 111077141,
                    "num_write": 17398570,
                    "num_write_kb": 577991459,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 1279,
                    "num_bytes_recovered": 5330412276,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0
                },
                "up": [
                    10,
                    34
                ],
                "acting": [
                    10,
                    34
                ],
                "blocked_by": [],
                "up_primary": 10,
                "acting_primary": 10
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 2075,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started/Primary/Active",
            "enter_time": "2021-08-12 22:37:42.509959",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "803",
                "scrubber.active": false,
                "scrubber.state": "INACTIVE",
                "scrubber.start": "MIN",
                "scrubber.end": "MIN",
                "scrubber.max_end": "MIN",
                "scrubber.subset_last_update": "0'0",
                "scrubber.deep": false,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2021-08-12 22:37:41.514118" 
        }
    ],
    "agent_state": {}
}

What should I do? I hope to get your help.


Files

crush.map.txt (3.52 KB) crush.map.txt Ke Xiao, 08/17/2021 04:09 AM
osd.tree.txt (7.84 KB) osd.tree.txt Ke Xiao, 08/20/2021 04:14 PM
Actions #1

Updated by Ilya Dryomov over 2 years ago

  • Project changed from rbd to RADOS
Actions #2

Updated by Ke Xiao over 2 years ago

This is my crushmap

Actions #3

Updated by Neha Ojha over 2 years ago

can you share your osdmap? are all your osds up and in? the crushmap looks fine.

Actions #4

Updated by Neha Ojha over 2 years ago

  • Status changed from New to Need More Info
Actions #5

Updated by Ke Xiao over 2 years ago

Neha Ojha wrote:

can you share your osdmap? are all your osds up and in? the crushmap looks fine.

All the osds are up and in; this is my osd map.

Actions #6

Updated by Ke Xiao over 2 years ago

Neha Ojha wrote:

can you share your osdmap? are all your osds up and in? the crushmap looks fine.

I hope to get your help.

Actions

Also available in: Atom PDF