Project

General

Profile

Actions

Bug #23294

open

OSD booted with noup never got marked in; pgs stuck peering while osd up, but out

Added by Josh Durgin about 6 years ago. Updated about 6 years ago.

Status:
New
Priority:
Normal
Assignee:
-
Category:
Peering
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

http://pulpito.ceph.com/joshd-2018-03-09_22:47:53-rados-master-distro-basic-smithi/2273020/

This test restarts osd.0 while noup is set, then unsets noup, but osd.0 only goes up, not in, and pgs are stuck peering:

sudo ceph pg dump | grep peering
dumped all
2.10        438                  0        0         0       0 1794048 138      138      peering 2018-03-09 22:57:10.215720  21'438  101:541 [2,1]          2  [2,1]              2        0'0 2018-03-09 22:56:22.094590             0'0 2018-03-09 22:56:22.094590             0 
2.0         489                  0        0         0       0 2002944 189      189      peering 2018-03-09 22:57:10.198859  21'489  101:592 [3,1]          3  [3,1]              3        0'0 2018-03-09 22:56:22.094590             0'0 2018-03-09 22:56:22.094590             0 
2.2         493                  0        0         0       0 2019328 193      193      peering 2018-03-09 22:57:10.204127  21'493  101:596 [3,1]          3  [3,1]              3        0'0 2018-03-09 22:56:22.094590             0'0 2018-03-09 22:56:22.094590             0 
2.28        496                  0        0         0       0 2031616 196      196      peering 2018-03-09 22:57:10.209308  21'496  101:599 [3,1]          3  [3,1]              3        0'0 2018-03-09 22:56:22.094590             0'0 2018-03-09 22:56:22.094590

sudo ceph pg 2.0 query
{
    "state": "peering",
    "snap_trimq": "[]",
    "snap_trimq_len": 0,
    "epoch": 102,
    "up": [
        3,
        1
    ],
    "acting": [
        3,
        1
    ],
    "actingbackfill": [
        "1",
        "3" 
    ],
    "info": {
        "pgid": "2.0",
        "last_update": "21'489",
        "last_complete": "21'489",
        "log_tail": "15'300",
        "last_user_version": 489,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 0,
        "purged_snaps": [],
        "history": {
            "epoch_created": 13,
            "epoch_pool_created": 13,
            "last_epoch_started": 21,
            "last_interval_started": 20,
            "last_epoch_clean": 14,
            "last_interval_clean": 13,
            "last_epoch_split": 0,
            "last_epoch_marked_full": 0,
            "same_up_since": 23,
            "same_interval_since": 23,
            "same_primary_since": 13,
            "last_scrub": "0'0",
            "last_scrub_stamp": "2018-03-09 22:56:22.094590",
            "last_deep_scrub": "0'0",
            "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590",
            "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590" 
        },
        "stats": {
            "version": "21'489",
            "reported_seq": "593",
            "reported_epoch": "102",
            "state": "peering",
            "last_fresh": "2018-03-09 22:58:28.012227",
            "last_change": "2018-03-09 22:57:10.198859",
            "last_active": "2018-03-09 22:57:10.198383",
            "last_peered": "2018-03-09 22:57:05.275937",
            "last_clean": "2018-03-09 22:56:59.531392",
            "last_became_active": "2018-03-09 22:57:01.750668",
            "last_became_peered": "2018-03-09 22:57:01.750668",
            "last_unstale": "2018-03-09 22:58:28.012227",
            "last_undegraded": "2018-03-09 22:58:28.012227",
            "last_fullsized": "2018-03-09 22:58:28.012227",
            "mapping_epoch": 23,
            "log_start": "15'300",
            "ondisk_log_start": "15'300",
            "created": 13,
            "last_epoch_clean": 14,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "0'0",
            "last_scrub_stamp": "2018-03-09 22:56:22.094590",
            "last_deep_scrub": "0'0",
            "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590",
            "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590",
            "log_size": 189,
            "ondisk_log_size": 189,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "snaptrimq_len": 0,
            "stat_sum": {
                "num_bytes": 2002944,
                "num_objects": 489,
                "num_object_clones": 0,
                "num_object_copies": 978,
                "num_objects_missing_on_primary": 0,
                "num_objects_missing": 0,
                "num_objects_degraded": 0,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 0,
                "num_objects_dirty": 489,
                "num_whiteouts": 0,
                "num_read": 0,
                "num_read_kb": 0,
                "num_write": 978,
                "num_write_kb": 1956,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 0,
                "num_bytes_recovered": 0,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0,
                "num_large_omap_objects": 0
            },
            "up": [
                3,
                1
            ],
            "acting": [
                3,
                1
            ],
            "blocked_by": [
                1
            ],
            "up_primary": 3,
            "acting_primary": 3,
            "purged_snaps": []
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 21,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "1",
            "pgid": "2.0",
            "last_update": "15'464",
            "last_complete": "15'464",
            "log_tail": "15'300",
            "last_user_version": 464,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 0,
            "purged_snaps": [],
            "history": {
                "epoch_created": 13,
                "epoch_pool_created": 13,
                "last_epoch_started": 14,
                "last_interval_started": 13,
                "last_epoch_clean": 14,
                "last_interval_clean": 13,
                "last_epoch_split": 0,
                "last_epoch_marked_full": 0,
                "same_up_since": 23,
                "same_interval_since": 23,
                "same_primary_since": 13,
                "last_scrub": "0'0",
                "last_scrub_stamp": "2018-03-09 22:56:22.094590",
                "last_deep_scrub": "0'0",
                "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590",
                "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590" 
            },
            "stats": {
                "version": "15'463",
                "reported_seq": "473",
                "reported_epoch": "15",
                "state": "active+clean",
                "last_fresh": "2018-03-09 22:56:53.151346",
                "last_change": "2018-03-09 22:56:23.195100",
                "last_active": "2018-03-09 22:56:53.151346",
                "last_peered": "2018-03-09 22:56:53.151346",
                "last_clean": "2018-03-09 22:56:53.151346",
                "last_became_active": "2018-03-09 22:56:23.194656",
                "last_became_peered": "2018-03-09 22:56:23.194656",
                "last_unstale": "2018-03-09 22:56:53.151346",
                "last_undegraded": "2018-03-09 22:56:53.151346",
                "last_fullsized": "2018-03-09 22:56:53.151346",
                "mapping_epoch": 23,
                "log_start": "15'300",
                "ondisk_log_start": "15'300",
                "created": 13,
                "last_epoch_clean": 14,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "0'0",
                "last_scrub_stamp": "2018-03-09 22:56:22.094590",
                "last_deep_scrub": "0'0",
                "last_deep_scrub_stamp": "2018-03-09 22:56:22.094590",
                "last_clean_scrub_stamp": "2018-03-09 22:56:22.094590",
                "log_size": 163,
                "ondisk_log_size": 163,
                "stats_invalid": false,
                "dirty_stats_invalid": false,
                "omap_stats_invalid": false,
                "hitset_stats_invalid": false,
                "hitset_bytes_stats_invalid": false,
                "pin_stats_invalid": false,
                "snaptrimq_len": 0,
                "stat_sum": {
                    "num_bytes": 1900544,
                    "num_objects": 464,
                    "num_object_clones": 0,
                    "num_object_copies": 926,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_missing": 0,
                    "num_objects_degraded": 0,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 464,
                    "num_whiteouts": 0,
                    "num_read": 0,
                    "num_read_kb": 0,
                    "num_write": 928,
                    "num_write_kb": 1856,
                    "num_scrub_errors": 0,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 0,
                    "num_objects_recovered": 0,
                    "num_bytes_recovered": 0,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 0,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0,
                    "num_objects_pinned": 0,
                    "num_legacy_snapsets": 0,
                    "num_large_omap_objects": 0
                },
                "up": [
                    3,
                    1
                ],
                "acting": [
                    3,
                    1
                ],
                "blocked_by": [],
                "up_primary": 3,
                "acting_primary": 3,
                "purged_snaps": []
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 14,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started/Primary/Peering/GetMissing",
            "enter_time": "2018-03-09 22:57:10.256069",
            "peer_missing_requested": [
                {
                    "osd": "1",
                    "got_missing": {
                        "missing": [],
                        "may_include_deletes": true
                    }
                }
            ]
        },
        {
            "name": "Started/Primary/Peering",
            "enter_time": "2018-03-09 22:57:10.198781",
            "past_intervals": [
                {
                    "first": "13",
                    "last": "22",
                    "all_participants": [
                        {
                            "osd": 1
                        },
                        {
                            "osd": 3
                        }
                    ],
                    "intervals": [
                        {
                            "first": "20",
                            "last": "22",
                            "acting": "3" 
                        }
                    ]
                }
            ],
            "probing_osds": [
                "1",
                "3" 
            ],
            "down_osds_we_would_probe": [],
            "peering_blocked_by": [],
            "peering_blocked_by_detail": [
                {
                    "detail": "peering_blocked_by_history_les_bound" 
                }
            ]
        },
        {
            "name": "Started",
            "enter_time": "2018-03-09 22:57:10.198708" 
        }
    ],
    "agent_state": {}
}

sudo ceph osd tree
ID CLASS WEIGHT  TYPE NAME          STATUS REWEIGHT PRI-AFF 
-1       0.35156 root default                               
-3       0.35156     host smithi011                         
 0   ssd 0.08789         osd.0          up        0 1.00000 
 1   ssd 0.08789         osd.1          up  1.00000 1.00000 
 2   ssd 0.08789         osd.2          up  1.00000 1.00000 
 3   ssd 0.08789         osd.3          up  1.00000 1.00000 

This seems to be a race, but when it occurs the job is marked dead, so reproducing it via teuthology requires actively logging in while the test is still running. In this case I manually killed a command to make it gather logs.

Actions #1

Updated by Greg Farnum about 6 years ago

  • Subject changed from "pgs stuck peering while osd up, but out" to "OSD booted with noup never got marked in; pgs stuck peering while osd up, but out"
  • Priority changed from Urgent to Normal
Actions

Also available in: Atom PDF