Project

General

Profile

Actions

Bug #41154

open

osd: pg unknown state

Added by Alexander Kazansky over 4 years ago. Updated almost 4 years ago.

Status:
New
Priority:
Normal
Assignee:
-
Category:
-
Target version:
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
1 - critical
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

Hello. Yesterday my cluster went crazy and the acting set was zeroed (emptied) for one PG.

osd.119 pg_epoch: 79413 pg[15.7c1( v 79401'1396074 (78949'1393067,79401'1396074] local-lis/les=79247/79248 n=38552 ec=40508/18370 lis/c 79247/79247 les/c/f 79248/79248/0 79413/79413/79413) [] r=-1 lpr=79413 pi=[79247,79413)/1 luod=0'0 lua=79244'1393968 crt=79401'1396074 lcod 79401'1396073 active mbc={}] start_peering_interval up [20,156,6,119] -> [], acting [20,156,6,119] -> [], acting_primary 20 -> -1, up_primary 20 -> -1, role 3 -> -1, features acting 4611087854031667195 upacting 4611087854031667195

PG 15.7c1 has status unknown.

After some hours of troubleshooting I resolved the problem by using the upmap feature:
ceph osd pg-upmap 15.7c1 osd.6 osd.20 osd.119 osd.156

At the moment this PG has the following state:
"state": "active+clean",
"snap_trimq": "[]",
"snap_trimq_len": 0,
"epoch": 80472,
"up": [
6,
20,
119,
156
],
"acting": [
6,
20,
119,
156
],
"acting_recovery_backfill": [
"6",
"20",
"119",
"156"
],
"info": {
"pgid": "15.7c1",
"last_update": "80470'1401118",
"last_complete": "80470'1401118",
"log_tail": "79854'1398068",
"last_user_version": 1401118,
"last_backfill": "MAX",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 40508,
"epoch_pool_created": 18370,
"last_epoch_started": 79864,
"last_interval_started": 79863,
"last_epoch_clean": 79865,
"last_interval_clean": 79863,
"last_epoch_split": 40508,
"last_epoch_marked_full": 0,
"same_up_since": 79863,
"same_interval_since": 79863,
"same_primary_since": 79858,
"last_scrub": "79867'1399717",
"last_scrub_stamp": "2019-08-07 14:00:00.062479",
"last_deep_scrub": "79867'1399717",
"last_deep_scrub_stamp": "2019-08-07 14:00:00.062479",
"last_clean_scrub_stamp": "2019-08-07 14:00:00.062479"
},
"stats": {
"version": "80470'1401118",
"reported_seq": "2501969",
"reported_epoch": "80472",
"state": "active+clean",
"last_fresh": "2019-08-07 19:07:26.901482",
"last_change": "2019-08-07 14:00:00.062557",
"last_active": "2019-08-07 19:07:26.901482",
"last_peered": "2019-08-07 19:07:26.901482",
"last_clean": "2019-08-07 19:07:26.901482",
"last_became_active": "2019-08-07 13:45:40.744977",
"last_became_peered": "2019-08-07 13:45:40.744977",
"last_unstale": "2019-08-07 19:07:26.901482",
"last_undegraded": "2019-08-07 19:07:26.901482",
"last_fullsized": "2019-08-07 19:07:26.901482",
"mapping_epoch": 79863,
"log_start": "79854'1398068",
"ondisk_log_start": "79854'1398068",
"created": 40508,
"last_epoch_clean": 79865,
"parent": "0.0",
"parent_split_bits": 11,
"last_scrub": "79867'1399717",
"last_scrub_stamp": "2019-08-07 14:00:00.062479",
"last_deep_scrub": "79867'1399717",
"last_deep_scrub_stamp": "2019-08-07 14:00:00.062479",
"last_clean_scrub_stamp": "2019-08-07 14:00:00.062479",
"log_size": 3050,
"ondisk_log_size": 3050,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 18637781186,
"num_objects": 38676,
"num_object_clones": 0,
"num_object_copies": 154704,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 38676,
"num_whiteouts": 0,
"num_read": 325832,
"num_read_kb": 65142690,
"num_write": 440892,
"num_write_kb": 8590052,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 85352,
"num_bytes_recovered": 58676498560,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
6,
20,
119,
156
],
"acting": [
6,
20,
119,
156
],
"blocked_by": [],
"up_primary": 6,
"acting_primary": 6,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 79864,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"recovery_state": [ {
"name": "Started/Primary/Active",
"enter_time": "2019-08-07 13:45:40.663653",
"might_have_unfound": [ {
"osd": "20",
"status": "already probed"
}, {
"osd": "119",
"status": "already probed"
}, {
"osd": "156",
"status": "already probed"
}
],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "79863",
"scrubber.active": false,
"scrubber.state": "INACTIVE",
"scrubber.start": "MIN",
"scrubber.end": "MIN",
"scrubber.max_end": "MIN",
"scrubber.subset_last_update": "0'0",
"scrubber.deep": false,
"scrubber.waiting_on_whom": []
}
}, {
"name": "Started",
"enter_time": "2019-08-07 13:45:39.813202"
}
],
and it seems to work. But I am not sure that what I did was right, and I don't know what needs to be done to recover this PG without a hard-coded upmap.

Actions #1

Updated by Patrick Donnelly over 4 years ago

  • Project changed from Ceph to RADOS
  • Subject changed from pg unknown state to osd: pg unknown state
  • Component(RADOS) OSD added
Actions #2

Updated by Alexander Kazansky almost 4 years ago

I have this problem again.

osd.26 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 luod=0'0 crt=237812'23168375 lcod 237812'23168374 active mbc={}] start_peering_interval up [26,248,253,30] -> [], acting [26,248,253,30] -> [], acting_primary 26 -> -1, up_primary 26 -> -1, role 0 -> -1, features acting 4611087854031667199 upacting 4611087854031667199
2020-06-22 19:18:06 +03:00    kr01-ceph-93
osd.26 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 crt=237812'23168375 lcod 237812'23168374 unknown NOTIFY mbc={}] state<Start>: transitioning to Stray
2020-06-22 19:18:06 +03:00    sdc-ceph-182
osd.30 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 crt=237812'23168375 lcod 237812'23168374 unknown NOTIFY mbc={}] state<Start>: transitioning to Stray
2020-06-22 19:18:06 +03:00    sdc-ceph-182
osd.30 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 luod=0'0 lua=237724'23168348 crt=237812'23168375 lcod 237812'23168374 active mbc={}] start_peering_interval up [26,248,253,30] -> [], acting [26,248,253,30] -> [], acting_primary 26 -> -1, up_primary 26 -> -1, role 3 -> -1, features acting 4611087854031667199 upacting 4611087854031667199
2020-06-22 19:18:06 +03:00    kr01-ceph-84
osd.248 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 crt=237812'23168375 lcod 237812'23168374 unknown NOTIFY mbc={}] state<Start>: transitioning to Stray
2020-06-22 19:18:06 +03:00    kr01-ceph-84
osd.248 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 luod=0'0 lua=237724'23168348 crt=237812'23168375 lcod 237812'23168374 active mbc={}] start_peering_interval up [26,248,253,30] -> [], acting [26,248,253,30] -> [], acting_primary 26 -> -1, up_primary 26 -> -1, role 1 -> -1, features acting 4611087854031667199 upacting 4611087854031667199
2020-06-22 19:18:06 +03:00    sdc-ceph-251
osd.253 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 crt=237812'23168375 lcod 237812'23168374 unknown NOTIFY mbc={}] state<Start>: transitioning to Stray
2020-06-22 19:18:06 +03:00    sdc-ceph-251
osd.253 pg_epoch: 237814 pg[14.84( v 237812'23168375 (237586'23165374,237812'23168375] local-lis/les=223376/223377 n=180181 ec=19642/18365 lis/c 223376/223376 les/c/f 223377/223377/206072 237814/237814/237814) [] r=-1 lpr=237814 pi=[223376,237814)/1 luod=0'0 lua=237724'23168348 crt=237812'23168375 lcod 237812'23168374 active mbc={}] start_peering_interval up [26,248,253,30] -> [], acting [26,248,253,30] -> [], acting_primary 26 -> -1, up_primary 26 -> -1, role 2 -> -1, features acting 4611087854031667199 upacting 4611087854031667199
Actions

Also available in: Atom PDF