Bug #6681: osd recovery hung - Ceph - Ceph

Actions

Copy link

Bug #6681

closed

osd recovery hung

Added by Samuel Just over 10 years ago. Updated about 10 years ago.

Status:

Resolved

Priority:

Urgent

Assignee:

Samuel Just

Category:

Target version:

% Done:

Source:

Q/A

Tags:

Backport:

dumpling

Regression:

Severity:

3 - minor

Reviewed:

Affected Versions:

ceph-qa-suite:

Pull request ID:

Crash signature (v1):

Crash signature (v2):

Description

ubuntu@teuthology:/a/teuthology-2013-10-29_10:45:19-rados-next-testing-basic-plana

ubuntu@plana31:~$ sudo ceph -s
cluster aff625a2-8a2f-4cb5-9f70-923d40fa0dba
health HEALTH_WARN 3 pgs recovering; 3 pgs stuck unclean; 24 requests are blocked > 32 sec; recovery 861/49076 objects degraded (1.754%); 326/24538 unfound (1.329%); pool data pg_num 64 > pgp_num 54; pool metadata pg_num 34 > pgp_num 24; pool rbd pg_num 34 > pgp_num 24
monmap e1: 3 mons at {a=10.214.131.9:6789/0,b=10.214.132.11:6789/0,c=10.214.131.9:6790/0}, election epoch 6, quorum 0,1,2 a,b,c
mdsmap e5: 1/1/1 up {0=a=up:active}
osdmap e284: 6 osds: 6 up, 4 in
pgmap v1902: 132 pgs, 3 pools, 98032 MB data, 24538 objects
114 GB used, 1745 GB / 1863 GB avail
861/49076 objects degraded (1.754%); 326/24538 unfound (1.329%)
129 active+clean
3 active+recovering

ubuntu@plana31:~$ sudo ceph pg dump | grep recovering
dumped all in format plain
0.3a 500 169 395 169 2097152000 532 532 active+recovering 2013-10-29 12:02:07.668419 207'532 284:265 [4,0] [4,0] 0'0 2013-10-29 11:43:09.707243 0'0 2013-10-29 11:43:09.707243
0.37 404 156 382 156 1694498816 410 410 active+recovering 2013-10-29 12:02:25.523768 203'410 284:259 [5,1] [5,1] 0'0 2013-10-29 11:43:08.645224 0'0 2013-10-29 11:43:08.645224
0.36 383 1 84 1 1602224140 403 403 active+recovering 2013-10-29 12:02:35.302149 219'403 284:585 [5,1] [5,1] 0'0 2013-10-29 11:43:08.644738 0'0 2013-10-29 11:43:08.644738

{ "state": "active+recovering",
"epoch": 284,
"up": [
4,
0],
"acting": [
4,
0],
"info": { "pgid": "0.3a",
"last_update": "207'532",
"last_complete": "11'50",
"log_tail": "0'0",
"last_user_version": 532,
"last_backfill": "MAX",
"purged_snaps": "[]",
"history": { "epoch_created": 1,
"last_epoch_started": 163,
"last_epoch_clean": 95,
"last_epoch_split": 0,
"same_up_since": 162,
"same_interval_since": 162,
"same_primary_since": 162,
"last_scrub": "0'0",
"last_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_clean_scrub_stamp": "0.000000"},
"stats": { "version": "207'532",
"reported_seq": "265",
"reported_epoch": "284",
"state": "active+recovering",
"last_fresh": "2013-10-29 12:27:49.857720",
"last_change": "2013-10-29 12:02:07.668419",
"last_active": "2013-10-29 12:27:49.857720",
"last_clean": "2013-10-29 12:00:06.712576",
"last_became_active": "0.000000",
"last_unstale": "2013-10-29 12:27:49.857720",
"mapping_epoch": 160,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 1,
"last_epoch_clean": 95,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_clean_scrub_stamp": "0.000000",
"log_size": 532,
"ondisk_log_size": 532,
"stats_invalid": "1",
"stat_sum": { "num_bytes": 2097152000,
"num_objects": 500,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_unfound": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 532,
"num_write_kb": 2113536,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 298,
"num_bytes_recovered": 1249902592,
"num_keys_recovered": 0},
"stat_cat_sum": {},
"up": [
4,
0],
"acting": [
4,
0]},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 163},
"recovery_state": [ { "name": "Started\/Primary\/Active",
"enter_time": "2013-10-29 12:02:07.476638",
"might_have_unfound": [ { "osd": 0,
"status": "already probed"}, { "osd": 1,
"status": "already probed"}, { "osd": 2,
"status": "querying"}, { "osd": 3,
"status": "querying"}, { "osd": 5,
"status": "already probed"}],
"recovery_progress": { "backfill_target": -1,
"waiting_on_backfill": 0,
"last_backfill_started": "0\/\/0\/\/-1",
"backfill_info": { "begin": "0\/\/0\/\/-1",
"end": "0\/\/0\/\/-1",
"objects": []},
"peer_backfill_info": { "begin": "0\/\/0\/\/-1",
"end": "0\/\/0\/\/-1",
"objects": []},
"backfills_in_flight": [],
"recovering": [],
"pg_backend": { "pull_from_peer": [],
"pushing": []}},
"scrub": { "scrubber.epoch_start": "0",
"scrubber.active": 0,
"scrubber.block_writes": 0,
"scrubber.finalizing": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []}}, { "name": "Started",
"enter_time": "2013-10-29 12:02:06.321424"}]}

Actions

Copy link

Updated by Samuel Just over 10 years ago

Priority changed from Normal to Urgent

Not an rc blocker. This is actually a fairly old bug and can be worked around by restarting one of the affected OSDs. It seems to be related to split: in the case I examined, the MISSING query from the primary appeared to show up at the replica between when the split was initiated and when the PG showed up in the pg map. The query did not appear after the initial messenger log output.

Actions

Copy link