Project

General

Profile

Actions

Bug #6681

closed

osd recovery hung

Added by Samuel Just over 10 years ago. Updated about 10 years ago.

Status:
Resolved
Priority:
Urgent
Assignee:
Category:
-
Target version:
-
% Done:

0%

Source:
Q/A
Tags:
Backport:
dumpling
Regression:
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

ubuntu@teuthology:/a/teuthology-2013-10-29_10:45:19-rados-next-testing-basic-plana

ubuntu@plana31:~$ sudo ceph -s
cluster aff625a2-8a2f-4cb5-9f70-923d40fa0dba
health HEALTH_WARN 3 pgs recovering; 3 pgs stuck unclean; 24 requests are blocked > 32 sec; recovery 861/49076 objects degraded (1.754%); 326/24538 unfound (1.329%); pool data pg_num 64 > pgp_num 54; pool metadata pg_num 34 > pgp_num 24; pool rbd pg_num 34 > pgp_num 24
monmap e1: 3 mons at {a=10.214.131.9:6789/0,b=10.214.132.11:6789/0,c=10.214.131.9:6790/0}, election epoch 6, quorum 0,1,2 a,b,c
mdsmap e5: 1/1/1 up {0=a=up:active}
osdmap e284: 6 osds: 6 up, 4 in
pgmap v1902: 132 pgs, 3 pools, 98032 MB data, 24538 objects
114 GB used, 1745 GB / 1863 GB avail
861/49076 objects degraded (1.754%); 326/24538 unfound (1.329%)
129 active+clean
3 active+recovering

ubuntu@plana31:~$ sudo ceph pg dump | grep recovering
dumped all in format plain
0.3a 500 169 395 169 2097152000 532 532 active+recovering 2013-10-29 12:02:07.668419 207'532 284:265 [4,0] [4,0] 0'0 2013-10-29 11:43:09.707243 0'0 2013-10-29 11:43:09.707243
0.37 404 156 382 156 1694498816 410 410 active+recovering 2013-10-29 12:02:25.523768 203'410 284:259 [5,1] [5,1] 0'0 2013-10-29 11:43:08.645224 0'0 2013-10-29 11:43:08.645224
0.36 383 1 84 1 1602224140 403 403 active+recovering 2013-10-29 12:02:35.302149 219'403 284:585 [5,1] [5,1] 0'0 2013-10-29 11:43:08.644738 0'0 2013-10-29 11:43:08.644738

{ "state": "active+recovering",
"epoch": 284,
"up": [
4,
0],
"acting": [
4,
0],
"info": { "pgid": "0.3a",
"last_update": "207'532",
"last_complete": "11'50",
"log_tail": "0'0",
"last_user_version": 532,
"last_backfill": "MAX",
"purged_snaps": "[]",
"history": { "epoch_created": 1,
"last_epoch_started": 163,
"last_epoch_clean": 95,
"last_epoch_split": 0,
"same_up_since": 162,
"same_interval_since": 162,
"same_primary_since": 162,
"last_scrub": "0'0",
"last_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_clean_scrub_stamp": "0.000000"},
"stats": { "version": "207'532",
"reported_seq": "265",
"reported_epoch": "284",
"state": "active+recovering",
"last_fresh": "2013-10-29 12:27:49.857720",
"last_change": "2013-10-29 12:02:07.668419",
"last_active": "2013-10-29 12:27:49.857720",
"last_clean": "2013-10-29 12:00:06.712576",
"last_became_active": "0.000000",
"last_unstale": "2013-10-29 12:27:49.857720",
"mapping_epoch": 160,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 1,
"last_epoch_clean": 95,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2013-10-29 11:43:09.707243",
"last_clean_scrub_stamp": "0.000000",
"log_size": 532,
"ondisk_log_size": 532,
"stats_invalid": "1",
"stat_sum": { "num_bytes": 2097152000,
"num_objects": 500,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_unfound": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 532,
"num_write_kb": 2113536,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 298,
"num_bytes_recovered": 1249902592,
"num_keys_recovered": 0},
"stat_cat_sum": {},
"up": [
4,
0],
"acting": [
4,
0]},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 163},
"recovery_state": [ { "name": "Started\/Primary\/Active",
"enter_time": "2013-10-29 12:02:07.476638",
"might_have_unfound": [ { "osd": 0,
"status": "already probed"}, { "osd": 1,
"status": "already probed"}, { "osd": 2,
"status": "querying"}, { "osd": 3,
"status": "querying"}, { "osd": 5,
"status": "already probed"}],
"recovery_progress": { "backfill_target": -1,
"waiting_on_backfill": 0,
"last_backfill_started": "0\/\/0\/\/-1",
"backfill_info": { "begin": "0\/\/0\/\/-1",
"end": "0\/\/0\/\/-1",
"objects": []},
"peer_backfill_info": { "begin": "0\/\/0\/\/-1",
"end": "0\/\/0\/\/-1",
"objects": []},
"backfills_in_flight": [],
"recovering": [],
"pg_backend": { "pull_from_peer": [],
"pushing": []}},
"scrub": { "scrubber.epoch_start": "0",
"scrubber.active": 0,
"scrubber.block_writes": 0,
"scrubber.finalizing": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []}}, { "name": "Started",
"enter_time": "2013-10-29 12:02:06.321424"}]}

Actions #1

Updated by Samuel Just over 10 years ago

  • Priority changed from Normal to Urgent

Not an rc blocker. This is actually a fairly old bug and can be worked around by restarting one of the effected osds. It seems to be related to split, the MISSING query from the primary in the case I examined appeared to show up at the replica between when the split was initiated and when the pg showed up in the pg map. The query did not appear after the initial messenger log output.

Actions #2

Updated by Ian Colle over 10 years ago

  • Assignee set to Samuel Just
Actions #3

Updated by Samuel Just over 10 years ago

I may have a fix in wip-6685

Actions #4

Updated by Samuel Just over 10 years ago

  • Status changed from New to 7
Actions #5

Updated by Greg Farnum over 10 years ago

commit:cf4e00ff5c6d0c548cb766f94288277d1f661094 ("OSD: don't clear peering_wait_for_split in advance_map()") looks good to me.

Actions #6

Updated by Samuel Just over 10 years ago

  • Status changed from 7 to Resolved
Actions #7

Updated by Samuel Just about 10 years ago

  • Status changed from Resolved to Pending Backport

/var/lib/teuthworker/archive/teuthology-2014-04-01_19:00:30-rados-dumpling-testing-basic-plana/159217

Actions #8

Updated by Samuel Just about 10 years ago

  • Backport set to dumpling,cuttlefish
Actions #9

Updated by Samuel Just about 10 years ago

  • Backport changed from dumpling,cuttlefish to dumpling
Actions #10

Updated by Sage Weil about 10 years ago

  • Status changed from Pending Backport to Resolved
  • Source changed from other to Q/A

backported to dumpling

Actions

Also available in: Atom PDF