Project

General

Profile

Bug #16259

ceph osd processes crashes when doing a revert on unfound object

Added by Max Vernimmen almost 8 years ago. Updated almost 7 years ago.

Status:
Duplicate
Priority:
High
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
other
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

Situation: Some PG's report unfound objects.
Problem: Doing a revert results in ceph stating 'pg has no unfound objects'. But ceph health and pg list_missing continue to report the object as unfound.
Analysis: one of the osd processes crashes during this command and restarts itself. The object remains unfound and the osd crashes every time the revert command is executed.

This is a 3 node cluster consisting of 10.2.1 on CentOS 7.2 with 10 osd's on each node.

[root@md002 ~]# ceph pg 21.a3 query|less {
"state": "active+recovering+degraded",
"snap_trimq": "[]",
"epoch": 111319,
"up": [
4,
24,
12
],
"acting": [
4,
24,
12
],
"actingbackfill": [
"4",
"12",
"24"
],
"info": {
"pgid": "21.a3",
"last_update": "111050'276843",
"last_complete": "111050'276842",
"log_tail": "108850'273114",
"last_user_version": 276843,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": "[]",
"history": {
"epoch_created": 73150,
"last_epoch_started": 111090,
"last_epoch_clean": 111043,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 111088,
"same_interval_since": 111089,
"same_primary_since": 111089,
"last_scrub": "108850'275715",
"last_scrub_stamp": "2016-06-06 10:29:23.542753",
"last_deep_scrub": "108690'267814",
"last_deep_scrub_stamp": "2016-05-25 12:17:10.854387",
"last_clean_scrub_stamp": "2016-06-06 10:29:23.542753"
},
"stats": {
"version": "111050'276843",
"reported_seq": "388498",
"reported_epoch": "111310",
"state": "active+recovering+degraded",
"last_fresh": "2016-06-13 13:40:25.387172",
"last_change": "2016-06-10 13:45:52.574159",
"last_active": "2016-06-13 13:40:25.387172",
"last_peered": "2016-06-13 13:40:25.387172",
"last_clean": "2016-06-06 15:18:49.136586",
"last_became_active": "2016-06-10 13:45:51.981236",
"last_became_peered": "2016-06-10 13:45:51.981236",
"last_unstale": "2016-06-13 13:40:25.387172",
"last_undegraded": "2016-06-10 13:45:50.940689",
"last_fullsized": "2016-06-13 13:40:25.387172",
"mapping_epoch": 111088,
"log_start": "108850'273114",
"ondisk_log_start": "108850'273114",
"created": 73150,
"last_epoch_clean": 111043,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "108850'275715",
"last_scrub_stamp": "2016-06-06 10:29:23.542753",
"last_deep_scrub": "108690'267814",
"last_deep_scrub_stamp": "2016-05-25 12:17:10.854387",
"last_clean_scrub_stamp": "2016-06-06 10:29:23.542753",
"log_size": 3729,
"ondisk_log_size": 3729,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"stat_sum": {
"num_bytes": 12565090304,
"num_objects": 2996,
"num_object_clones": 0,
"num_object_copies": 8988,
"num_objects_missing_on_primary": 1,
"num_objects_missing": 0,
"num_objects_degraded": 1,
"num_objects_misplaced": 0,
"num_objects_unfound": 1,
"num_objects_dirty": 2996,
"num_whiteouts": 0,
"num_read": 2062256,
"num_read_kb": 311644380,
"num_write": 504736,
"num_write_kb": 282097976,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 35411,
"num_bytes_recovered": 148490977280,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
4,
24,
12
],
"acting": [
4,
24,
12
],
"blocked_by": [],
"up_primary": 4,
"acting_primary": 4
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 111090,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [ {
"peer": "12",
"pgid": "21.a3",
"last_update": "111050'276843",
"last_complete": "111050'276843",
"log_tail": "108850'273114",
"last_user_version": 276843,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": "[]",
"history": {
"epoch_created": 73150,
"last_epoch_started": 111090,
"last_epoch_clean": 111043,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 111088,
"same_interval_since": 111089,
"same_primary_since": 111089,
"last_scrub": "108850'275715",
"last_scrub_stamp": "2016-06-06 10:29:23.542753",
"last_deep_scrub": "108690'267814",
"last_deep_scrub_stamp": "2016-05-25 12:17:10.854387",
"last_clean_scrub_stamp": "2016-06-06 10:29:23.542753"
},
"stats": {
"version": "111050'276842",
"reported_seq": "388478",
"reported_epoch": "111089",
"state": "active+undersized+degraded",
"last_fresh": "2016-06-10 13:07:50.662549",
"last_change": "2016-06-10 13:07:31.905978",
"last_active": "2016-06-10 13:07:50.662549",
"last_peered": "2016-06-10 13:07:50.662549",
"last_clean": "2016-06-06 15:18:49.136586",
"last_became_active": "2016-06-10 13:07:31.905978",
"last_became_peered": "2016-06-10 13:07:31.905978",
"last_unstale": "2016-06-10 13:07:50.662549",
"last_undegraded": "2016-06-10 13:07:30.997668",
"last_fullsized": "2016-06-10 13:07:30.997668",
"mapping_epoch": 111088,
"log_start": "108850'273114",
"ondisk_log_start": "108850'273114",
"created": 73150,
"last_epoch_clean": 111043,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "108850'275715",
"last_scrub_stamp": "2016-06-06 10:29:23.542753",
"last_deep_scrub": "108690'267814",
"last_deep_scrub_stamp": "2016-05-25 12:17:10.854387",
"last_clean_scrub_stamp": "2016-06-06 10:29:23.542753",
"log_size": 3728,
"ondisk_log_size": 3728,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"stat_sum": {
"num_bytes": 12565090304,
"num_objects": 2996,
"num_object_clones": 0,
"num_object_copies": 8988,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 2996,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 2996,
"num_whiteouts": 0,
"num_read": 2062000,
"num_read_kb": 311640284,
"num_write": 504736,
"num_write_kb": 282097976,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 35411,
"num_bytes_recovered": 148490977280,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
4,
24,
12
],
"acting": [
4,
24,
12
],
"blocked_by": [],
"up_primary": 4,
"acting_primary": 4
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 111090,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}, {
"peer": "24",
"pgid": "21.a3",
"last_update": "111050'276843",
"last_complete": "111050'276843",
"log_tail": "108850'273114",
"last_user_version": 276843,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": "[]",
"history": {
"epoch_created": 73150,
"last_epoch_started": 111090,
"last_epoch_clean": 111043,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 111088,
"same_interval_since": 111089,
"same_primary_since": 111089,
"last_scrub": "108850'275715",
"last_scrub_stamp": "2016-06-06 10:29:23.542753",
"last_deep_scrub": "108690'267814",
"last_deep_scrub_stamp": "2016-05-25 12:17:10.854387",
"last_clean_scrub_stamp": "2016-06-06 10:29:23.542753"
},
"stats": {
"version": "111050'276843",
"reported_seq": "388529",
"reported_epoch": "111088",
"state": "remapped+peering",
"last_fresh": "2016-06-10 13:45:50.339193",
"last_change": "2016-06-10 13:45:50.318373",
"last_active": "2016-06-10 13:45:45.452403",
"last_peered": "2016-06-10 13:45:45.452403",
"last_clean": "2016-06-06 15:18:49.136586",
"last_became_active": "2016-06-10 13:07:31.905978",
"last_became_peered": "2016-06-10 13:07:31.905978",
"last_unstale": "2016-06-10 13:45:50.339193",
"last_undegraded": "2016-06-10 13:45:50.339193",
"last_fullsized": "2016-06-10 13:45:50.339193",
"mapping_epoch": 111088,
"log_start": "108850'273114",
"ondisk_log_start": "108850'273114",
"created": 73150,
"last_epoch_clean": 111043,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "108850'275715",
"last_scrub_stamp": "2016-06-06 10:29:23.542753",
"last_deep_scrub": "108690'267814",
"last_deep_scrub_stamp": "2016-05-25 12:17:10.854387",
"last_clean_scrub_stamp": "2016-06-06 10:29:23.542753",
"log_size": 3729,
"ondisk_log_size": 3729,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"stat_sum": {
"num_bytes": 12565090304,
"num_objects": 2996,
"num_object_clones": 0,
"num_object_copies": 8988,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 2996,
"num_whiteouts": 0,
"num_read": 2062000,
"num_read_kb": 311640284,
"num_write": 504736,
"num_write_kb": 282097976,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 35411,
"num_bytes_recovered": 148490977280,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
4,
24,
12
],
"acting": [
4,
24,
12
],
"blocked_by": [],
"up_primary": 4,
"acting_primary": 4
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 111090,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [ {
"name": "Started\/Primary\/Active",
"enter_time": "2016-06-10 13:45:51.897296",
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
4,
24,
12
],
"acting": [
4,
24,
12
],
"blocked_by": [],
"up_primary": 4,
"acting_primary": 4
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 111090,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [ {
"name": "Started\/Primary\/Active",
"enter_time": "2016-06-10 13:45:51.897296",
"might_have_unfound": [ {
"osd": "12",
"status": "already probed"
}, {
"osd": "24",
"status": "already probed"
}
[root@md002 ~]# ceph pg 21.a3 list_missing {
"offset": {
"oid": "",
"key": "",
"snapid": 0,
"hash": 0,
"max": 0,
"pool": -9223372036854775808,
"namespace": ""
},
"num_missing": 1,
"num_unfound": 1,
"objects": [ {
"oid": {
"oid": "rbd_data.6df19c2ae8944a.000000000010789e",
"key": "",
"snapid": 2,
"hash": 3213450915,
"max": 0,
"pool": 21,
"namespace": ""
},
"need": "111050'276843",
"have": "110440'276612",
"locations": []
}
],
"more": 0
}
[root@md002 ~]# ceph pg 21.a3 mark_unfound_lost revert
2016-06-13 14:30:53.206361 7fd936bf1700 0 -
172.19.20.2:0/602132201 >> 172.19.20.8:6836/2102 pipe(0x7fd9280133b0 sd=6 :0 s=1 pgs=0 cs=0 l=1 c=0x7fd9280146c0).fault
pg has no unfound objects
[root@md002 ~]# ceph pg 21.a3 list_missing {
"offset": {
"oid": "",
"key": "",
"snapid": 0,
"hash": 0,
"max": 0,
"pool": -9223372036854775808,
"namespace": ""
},
"num_missing": 1,
"num_unfound": 1,
"objects": [ {
"oid": {
"oid": "rbd_data.6df19c2ae8944a.000000000010789e",
"key": "",
"snapid": -2,
"hash": 3213450915,
"max": 0,
"pool": 21,
"namespace": ""
},
"need": "111050'276843",
"have": "110440'276612",
"locations": []
}
],
"more": 0
}

[root@md002 ~]# ceph status
cluster 6318a6a2-808b-45a1-9c89-31575c58de49
health HEALTH_WARN
36 pgs backfill_wait
114 pgs degraded
68 pgs recovering
34 pgs recovery_wait
140 pgs stuck unclean
10 pgs undersized
7 requests are blocked > 32 sec
recovery 26331/11216491 objects degraded (0.235%)
recovery 111267/11216491 objects misplaced (0.992%)
recovery 9885/3721644 unfound (0.266%)
monmap e7: 4 mons at {md002=172.19.20.2:6789/0,md005=172.19.20.5:6789/0,md008=172.19.20.8:6789/0,md010=172.19.20.10:6789/0}
election epoch 18892, quorum 0,1,2,3 md002,md005,md008,md010
osdmap e111331: 30 osds: 30 up, 30 in; 53 remapped pgs
flags sortbitwise
pgmap v14302472: 2880 pgs, 5 pools, 14511 GB data, 3634 kobjects
44202 GB used, 17230 GB / 61433 GB avail
26331/11216491 objects degraded (0.235%)
111267/11216491 objects misplaced (0.992%)
9885/3721644 unfound (0.266%)
2739 active+clean
67 active+recovering+degraded
26 active+remapped+wait_backfill
18 active+recovery_wait+degraded
16 active+recovery_wait+degraded+remapped
10 active+undersized+degraded+remapped+wait_backfill
2 active+degraded
1 active+recovering+degraded+remapped
1 active+clean+scrubbing+deep

[root@md002 ~]# ceph health detail | grep 21.a3
pg 21.a3 is stuck unclean for 602039.954625, current state active+degraded, last acting [4,24,12]
pg 21.a3 is active+degraded, acting [4,24,12], 1 unfound
[root@md002 ~]# ceph osd tree
ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 60.77974 root default
-2 20.34990 host md002
18 1.84999 osd.18 up 1.00000 1.00000
19 1.84999 osd.19 up 1.00000 1.00000
20 1.84999 osd.20 up 1.00000 1.00000
21 1.84999 osd.21 up 1.00000 1.00000
22 1.84999 osd.22 up 1.00000 1.00000
23 1.84999 osd.23 up 1.00000 1.00000
24 1.84999 osd.24 up 1.00000 1.00000
25 1.84999 osd.25 up 1.00000 1.00000
26 1.84999 osd.26 up 1.00000 1.00000
27 1.84999 osd.27 up 1.00000 1.00000
28 1.84999 osd.28 up 1.00000 1.00000
-3 20.34990 host md008
0 1.84999 osd.0 up 1.00000 1.00000
3 1.84999 osd.3 up 1.00000 1.00000
5 1.84999 osd.5 up 1.00000 1.00000
7 1.84999 osd.7 up 1.00000 1.00000
10 1.84999 osd.10 up 1.00000 1.00000
1 1.84999 osd.1 up 1.00000 1.00000
6 1.84999 osd.6 up 1.00000 1.00000
8 1.84999 osd.8 up 1.00000 1.00000
9 1.84999 osd.9 up 1.00000 1.00000
2 1.84999 osd.2 up 1.00000 1.00000
4 1.84999 osd.4 up 1.00000 1.00000
-5 20.07994 host md005
14 2.73000 osd.14 up 0.89999 1.00000
15 2.73000 osd.15 up 0.94998 1.00000
16 2.73000 osd.16 up 1.00000 1.00000
12 2.73000 osd.12 up 0.84998 1.00000
17 2.73000 osd.17 up 1.00000 1.00000
13 2.73000 osd.13 up 0.92245 1.00000
11 1.84999 osd.11 up 1.00000 1.00000
29 1.84998 osd.29 up 0.95001 1.00000

osd log 24:
2016-06-13 14:31:56.922526 7ff3181af700 0 -- 172.19.20.2:0/7409 >> 172.19.20.8:6847/6335 pipe(0x55f863449400 sd=202 :0 s=1 pgs=0 cs=0 l=1 c=0x55f89146fa80).fault
2016-06-13 14:31:56.922955 7ff3275a3700 0 -- 172.19.20.2:0/7409 >> 172.19.20.8:6846/6335 pipe(0x55f876fb0800 sd=55 :0 s=1 pgs=0 cs=0 l=1 c=0x55f89146ed00).fault
2016-06-13 14:32:06.821018 7ff325ecd700 0 -- 172.19.20.2:0/7409 >> 172.19.20.8:6839/7728 pipe(0x55f863e5f400 sd=170 :0 s=1 pgs=0 cs=0 l=1 c=0x55f88ba44d80).fault
2016-06-13 14:32:06.821052 7ff317879700 0 -- 172.19.20.2:6833/7409 >> 172.19.20.8:6837/7728 pipe(0x55f88ae86000 sd=34 :57862 s=2 pgs=12 cs=1 l=0 c=0x55f88ceefa80).fault with nothing to send, going to standby
2016-06-13 14:32:06.821206 7ff323baa700 0 -- 172.19.20.2:0/7409 >> 172.19.20.8:6838/7728 pipe(0x55f885050000 sd=209 :0 s=1 pgs=0 cs=0 l=1 c=0x55f88f860e80).fault
2016-06-13 14:35:52.807557 7ff32fcd8700 0 log_channel(cluster) log [INF] : 21.b0 scrub starts
2016-06-13 14:36:18.422492 7ff32fcd8700 0 log_channel(cluster) log [INF] : 21.b0 scrub ok

osd log 12:
2016-06-13 14:32:05.706946 7fa93c41d700 1 log_channel(cluster) log [ERR] : 20.a has 327 objects unfound and apparently lost
2016-06-13 14:32:05.707248 7fa93c41d700 -1 log_channel(cluster) log [ERR] : 21.1be has 39 objects unfound and apparently lost
2016-06-13 14:32:05.709406 7fa93cc1e700 -1 log_channel(cluster) log [ERR] : 21.112 has 34 objects unfound and apparently lost
2016-06-13 14:32:06.820162 7fa931704700 0 -
172.19.20.5:6819/1342 >> 172.19.20.8:6837/7728 pipe(0x561466694800 sd=114 :59012 s=2 pgs=2 cs=1 l=0 c=0x56146ed17f80).fault with nothing to send, going to s
tandby
2016-06-13 14:32:06.821200 7fa927c6a700 0 -- 172.19.20.5:0/1342 >> 172.19.20.8:6839/7728 pipe(0x561465750800 sd=136 :0 s=1 pgs=0 cs=0 l=1 c=0x56146f48fb00).fault
2016-06-13 14:32:06.821203 7fa92f2e0700 0 -- 172.19.20.5:0/1342 >> 172.19.20.8:6838/7728 pipe(0x5614663e4000 sd=127 :0 s=1 pgs=0 cs=0 l=1 c=0x56146e4a3a80).fault
2016-06-13 14:32:13.975929 7fa93c41d700 -1 log_channel(cluster) log [ERR] : 21.16a has 32 objects unfound and apparently lost
2016-06-13 14:32:13.978135 7fa93c41d700 -1 log_channel(cluster) log [ERR] : 21.7c has 38 objects unfound and apparently lost
2016-06-13 14:32:13.979842 7fa93c41d700 -1 log_channel(cluster) log [ERR] : 21.179 has 29 objects unfound and apparently lost
2016-06-13 14:32:13.982525 7fa93cc1e700 -1 log_channel(cluster) log [ERR] : 21.14e has 29 objects unfound and apparently lost

osd log 4:
11> 2016-06-13 14:32:06.327220 7f77467ea700 5 - op tracker -- seq: 1138, time: 2016-06-13 14:32:06.327220, event: started, op: pg_info(1 pgs e111327:24.5de)
10> 2016-06-13 14:32:06.327256 7f77467ea700 5 - op tracker -- seq: 1138, time: 2016-06-13 14:32:06.327256, event: done, op: pg_info(1 pgs e111327:24.5de)
9> 2016-06-13 14:32:06.327629 7f77467ea700 1 - 172.19.20.8:6837/7728 <== osd.23 172.19.20.2:6805/7325 62 ==== pg_info(1 pgs e111327:24.774) v4 ==== 901+0+0 (4232033861 0 0) 0x5598f6758f00 con 0x5
59908b81680
8> 2016-06-13 14:32:06.327659 7f77467ea700 5 - op tracker -- seq: 1139, time: 2016-06-13 14:32:06.327659, event: started, op: pg_info(1 pgs e111327:24.774)
7> 2016-06-13 14:32:06.327685 7f77467ea700 5 - op tracker -- seq: 1139, time: 2016-06-13 14:32:06.327685, event: done, op: pg_info(1 pgs e111327:24.774)
6> 2016-06-13 14:32:06.327701 7f77467ea700 1 - 172.19.20.8:6837/7728 <== osd.23 172.19.20.2:6805/7325 63 ==== pg_info(1 pgs e111327:24.100) v4 ==== 901+0+0 (969455417 0 0) 0x5598f67590e0 con 0x55
9908b81680
5> 2016-06-13 14:32:06.327716 7f77467ea700 5 - op tracker -- seq: 1140, time: 2016-06-13 14:32:06.327716, event: started, op: pg_info(1 pgs e111327:24.100)
4> 2016-06-13 14:32:06.327737 7f77467ea700 5 - op tracker -- seq: 1140, time: 2016-06-13 14:32:06.327737, event: done, op: pg_info(1 pgs e111327:24.100)
3> 2016-06-13 14:32:06.685067 7f772753d700 1 - 172.19.20.8:6836/7728 >> :/0 pipe(0x559908270000 sd=30 :6836 s=0 pgs=0 cs=0 l=0 c=0x559908fb8580).accept sd=30 172.19.20.2:46180/0
2> 2016-06-13 14:32:06.686026 7f7747fed700 1 - 172.19.20.8:6836/7728 <== client.53526920 172.19.20.2:0/1447332590 1 ==== command(tid 2: {"mulcmd": "revert", "prefix": "pg", "cmd": "mark_unfound_l
ost", "pgid": "21.a3"}) v1 ==== 105+0+0 (1077273210 0 0) 0x559908dcfc20 con 0x559908fb8580
-1> 2016-06-13 14:32:06.686214 7f7737fcd700 3 osd.4 pg_epoch: 111327 pg[21.a3( v 111050'276843 lc 111050'276842 (108850'273114,111050'276843] local-les=111326 n=2996 ec=73150 les/c/f 111326/111043/
0 111325/111325/111325) [4,24,12] r=0 lpr=111325 pi=110607-111324/13 crt=110001'276247 lcod 0'0 mlcod 0'0 active+recovering+degraded m=1 u=1] mark_all_unfound_lost l_revert
0> 2016-06-13 14:32:06.690652 7f7737fcd700 -1 osd/ReplicatedPG.cc: In function 'eversion_t ReplicatedPG::pick_newest_available(const hobject_t&)' thread 7f7737fcd700 time 2016-06-13 14:32:06.686255
osd/ReplicatedPG.cc: 9629: FAILED assert(is_backfill_targets(peer))

ceph version 10.2.1 (3a66dd4f30852819c1bdaa8ec23c795d4ad77269)
1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x85) [0x5598d9cd45b5]
2: (ReplicatedPG::pick_newest_available(hobject_t const&)+0x285) [0x5598d9785f65]
3: (ReplicatedPG::mark_all_unfound_lost(int, boost::intrusive_ptr&lt;Connection&gt;, unsigned long)+0xbcb) [0x5598d978716b]
4: (ReplicatedPG::do_command(std::map&lt;std::string, boost::variant&lt;std::string, bool, long, double, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >, boost::detail::variant::void_, boost::detail::
variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_,
boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::v
ariant::void_>, std::less&lt;std::string&gt;, std::allocator&lt;std::pair&lt;std::string const, boost::variant&lt;std::string, bool, long, double, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >, boost::detail:
:variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > >, std::ostream&, ceph::buffer::list&, ceph::buffer::list&, boost::intrusive_ptr&lt;Connection&gt;, unsigned long)+0xbd3) [0x5598d97e2e93]
5: (OSD::do_command(Connection*, unsigned long, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >&, ceph::buffer::list&)+0x2a1d) [0x5598d9660afd]
6: (OSD::CommandWQ::_process(OSD::Command*, ThreadPool::TPHandle&)+0x69) [0x5598d96b3f19]
7: (ThreadPool::worker(ThreadPool::WorkThread*)+0xa7e) [0x5598d9cc55fe]
8: (ThreadPool::WorkThread::entry()+0x10) [0x5598d9cc64e0]
9: (()+0x7dc5) [0x7f775cf3fdc5]
10: (clone()+0x6d) [0x7f775b5cb28d]
NOTE: a copy of the executable, or `objdump -rdS &lt;executable&gt;` is needed to interpret this.

--- logging levels ---
0/ 5 none
0/ 1 lockdep
0/ 1 context
1/ 1 crush
1/ 5 mds
1/ 5 mds_balancer
1/ 5 mds_locker
1/ 5 mds_log
1/ 5 mds_log_expire
1/ 5 mds_migrator
0/ 1 buffer
0/ 1 timer
0/ 1 filer
0/ 1 striper
0/ 1 objecter
0/ 5 rados
0/ 5 rbd
0/ 5 rbd_mirror
0/ 5 rbd_replay
0/ 5 journaler
0/ 5 objectcacher
0/ 5 client
0/ 5 osd
0/ 5 optracker
0/ 5 objclass
1/ 3 filestore
1/ 3 journal
0/ 5 ms
1/ 5 mon
0/10 monc
1/ 5 paxos
0/ 5 tp
1/ 5 auth
1/ 5 crypto
1/ 1 finisher
1/ 5 heartbeatmap
1/ 5 perfcounter
1/ 5 rgw
1/10 civetweb
1/ 5 javaclient
1/ 5 asok
1/ 1 throttle
0/ 0 refs
1/ 5 xio
1/ 5 compressor
1/ 5 newstore
1/ 5 bluestore
1/ 5 bluefs
1/ 3 bdev
1/ 5 kstore
4/ 5 rocksdb
4/ 5 leveldb
1/ 5 kinetic
1/ 5 fuse
2/-2 (syslog threshold)
-1/-1 (stderr threshold)
max_recent 10000
max_new 1000
log_file /var/log/ceph/ceph-osd.4.log
--
end dump of recent events ---
2016-06-13 14:32:06.757251 7f7737fcd700 -1 ** Caught signal (Aborted) *
in thread 7f7737fcd700 thread_name:tp_osd_cmd

ceph version 10.2.1 (3a66dd4f30852819c1bdaa8ec23c795d4ad77269)
1: (()+0x91241a) [0x5598d9bd441a]
2: (()+0xf100) [0x7f775cf47100]
3: (gsignal()+0x37) [0x7f775b50a5f7]
4: (abort()+0x148) [0x7f775b50bce8]
5: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x267) [0x5598d9cd4797]
6: (ReplicatedPG::pick_newest_available(hobject_t const&)+0x285) [0x5598d9785f65]
7: (ReplicatedPG::mark_all_unfound_lost(int, boost::intrusive_ptr&lt;Connection&gt;, unsigned long)+0xbcb) [0x5598d978716b]
8: (ReplicatedPG::do_command(std::map&lt;std::string, boost::variant&lt;std::string, bool, long, double, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::less&lt;std::string&gt;, std::allocator&lt;std::pair&lt;std::string const, boost::variant&lt;std::string, bool, long, double, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > >, std::ostream&, ceph::buffer::list&, ceph::buffer::list&, boost::intrusive_ptr&lt;Connection&gt;, unsigned long)+0xbd3) [0x5598d97e2e93]
9: (OSD::do_command(Connection*, unsigned long, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >&, ceph::buffer::list&)+0x2a1d) [0x5598d9660afd]
10: (OSD::CommandWQ::_process(OSD::Command*, ThreadPool::TPHandle&)+0x69) [0x5598d96b3f19]
11: (ThreadPool::worker(ThreadPool::WorkThread*)+0xa7e) [0x5598d9cc55fe]
12: (ThreadPool::WorkThread::entry()+0x10) [0x5598d9cc64e0]
13: (()+0x7dc5) [0x7f775cf3fdc5]
14: (clone()+0x6d) [0x7f775b5cb28d]
NOTE: a copy of the executable, or `objdump -rdS &lt;executable&gt;` is needed to interpret this.

--- begin dump of recent events ---
0> 2016-06-13 14:32:06.757251 7f7737fcd700 -1 ** Caught signal (Aborted) *
in thread 7f7737fcd700 thread_name:tp_osd_cmd

ceph version 10.2.1 (3a66dd4f30852819c1bdaa8ec23c795d4ad77269)
1: (()+0x91241a) [0x5598d9bd441a]
2: (()+0xf100) [0x7f775cf47100]
3: (gsignal()+0x37) [0x7f775b50a5f7]
4: (abort()+0x148) [0x7f775b50bce8]
5: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x267) [0x5598d9cd4797]
6: (ReplicatedPG::pick_newest_available(hobject_t const&)+0x285) [0x5598d9785f65]
7: (ReplicatedPG::mark_all_unfound_lost(int, boost::intrusive_ptr&lt;Connection&gt;, unsigned long)+0xbcb) [0x5598d978716b]
8: (ReplicatedPG::do_command(std::map&lt;std::string, boost::variant&lt;std::string, bool, long, double, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::less&lt;std::string&gt;, std::allocator&lt;std::pair&lt;std::string const, boost::variant&lt;std::string, bool, long, double, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > >, std::ostream&, ceph::buffer::list&, ceph::buffer::list&, boost::intrusive_ptr&lt;Connection&gt;, unsigned long)+0xbd3) [0x5598d97e2e93]
9: (OSD::do_command(Connection*, unsigned long, std::vector&lt;std::string, std::allocator&lt;std::string&gt; >&, ceph::buffer::list&)+0x2a1d) [0x5598d9660afd]
10: (OSD::CommandWQ::_process(OSD::Command*, ThreadPool::TPHandle&)+0x69) [0x5598d96b3f19]
11: (ThreadPool::worker(ThreadPool::WorkThread*)+0xa7e) [0x5598d9cc55fe]
12: (ThreadPool::WorkThread::entry()+0x10) [0x5598d9cc64e0]
13: (()+0x7dc5) [0x7f775cf3fdc5]
14: (clone()+0x6d) [0x7f775b5cb28d]
NOTE: a copy of the executable, or `objdump -rdS &lt;executable&gt;` is needed to interpret this.

--- logging levels ---
0/ 5 none
0/ 1 lockdep
0/ 1 context
1/ 1 crush
1/ 5 mds
1/ 5 mds_balancer
1/ 5 mds_locker
1/ 5 mds_log
1/ 5 mds_log_expire
1/ 5 mds_migrator
0/ 1 buffer
0/ 1 timer
0/ 1 filer
0/ 1 striper
0/ 1 objecter
0/ 5 rados
0/ 5 rbd
0/ 5 rbd_mirror
0/ 5 rbd_replay
0/ 5 journaler
0/ 5 objectcacher
0/ 5 client
0/ 5 osd
0/ 5 optracker
0/ 5 objclass
1/ 3 filestore
1/ 3 journal
0/ 5 ms
1/ 5 mon
0/10 monc
1/ 5 paxos
0/ 5 tp
1/ 5 auth
1/ 5 crypto
1/ 1 finisher
1/ 5 heartbeatmap
1/ 5 perfcounter
1/ 5 rgw
1/10 civetweb
1/ 5 javaclient
1/ 5 asok
1/ 1 throttle
0/ 0 refs
1/ 5 xio
1/ 5 compressor
1/ 5 newstore
1/ 5 bluestore
1/ 5 bluefs
1/ 3 bdev
1/ 5 kstore
4/ 5 rocksdb
4/ 5 leveldb
1/ 5 kinetic
1/ 5 fuse
2/-2 (syslog threshold)
-1/-1 (stderr threshold)
max_recent 10000
max_new 1000
log_file /var/log/ceph/ceph-osd.4.log
--
end dump of recent events ---
2016-06-13 14:32:07.535021 7f700dd31800 0 set uid:gid to 167:167 (ceph:ceph)
2016-06-13 14:32:07.535060 7f700dd31800 0 ceph version 10.2.1 (3a66dd4f30852819c1bdaa8ec23c795d4ad77269), process ceph-osd, pid 8216
2016-06-13 14:32:07.536112 7f700dd31800 0 pidfile_write: ignore empty --pid-file
2016-06-13 14:32:07.597413 7f700dd31800 0 filestore(/var/lib/ceph/osd/ceph-4) backend xfs (magic 0x58465342)
2016-06-13 14:32:07.597936 7f700dd31800 0 genericfilestorebackend(/var/lib/ceph/osd/ceph-4) detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option
2016-06-13 14:32:07.597947 7f700dd31800 0 genericfilestorebackend(/var/lib/ceph/osd/ceph-4) detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option
2016-06-13 14:32:07.597968 7f700dd31800 0 genericfilestorebackend(/var/lib/ceph/osd/ceph-4) detect_features: splice is supported
2016-06-13 14:32:07.624301 7f700dd31800 0 genericfilestorebackend(/var/lib/ceph/osd/ceph-4) detect_features: syncfs(2) syscall fully supported (by glibc and kernel)
2016-06-13 14:32:07.624486 7f700dd31800 0 xfsfilestorebackend(/var/lib/ceph/osd/ceph-4) detect_feature: extsize is disabled by conf
2016-06-13 14:32:07.625850 7f700dd31800 1 leveldb: Recovering log #21458
2016-06-13 14:32:07.634681 7f700dd31800 1 leveldb: Level-0 table #21460: started
2016-06-13 14:32:07.646981 7f700dd31800 1 leveldb: Level-0 table #21460: 473566 bytes OK
2016-06-13 14:32:07.649036 7f700dd31800 1 leveldb: Delete type=0 #21458

2016-06-13 14:32:07.649682 7f700dd31800 1 leveldb: Delete type=3 #21456

2016-06-13 14:32:07.649881 7f7001a97700 1 leveldb: Compacting 4@0 + 6@1 files
2016-06-13 14:32:07.650710 7f700dd31800 0 filestore(/var/lib/ceph/osd/ceph-4) mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled
2016-06-13 14:32:07.659924 7f700dd31800 -1 journal FileJournal::_open: disabling aio for non-block journal. Use journal_force_aio to force use of aio anyway
2016-06-13 14:32:07.659939 7f700dd31800 1 journal _open /var/lib/ceph/osd/ceph-4/journal fd 19: 13107200000 bytes, block size 4096 bytes, directio = 1, aio = 0
2016-06-13 14:32:07.748340 7f700dd31800 1 journal _open /var/lib/ceph/osd/ceph-4/journal fd 19: 13107200000 bytes, block size 4096 bytes, directio = 1, aio = 0
2016-06-13 14:32:07.751862 7f700dd31800 1 filestore(/var/lib/ceph/osd/ceph-4) upgrade
2016-06-13 14:32:07.754079 7f700dd31800 0 <cls> cls/cephfs/cls_cephfs.cc:202: loading cephfs_size_scan
2016-06-13 14:32:07.763278 7f700dd31800 0 <cls> cls/hello/cls_hello.cc:305: loading cls_hello
2016-06-13 14:32:07.763961 7f700dd31800 0 osd.4 111327 crush map has features 283743827001344, adjusting msgr requires for clients
2016-06-13 14:32:07.763976 7f700dd31800 0 osd.4 111327 crush map has features 283743827001344 was 8705, adjusting msgr requires for mons
2016-06-13 14:32:07.763982 7f700dd31800 0 osd.4 111327 crush map has features 283743827001344, adjusting msgr requires for osds
2016-06-13 14:32:07.786580 7f7001a97700 1 leveldb: Generated table #21462: 109273 keys, 2128293 bytes
2016-06-13 14:32:07.892539 7f7001a97700 1 leveldb: Generated table #21463: 57467 keys, 2135285 bytes
2016-06-13 14:32:07.975351 7f7001a97700 1 leveldb: Generated table #21464: 46829 keys, 1866162 bytes
2016-06-13 14:32:08.075505 7f7001a97700 1 leveldb: Generated table #21465: 59611 keys, 2136960 bytes
2016-06-13 14:32:08.153036 7f7001a97700 1 leveldb: Generated table #21466: 41985 keys, 1699612 bytes
2016-06-13 14:32:08.169755 7f7001a97700 1 leveldb: Generated table #21467: 6120 keys, 87650 bytes
2016-06-13 14:32:08.169860 7f7001a97700 1 leveldb: Compacted 4@0 + 6@1 files => 10053962 bytes
2016-06-13 14:32:08.170005 7f7001a97700 1 leveldb: compacted to: files[ 0 6 26 0 0 0 0 ]
2016-06-13 14:32:08.170138 7f7001a97700 1 leveldb: Delete type=2 #21452

2016-06-13 14:32:08.170741 7f7001a97700 1 leveldb: Delete type=2 #21454

2016-06-13 14:32:08.171084 7f7001a97700 1 leveldb: Delete type=2 #21443

2016-06-13 14:32:08.171916 7f7001a97700 1 leveldb: Delete type=2 #21441

2016-06-13 14:32:08.172641 7f7001a97700 1 leveldb: Delete type=2 #21439

2016-06-13 14:32:08.173621 7f7001a97700 1 leveldb: Delete type=2 #21442

2016-06-13 14:32:08.174542 7f7001a97700 1 leveldb: Delete type=2 #21457

2016-06-13 14:32:08.175118 7f7001a97700 1 leveldb: Delete type=2 #21440

2016-06-13 14:32:08.176021 7f7001a97700 1 leveldb: Delete type=2 #21445

2016-06-13 14:32:08.176314 7f7001a97700 1 leveldb: Delete type=2 #21460

2016-06-13 14:32:08.667875 7f700dd31800 0 osd.4 111327 load_pgs
2016-06-13 14:32:13.266122 7f700dd31800 0 osd.4 111327 load_pgs opened 253 pgs
2016-06-13 14:32:13.266183 7f700dd31800 0 osd.4 111327 using 0 op queue with priority op cut off at 64.
2016-06-13 14:32:13.266892 7f700dd31800 1 osd.4 111327 log_to_monitors {default=true}
2016-06-13 14:32:13.344664 7f700dd31800 0 osd.4 111327 done with init, starting boot process
2016-06-13 14:32:15.031258 7f6fda14c700 0 -
172.19.20.8:6849/8216 >> 172.19.20.2:6809/7338 pipe(0x560dd919d400 sd=229 :6849 s=0 pgs=0 cs=0 l=0 c=0x560dd8fb2600).accept connect_seq 0 vs existing 0 state connecting
2016-06-13 14:32:16.686809 7f6fe68db700 0 osd.4 111330 do_command r=0 pg has no unfound objects
2016-06-13 14:32:16.686826 7f6fe68db700 0 log_channel(cluster) log [INF] : pg has no unfound objects
2016-06-13 14:32:16.791678 7f6fed8e9700 1 log_channel(cluster) log [ERR] : 20.69 has 306 objects unfound and apparently lost
2016-06-13 14:32:16.792162 7f6fed0e8700 -1 log_channel(cluster) log [ERR] : 21.15f has 47 objects unfound and apparently lost
2016-06-13 14:32:16.792342 7f6fed0e8700 -1 log_channel(cluster) log [ERR] : 21.70 has 36 objects unfound and apparently lost
2016-06-13 14:32:16.796433 7f6fed0e8700 -1 log_channel(cluster) log [ERR] : 21.13b has 36 objects unfound and apparently lost
2016-06-13 14:32:18.588087 7f6fd610c700 0 -
172.19.20.8:6849/8216 >> 172.19.20.8:6801/2007 pipe(0x560dd9a7e800 sd=256 :6849 s=0 pgs=0 cs=0 l=0 c=0x560dd8fb2900).accept connect_seq 0 vs existing 0 state connecting
2016-06-13 14:32:19.372790 7f6fe68db700 0 osd.4 111331 do_command r=0
2016-06-13 14:32:19.387598 7f6fe68db700 0 osd.4 111331 do_command r=0
2016-06-13 14:35:50.304210 7f6fe90e0700 0 log_channel(cluster) log [INF] : 21.57 scrub starts
2016-06-13 14:36:03.837358 7f6feb8e5700 0 log_channel(cluster) log [INF] : 21.57 scrub ok


Related issues

Duplicates RADOS - Bug #18165: OSD crash with osd/ReplicatedPG.cc: 8485: FAILED assert(is_backfill_targets(peer)) Resolved 12/07/2016

History

#1 Updated by Max Vernimmen almost 8 years ago

Upgrading to 10.2.2 resulted in no more unfound objects to be reported by 'ceph status'.

#2 Updated by Samuel Just over 7 years ago

  • Priority changed from Normal to Urgent

#3 Updated by Samuel Just over 7 years ago

  • Status changed from New to Need More Info
  • Priority changed from Urgent to High

Sorry for the delay on this. If the situation is resolved, I guess you can't reproduce the crash with
debug osd = 20
debug filestore = 20
debug ms = 1 ?

#4 Updated by Samuel Just over 7 years ago

  • Status changed from Need More Info to Can't reproduce

#5 Updated by David Zafman almost 7 years ago

  • Duplicates Bug #18165: OSD crash with osd/ReplicatedPG.cc: 8485: FAILED assert(is_backfill_targets(peer)) added

#6 Updated by David Zafman almost 7 years ago

  • Status changed from Can't reproduce to Duplicate

Also available in: Atom PDF