Project

General

Profile

Actions

Bug #40641

open

OSD failure after PGInfo back to previous versions, resulting in PGLog error rollback

Added by tao ning almost 5 years ago. Updated almost 5 years ago.

Status:
Need More Info
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(RADOS):
OSD
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

Ceph Version 12.2.7


ceph pg 17.27 list_missing
{
    "offset": {
        "oid": "",
        "key": "",
        "snapid": 0,
        "hash": 0,
        "max": 0,
        "pool": -9223372036854775808,
        "namespace": "" 
    },
    "num_missing": 2,
    "num_unfound": 2,
    "objects": [
        {
            "oid": {
                "oid": "100000057b9.00000006",
                "key": "",
                "snapid": -2,
                "hash": 3984438439,
                "max": 0,
                "pool": 17,
                "namespace": "" 
            },
            "need": "2900'29100",
            "have": "0'0",
            "flags": "none",
            "locations": [
                "8(4)" 
            ]
        },
        {
            "oid": {
                "oid": "10000009247.00000013",
                "key": "",
                "snapid": -2,
                "hash": 3428461991,
                "max": 0,
                "pool": 17,
                "namespace": "" 
            },
            "need": "3013'34090",
            "have": "0'0",
            "flags": "none",
            "locations": [
                "8(4)" 
            ]
        }
    ],
    "more": false
}

osd.8 pg 17.27s4

2019-06-27 10:03:10.096009 7fbddac47700  1 -- 10.252.254.131:6811/30921 --> 10.252.254.131:6820/9274 -- MOSDECSubOpWriteReply(17.27s0 3013/3003 ECSubWriteReply(tid=450308, last_complete=3013'34106, committed=1, applied=1)) v2 -- 0x561f14fc7d40 con 0

2019-06-27 10:03:10.236438 7fbdcdc2d700 10 osd.8 3013 dequeue_op 0x561f17f31b00 prio 127 cost 0 latency 0.000055 MOSDECSubOpRead(17.27s4 3013/3003 ECSubRead(tid=450328, to_read={17:e52dbeb7:::100000057b9.00000006:head=524288,16384,0}, attrs_to_read=)) v3 pg pg[17.27s4( v 3013'34106 (2970'32534,3013'34106] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=4 lpr=3003 luod=0'0 crt=3013'34105 active]

## OSD failure (execute reboot -f), PGInfo change, last update 3013' 34106-> 3013'34104
2019-06-27 10:06:27.600578 7fb92f16cd80  5 osd.8 pg_epoch: 3013 pg[17.27s4( v 3013'34104 (2970'32534,3013'34104] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=4 lpr=0 crt=3013'34093 lcod 0'0 unknown NOTIFY] enter Reset
2019-06-27 10:06:27.600588 7fb92f16cd80 10 osd.8 pg_epoch: 3013 pg[17.27s4( v 3013'34104 (2970'32534,3013'34104] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=4 lpr=3013 crt=3013'34093 lcod 0'0 unknown NOTIFY] Clearing blocked outgoing recovery messages
2019-06-27 10:06:27.600596 7fb92f16cd80 10 osd.8 pg_epoch: 3013 pg[17.27s4( v 3013'34104 (2970'32534,3013'34104] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=4 lpr=3013 crt=3013'34093 lcod 0'0 unknown NOTIFY] Not blocking outgoing recovery messages
2019-06-27 10:06:27.600605 7fb92f16cd80 10 osd.8 3013 load_pgs loaded pg[17.27s4( v 3013'34104 (2970'32534,3013'34104] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=4 lpr=3013 crt=3013'34093 lcod 0'0 unknown NOTIFY] log((2970'32534,3013'34104], crt=3013'34093)

osd.6 PG 17.27s0

## Complete the 3013'34106 version write Op 
2019-06-27 10:03:10.085504 7efe69609700 10 osd.6 pg_epoch: 3013 pg[17.27s0( v 3013'34105 (2970'32534,3013'34105] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=0 lpr=3003 crt=3013'34104 lcod 3013'34104 mlcod 3013'34104 active+clean] start_rmw: Op(17:e5b45a33:::10000009247.00000013:head v=3013'34106 tt=2970'32534 tid=450308 reqid=client.1141606.0:718483 client_op=osd_op(client.1141606.0:718483 17.27s0 17:e5b45a33:::10000009247.00000013:head [write 3670016~524288] snapc 1=[] gid 7 raft_term  152 raft_version 718483 ondisk+write+known_if_redirected e3013) roll_forward_to=3013'34105 temp_added= temp_cleared= pending_read={} remote_read={} remote_read_result={} pending_apply= pending_commit= plan.to_read={} plan.will_write={17:e5b45a33:::10000009247.00000013:head=[3670016~524288]})
2019-06-27 10:03:10.085523 7efe69609700 10 osd.6 pg_epoch: 3013 pg[17.27s0( v 3013'34105 (2970'32534,3013'34105] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=0 lpr=3003 crt=3013'34104 lcod 3013'34104 mlcod 3013'34104 active+clean] try_state_to_reads: Op(17:e5b45a33:::10000009247.00000013:head v=3013'34106 tt=2970'32534 tid=450308 reqid=client.1141606.0:718483 client_op=osd_op(client.1141606.0:718483 17.27s0 17:e5b45a33:::10000009247.00000013:head [write 3670016~524288] snapc 1=[] gid 7 raft_term  152 raft_version 718483 ondisk+write+known_if_redirected e3013) roll_forward_to=3013'34105 temp_added= temp_cleared= pending_read={} remote_read={} remote_read_result={} pending_apply= pending_commit= plan.to_read={} plan.will_write={17:e5b45a33:::10000009247.00000013:head=[3670016~524288]})
2019-06-27 10:03:10.085537 7efe69609700 10 osd.6 pg_epoch: 3013 pg[17.27s0( v 3013'34105 (2970'32534,3013'34105] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=0 lpr=3003 crt=3013'34104 lcod 3013'34104 mlcod 3013'34104 active+clean] try_reads_to_commit: starting commit on Op(17:e5b45a33:::10000009247.00000013:head v=3013'34106 tt=2970'32534 tid=450308 reqid=client.1141606.0:718483 client_op=osd_op(client.1141606.0:718483 17.27s0 17:e5b45a33:::10000009247.00000013:head [write 3670016~524288] snapc 1=[] gid 7 raft_term  152 raft_version 718483 ondisk+write+known_if_redirected e3013) roll_forward_to=3013'34105 temp_added= temp_cleared= pending_read={} remote_read={} remote_read_result={} pending_apply= pending_commit= plan.to_read={} plan.will_write={17:e5b45a33:::10000009247.00000013:head=[3670016~524288]})
2019-06-27 10:03:10.086438 7efe69609700 10 osd.6 pg_epoch: 3013 pg[17.27s0( v 3013'34105 (2970'32534,3013'34105] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=0 lpr=3003 crt=3013'34104 lcod 3013'34104 mlcod 3013'34104 active+clean] append_log log((2970'32534,3013'34105], crt=3013'34104) [3013'34106 (3013'34090) modify   17:e5b45a33:::10000009247.00000013:head by client.1141606.0:718483 2019-06-27 10:03:10.084723 0 raft_grpupid=7 raft_term=152]
2019-06-27 10:03:10.086464 7efe69609700 10 osd.6 pg_epoch: 3013 pg[17.27s0( v 3013'34106 (2970'32534,3013'34106] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=0 lpr=3003 luod=3013'34105 lua=3013'34105 crt=3013'34104 lcod 3013'34104 mlcod 3013'34104 active+clean] add_log_entry 3013'34106 (3013'34090) modify   17:e5b45a33:::10000009247.00000013:head by client.1141606.0:718483 2019-06-27 10:03:10.084723 0 raft_grpupid=7 raft_term=152
2019-06-27 10:03:10.086836 7efe8b60a700 10 -- 10.252.254.131:6820/9274 >> 10.252.254.131:6811/30921 conn(0x561a13707000 :6820 s=STATE_OPEN_TAG_ACK pgs=22 cs=1 l=0).handle_ack got ack seq 41418 >= 41418 on 0x561a1e744300 MOSDECSubOpWrite(17.27s4 3013/3003 ECSubWrite(tid=450308, reqid=client.1141606.0:718483, at_version=3013'34106, trim_to=2970'32534, roll_forward_to=3013'34105)) v2
2019-06-27 10:03:10.123953 7efe6ae0c700 10 osd.6 pg_epoch: 3013 pg[17.27s0( v 3013'34106 (2970'32534,3013'34106] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=0 lpr=3003 luod=3013'34105 crt=3013'34105 lcod 3013'34104 mlcod 3013'34104 active+clean] handle_sub_write_reply Calling on_all_commit on Op(17:e5b45a33:::10000009247.00000013:head v=3013'34106 tt=2970'32534 tid=450308 reqid=client.1141606.0:718483 client_op=osd_op(client.1141606.0:718483 17.27s0 17:e5b45a33:::10000009247.00000013:head [write 3670016~524288] snapc 1=[] gid 7 raft_term  152 raft_version 718483 ondisk+write+known_if_redirected e3013) roll_forward_to=3013'34105 temp_added= temp_cleared= pending_read={} remote_read={} remote_read_result={} pending_apply= pending_commit= plan.to_read={} plan.will_write={17:e5b45a33:::10000009247.00000013:head=[3670016~524288]})
2019-06-27 10:03:10.124011 7efe6ae0c700  3 osd.6 pg_epoch: 3013 pg[17.27s0( v 3013'34106 (2970'32534,3013'34106] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 2997/3003/2863) [6,2,9,13,8,1]p6(0) r=0 lpr=3003 crt=3013'34105 lcod 3013'34105 mlcod 3013'34104 active+clean]  sending reply on osd_op(client.1141606.0:718483 17.27s0 17:e5b45a33:::10000009247.00000013:head [write 3670016~524288] snapc 1=[] gid 7 raft_term  152 raft_version 718483 ondisk+write+known_if_redirected e3013) v9 0x561a134898c0

## proc master log(osd.8), last_update 3013'34104
2019-06-27 10:06:52.478860 7f06788a4700 10 osd.6 pg_epoch: 3027 pg[17.27s0( v 3013'34106 (2970'32534,3013'34106] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 3027/3027/3027) [6,2,2147483647,2147483647,8,1]p6(0) r=0 lpr=3027 pi=[3003,3027)/1 crt=3013'34105 lcod 0'0 mlcod 0'0 peering] state<Started/Primary/Peering/GetLog>:  requesting log from osd.8(4)
2019-06-27 10:06:52.490276 7f06780a3700 10 osd.6 pg_epoch: 3027 pg[17.27s0( v 3013'34106 (2970'32534,3013'34106] local-lis/les=3003/3004 n=4837 ec=375/375 lis/c 3003/3003 les/c/f 3004/3004/0 3027/3027/3027) [6,2,2147483647,2147483647,8,1]p6(0) r=0 lpr=3027 pi=[3003,3027)/1 crt=3013'34105 lcod 0'0 mlcod 0'0 peering] proc_master_log for osd.8(4): log((3013'34104,3013'34104], crt=3013'34093) missing(0 may_include_deletes = 1)
2019-06-27 10:06:52.490793 7f06780a3700 10 rewind_divergent_log future divergent 3013'34105 (2900'29100) modify   17:e52dbeb7:::100000057b9.00000006:head by client.1494123.0:89636 2019-06-27 10:03:08.788908 0 raft_grpupid=5 raft_term=46
2019-06-27 10:06:52.526560 7f06788a4700 10 _merge_object_divergent_entries: hoid 17:e52dbeb7:::100000057b9.00000006:head prior_version: 2900'29100 first_divergent_update: 3013'34105 last_divergent_update: 3013'34105
2019-06-27 10:06:52.526567 7f06788a4700 10 _merge_object_divergent_entries: hoid 17:e52dbeb7:::100000057b9.00000006:head has no more recent entries in log
2019-06-27 10:06:52.526570 7f06788a4700 10 _merge_object_divergent_entries: hoid 17:e52dbeb7:::100000057b9.00000006:head must be rolled back or recovered, attempting to rollback
2019-06-27 10:06:52.526572 7f06788a4700 10 _merge_object_divergent_entries: hoid 17:e52dbeb7:::100000057b9.00000006:head cannot rollback 3013'34105 (2900'29100) modify   17:e52dbeb7:::100000057b9.00000006:head by client.1494123.0:89636 2019-06-27 10:03:08.788908 0 raft_grpupid=5 raft_term=46
2019-06-27 10:06:52.526586 7f06788a4700 10 _merge_object_divergent_entries: hoid 17:e52dbeb7:::100000057b9.00000006:head cannot roll back, removing and adding to missing
2019-06-27 10:06:52.526593 7f06788a4700 10 _merge_object_divergent_entries: hoid 17:e52dbeb7:::100000057b9.00000006:head prior_version 2900'29100 <= info.log_tail 2970'32534

2019-06-27 10:06:52.528628 7f067a0a7700 10 _merge_object_divergent_entries: hoid 17:e5b45a33:::10000009247.00000013:head prior_version: 3013'34090 first_divergent_update: 3013'34106 last_divergent_update: 3013'34106
2019-06-27 10:06:52.528629 7f067a0a7700 10 _merge_object_divergent_entries: hoid 17:e5b45a33:::10000009247.00000013:head has no more recent entries in log
2019-06-27 10:06:52.528630 7f067a0a7700 10 _merge_object_divergent_entries: hoid 17:e5b45a33:::10000009247.00000013:head must be rolled back or recovered, attempting to rollback
2019-06-27 10:06:52.528631 7f067a0a7700 10 _merge_object_divergent_entries: hoid 17:e5b45a33:::10000009247.00000013:head cannot rollback 3013'34106 (3013'34090) modify   17:e5b45a33:::10000009247.00000013:head by client.1141606.0:718483 2019-06-27 10:03:10.084723 0 raft_grpupid=7 raft_term=152
2019-06-27 10:06:52.528634 7f067a0a7700 10 _merge_object_divergent_entries: hoid 17:e5b45a33:::10000009247.00000013:head cannot roll back, removing and adding to missing

Actions

Also available in: Atom PDF