Bug #9727
Status: Closed — Affected version: 0.86 — EC + KV OSDs crashing
0%
Description
Hi, while testing our tiering setup with an EC + KV backend a bit further on the latest dev release, our OSDs started to crash after some hours:
ceph osd tree:
[root@ceph003 ~]# ceph osd tree # id weight type name up/down reweight -12 23.44 root default-cache -9 7.812 host ceph001-cache 1 3.906 osd.1 up 1 3 3.906 osd.3 up 1 -10 7.812 host ceph002-cache 17 3.906 osd.17 up 1 19 3.906 osd.19 up 1 -11 7.812 host ceph003-cache 33 3.906 osd.33 up 1 35 3.906 osd.35 up 1 -8 140.6 root default-ec -5 46.87 host ceph001-ec 4 3.906 osd.4 down 0 5 3.906 osd.5 down 0 6 3.906 osd.6 up 1 7 3.906 osd.7 down 0 8 3.906 osd.8 down 0 9 3.906 osd.9 down 0 10 3.906 osd.10 up 1 11 3.906 osd.11 up 1 12 3.906 osd.12 down 0 13 3.906 osd.13 down 0 14 3.906 osd.14 down 0 15 3.906 osd.15 down 0 -6 46.87 host ceph002-ec 20 3.906 osd.20 down 0 21 3.906 osd.21 up 1 22 3.906 osd.22 up 1 23 3.906 osd.23 down 0 24 3.906 osd.24 down 0 25 3.906 osd.25 down 0 26 3.906 osd.26 down 0 27 3.906 osd.27 down 0 28 3.906 osd.28 up 1 29 3.906 osd.29 down 0 30 3.906 osd.30 down 0 31 3.906 osd.31 down 0 -7 46.87 host ceph003-ec 36 3.906 osd.36 up 1 37 3.906 osd.37 up 1 38 3.906 osd.38 down 0 39 3.906 osd.39 down 0 40 3.906 osd.40 down 0 41 3.906 osd.41 up 1 42 3.906 osd.42 down 0 43 3.906 osd.43 down 0 44 3.906 osd.44 down 0 45 3.906 osd.45 up 1 46 3.906 osd.46 down 0 47 3.906 osd.47 down 0 -4 23.44 root default-ssd -1 7.812 host ceph001-ssd 0 3.906 osd.0 up 1 2 3.906 osd.2 up 1 -2 7.812 host ceph002-ssd 16 3.906 osd.16 up 1 18 3.906 osd.18 up 1 -3 7.812 host ceph003-ssd 32 3.906 osd.32 up 1 34 3.906 osd.34 up 1
The stack from the log file of a crashed OSD:
-37> 2014-10-10 16:13:28.624571 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 <== osd.6 10.143.8.180:0/44498 28286 ==== osd_ping(ping e169 stamp 2014-10-10 16:13:28.624015) v2 ==== 47+0+0 (401773946 0 0) 0xa3ee900 con 0x58f09a0 -36> 2014-10-10 16:13:28.624595 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 --> 10.143.8.180:0/44498 -- osd_ping(ping_reply e169 stamp 2014-10-10 16:13:28.624015) v2 -- ?+0 0x6e1a940 con 0x58f09a0 -35> 2014-10-10 16:13:28.660956 7f10d44c4700 1 -- 10.143.8.182:6831/1477 <== osd.21 10.141.8.181:0/39311 1186 ==== osd_ping(ping e169 stamp 2014-10-10 16:13:28.660226) v2 ==== 47+0+0 (2490584147 0 0) 0x73a2940 con 0x67644c0 -34> 2014-10-10 16:13:28.660978 7f10d44c4700 1 -- 10.143.8.182:6831/1477 --> 10.141.8.181:0/39311 -- osd_ping(ping_reply e169 stamp 2014-10-10 16:13:28.660226) v2 -- ?+0 0xc223fc0 con 0x67644c0 -33> 2014-10-10 16:13:28.660991 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 <== osd.21 10.141.8.181:0/39311 1186 ==== osd_ping(ping e169 stamp 2014-10-10 16:13:28.660226) v2 ==== 47+0+0 (2490584147 0 0) 0x1bcaf080 con 0x1666a100 -32> 2014-10-10 16:13:28.661032 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 --> 10.141.8.181:0/39311 -- osd_ping(ping_reply e169 stamp 2014-10-10 16:13:28.660226) v2 -- ?+0 0xa3ee900 con 0x1666a100 -31> 2014-10-10 16:13:28.750100 7f10b7bc9700 1 -- 10.143.8.182:6830/1477 <== osd.21 10.143.8.181:6810/39311 21434 ==== MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32298, to_read={5fc9c59e/1000008bcd3.00000000/head//2=0,839072,152ac59e/1000008ce45.00000000/head//2=0,839072,2b0bc59e/100000ca195.00000000/head//2=0,839072,425bc59e/100000dc199.00000000/head//2=0,839072,288bc59e/1000001df08.00000000/head//2=0,839072}, attrs_to_read=)) v1 ==== 456+0+0 (259409574 0 0) 0x5b34ec0 con 0x5fb2ec0 -30> 2014-10-10 16:13:28.750119 7f10b7bc9700 5 -- op tracker -- seq: 1058580, time: 2014-10-10 16:13:28.750047, event: header_read, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32298, 
to_read={5fc9c59e/1000008bcd3.00000000/head//2=0,839072,152ac59e/1000008ce45.00000000/head//2=0,839072,2b0bc59e/100000ca195.00000000/head//2=0,839072,425bc59e/100000dc199.00000000/head//2=0,839072,288bc59e/1000001df08.00000000/head//2=0,839072}, attrs_to_read=)) -29> 2014-10-10 16:13:28.750131 7f10b7bc9700 5 -- op tracker -- seq: 1058580, time: 2014-10-10 16:13:28.750050, event: throttled, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32298, to_read={5fc9c59e/1000008bcd3.00000000/head//2=0,839072,152ac59e/1000008ce45.00000000/head//2=0,839072,2b0bc59e/100000ca195.00000000/head//2=0,839072,425bc59e/100000dc199.00000000/head//2=0,839072,288bc59e/1000001df08.00000000/head//2=0,839072}, attrs_to_read=)) -28> 2014-10-10 16:13:28.750141 7f10b7bc9700 5 -- op tracker -- seq: 1058580, time: 2014-10-10 16:13:28.750096, event: all_read, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32298, to_read={5fc9c59e/1000008bcd3.00000000/head//2=0,839072,152ac59e/1000008ce45.00000000/head//2=0,839072,2b0bc59e/100000ca195.00000000/head//2=0,839072,425bc59e/100000dc199.00000000/head//2=0,839072,288bc59e/1000001df08.00000000/head//2=0,839072}, attrs_to_read=)) -27> 2014-10-10 16:13:28.750151 7f10b7bc9700 5 -- op tracker -- seq: 1058580, time: 0.000000, event: dispatched, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32298, to_read={5fc9c59e/1000008bcd3.00000000/head//2=0,839072,152ac59e/1000008ce45.00000000/head//2=0,839072,2b0bc59e/100000ca195.00000000/head//2=0,839072,425bc59e/100000dc199.00000000/head//2=0,839072,288bc59e/1000001df08.00000000/head//2=0,839072}, attrs_to_read=)) -26> 2014-10-10 16:13:28.750186 7f10cf4ba700 5 -- op tracker -- seq: 1058580, time: 2014-10-10 16:13:28.750185, event: reached_pg, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32298, 
to_read={5fc9c59e/1000008bcd3.00000000/head//2=0,839072,152ac59e/1000008ce45.00000000/head//2=0,839072,2b0bc59e/100000ca195.00000000/head//2=0,839072,425bc59e/100000dc199.00000000/head//2=0,839072,288bc59e/1000001df08.00000000/head//2=0,839072}, attrs_to_read=)) -25> 2014-10-10 16:13:28.756626 7f10cf4ba700 1 -- 10.143.8.182:6830/1477 --> 10.143.8.181:6810/39311 -- MOSDECSubOpReadReply(2.1es0 169 ECSubReadReply(tid=32298, attrs_read=0)) v1 -- ?+0 0x21dc2300 con 0x5fb2ec0 -24> 2014-10-10 16:13:28.756800 7f10cf4ba700 5 -- op tracker -- seq: 1058580, time: 2014-10-10 16:13:28.756799, event: done, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32298, to_read={5fc9c59e/1000008bcd3.00000000/head//2=0,839072,152ac59e/1000008ce45.00000000/head//2=0,839072,2b0bc59e/100000ca195.00000000/head//2=0,839072,425bc59e/100000dc199.00000000/head//2=0,839072,288bc59e/1000001df08.00000000/head//2=0,839072}, attrs_to_read=)) -23> 2014-10-10 16:13:28.855939 7f10b7bc9700 1 -- 10.143.8.182:6830/1477 <== osd.21 10.143.8.181:6810/39311 21435 ==== MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32299, to_read={cf2cc59e/100000b5ec2.00000000/head//2=0,839072,c5acc59e/100000a6894.00000000/head//2=0,839072,c53dc59e/100000c62fd.00000000/head//2=0,839072,d11ec59e/1000005a942.00000000/head//2=0,839072,c88ec59e/100000e4348.00000000/head//2=0,839072}, attrs_to_read=)) v1 ==== 456+0+0 (3334140587 0 0) 0x5b306c0 con 0x5fb2ec0 -22> 2014-10-10 16:13:28.855968 7f10b7bc9700 5 -- op tracker -- seq: 1058581, time: 2014-10-10 16:13:28.855886, event: header_read, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32299, to_read={cf2cc59e/100000b5ec2.00000000/head//2=0,839072,c5acc59e/100000a6894.00000000/head//2=0,839072,c53dc59e/100000c62fd.00000000/head//2=0,839072,d11ec59e/1000005a942.00000000/head//2=0,839072,c88ec59e/100000e4348.00000000/head//2=0,839072}, attrs_to_read=)) -21> 2014-10-10 16:13:28.855981 7f10b7bc9700 5 -- op tracker -- seq: 1058581, time: 2014-10-10 16:13:28.855889, event: throttled, op: 
MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32299, to_read={cf2cc59e/100000b5ec2.00000000/head//2=0,839072,c5acc59e/100000a6894.00000000/head//2=0,839072,c53dc59e/100000c62fd.00000000/head//2=0,839072,d11ec59e/1000005a942.00000000/head//2=0,839072,c88ec59e/100000e4348.00000000/head//2=0,839072}, attrs_to_read=)) -20> 2014-10-10 16:13:28.855991 7f10b7bc9700 5 -- op tracker -- seq: 1058581, time: 2014-10-10 16:13:28.855934, event: all_read, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32299, to_read={cf2cc59e/100000b5ec2.00000000/head//2=0,839072,c5acc59e/100000a6894.00000000/head//2=0,839072,c53dc59e/100000c62fd.00000000/head//2=0,839072,d11ec59e/1000005a942.00000000/head//2=0,839072,c88ec59e/100000e4348.00000000/head//2=0,839072}, attrs_to_read=)) -19> 2014-10-10 16:13:28.856001 7f10b7bc9700 5 -- op tracker -- seq: 1058581, time: 0.000000, event: dispatched, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32299, to_read={cf2cc59e/100000b5ec2.00000000/head//2=0,839072,c5acc59e/100000a6894.00000000/head//2=0,839072,c53dc59e/100000c62fd.00000000/head//2=0,839072,d11ec59e/1000005a942.00000000/head//2=0,839072,c88ec59e/100000e4348.00000000/head//2=0,839072}, attrs_to_read=)) -18> 2014-10-10 16:13:28.856046 7f10cccb5700 5 -- op tracker -- seq: 1058581, time: 2014-10-10 16:13:28.856046, event: reached_pg, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32299, to_read={cf2cc59e/100000b5ec2.00000000/head//2=0,839072,c5acc59e/100000a6894.00000000/head//2=0,839072,c53dc59e/100000c62fd.00000000/head//2=0,839072,d11ec59e/1000005a942.00000000/head//2=0,839072,c88ec59e/100000e4348.00000000/head//2=0,839072}, attrs_to_read=)) -17> 2014-10-10 16:13:28.865364 7f10cccb5700 1 -- 10.143.8.182:6830/1477 --> 10.143.8.181:6810/39311 -- MOSDECSubOpReadReply(2.1es0 169 ECSubReadReply(tid=32299, attrs_read=0)) v1 -- ?+0 0x21dc2800 con 0x5fb2ec0 -16> 2014-10-10 16:13:28.865446 7f10cccb5700 5 -- op tracker -- seq: 1058581, time: 2014-10-10 16:13:28.865446, event: done, op: MOSDECSubOpRead(2.1es6 169 
ECSubRead(tid=32299, to_read={cf2cc59e/100000b5ec2.00000000/head//2=0,839072,c5acc59e/100000a6894.00000000/head//2=0,839072,c53dc59e/100000c62fd.00000000/head//2=0,839072,d11ec59e/1000005a942.00000000/head//2=0,839072,c88ec59e/100000e4348.00000000/head//2=0,839072}, attrs_to_read=)) -15> 2014-10-10 16:13:28.883745 7f10de654700 5 osd.47 169 tick -14> 2014-10-10 16:13:29.387688 7f10d44c4700 1 -- 10.143.8.182:6831/1477 <== osd.8 10.141.8.180:0/50167 28373 ==== osd_ping(ping e169 stamp 2014-10-10 16:13:29.383149) v2 ==== 47+0+0 (972166668 0 0) 0x19ecf260 con 0x644a7e0 -13> 2014-10-10 16:13:29.387722 7f10d44c4700 1 -- 10.143.8.182:6831/1477 --> 10.141.8.180:0/50167 -- osd_ping(ping_reply e169 stamp 2014-10-10 16:13:29.383149) v2 -- ?+0 0x73a2940 con 0x644a7e0 -12> 2014-10-10 16:13:29.387727 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 <== osd.8 10.141.8.180:0/50167 28373 ==== osd_ping(ping e169 stamp 2014-10-10 16:13:29.383149) v2 ==== 47+0+0 (972166668 0 0) 0x1c7005a0 con 0x58f3de0 -11> 2014-10-10 16:13:29.387782 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 --> 10.141.8.180:0/50167 -- osd_ping(ping_reply e169 stamp 2014-10-10 16:13:29.383149) v2 -- ?+0 0x1bcaf080 con 0x58f3de0 -10> 2014-10-10 16:13:29.439085 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 <== osd.37 10.141.8.182:0/39363 28314 ==== osd_ping(ping e169 stamp 2014-10-10 16:13:29.438577) v2 ==== 47+0+0 (3411033659 0 0) 0x9ac1680 con 0x6763860 -9> 2014-10-10 16:13:29.439117 7f10d5cc7700 1 -- 10.141.8.182:6846/1477 --> 10.141.8.182:0/39363 -- osd_ping(ping_reply e169 stamp 2014-10-10 16:13:29.438577) v2 -- ?+0 0x1c7005a0 con 0x6763860 -8> 2014-10-10 16:13:29.439134 7f10d44c4700 1 -- 10.143.8.182:6831/1477 <== osd.37 10.141.8.182:0/39363 28314 ==== osd_ping(ping e169 stamp 2014-10-10 16:13:29.438577) v2 ==== 47+0+0 (3411033659 0 0) 0x1b0eecc0 con 0x6764ba0 -7> 2014-10-10 16:13:29.439153 7f10d44c4700 1 -- 10.143.8.182:6831/1477 --> 10.141.8.182:0/39363 -- osd_ping(ping_reply e169 stamp 2014-10-10 16:13:29.438577) v2 -- ?+0 
0x19ecf260 con 0x6764ba0 -6> 2014-10-10 16:13:29.522560 7f10b7bc9700 1 -- 10.143.8.182:6830/1477 <== osd.21 10.143.8.181:6810/39311 21436 ==== MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32300, to_read={8d0fc59e/100000c66b4.00000000/head//2=0,839072,1fafc59e/10000087781.00000000/head//2=0,839072,bcd1d59e/1000005e23e.00000000/head//2=0,839072,c7d3d59e/10000004853.00000000/head//2=0,839072,20e3d59e/10000000447.00000791/head//2=0,839072}, attrs_to_read=)) v1 ==== 456+0+0 (1404608695 0 0) 0x5b37980 con 0x5fb2ec0 -5> 2014-10-10 16:13:29.522593 7f10b7bc9700 5 -- op tracker -- seq: 1058582, time: 2014-10-10 16:13:29.522506, event: header_read, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32300, to_read={8d0fc59e/100000c66b4.00000000/head//2=0,839072,1fafc59e/10000087781.00000000/head//2=0,839072,bcd1d59e/1000005e23e.00000000/head//2=0,839072,c7d3d59e/10000004853.00000000/head//2=0,839072,20e3d59e/10000000447.00000791/head//2=0,839072}, attrs_to_read=)) -4> 2014-10-10 16:13:29.522608 7f10b7bc9700 5 -- op tracker -- seq: 1058582, time: 2014-10-10 16:13:29.522508, event: throttled, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32300, to_read={8d0fc59e/100000c66b4.00000000/head//2=0,839072,1fafc59e/10000087781.00000000/head//2=0,839072,bcd1d59e/1000005e23e.00000000/head//2=0,839072,c7d3d59e/10000004853.00000000/head//2=0,839072,20e3d59e/10000000447.00000791/head//2=0,839072}, attrs_to_read=)) -3> 2014-10-10 16:13:29.522619 7f10b7bc9700 5 -- op tracker -- seq: 1058582, time: 2014-10-10 16:13:29.522554, event: all_read, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32300, to_read={8d0fc59e/100000c66b4.00000000/head//2=0,839072,1fafc59e/10000087781.00000000/head//2=0,839072,bcd1d59e/1000005e23e.00000000/head//2=0,839072,c7d3d59e/10000004853.00000000/head//2=0,839072,20e3d59e/10000000447.00000791/head//2=0,839072}, attrs_to_read=)) -2> 2014-10-10 16:13:29.522629 7f10b7bc9700 5 -- op tracker -- seq: 1058582, time: 0.000000, event: dispatched, op: MOSDECSubOpRead(2.1es6 169 
ECSubRead(tid=32300, to_read={8d0fc59e/100000c66b4.00000000/head//2=0,839072,1fafc59e/10000087781.00000000/head//2=0,839072,bcd1d59e/1000005e23e.00000000/head//2=0,839072,c7d3d59e/10000004853.00000000/head//2=0,839072,20e3d59e/10000000447.00000791/head//2=0,839072}, attrs_to_read=)) -1> 2014-10-10 16:13:29.522663 7f10cf4ba700 5 -- op tracker -- seq: 1058582, time: 2014-10-10 16:13:29.522663, event: reached_pg, op: MOSDECSubOpRead(2.1es6 169 ECSubRead(tid=32300, to_read={8d0fc59e/100000c66b4.00000000/head//2=0,839072,1fafc59e/10000087781.00000000/head//2=0,839072,bcd1d59e/1000005e23e.00000000/head//2=0,839072,c7d3d59e/10000004853.00000000/head//2=0,839072,20e3d59e/10000000447.00000791/head//2=0,839072}, attrs_to_read=)) 0> 2014-10-10 16:13:29.526506 7f10cf4ba700 -1 osd/ECBackend.cc: In function 'void ECBackend::handle_sub_read(pg_shard_t, ECSubRead&, ECSubReadReply*)' thread 7f10cf4ba700 time 2014-10-10 16:13:29.524269 osd/ECBackend.cc: 876: FAILED assert(0) ceph version 0.86 (97dcc0539dfa7dac3de74852305d51580b7b1f82) 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x85) [0xb79235] 2: (ECBackend::handle_sub_read(pg_shard_t, ECSubRead&, ECSubReadReply*)+0x77e) [0x9ecdfe] 3: (ECBackend::handle_message(std::tr1::shared_ptr<OpRequest>)+0x4b5) [0x9f59b5] 4: (ReplicatedPG::do_request(std::tr1::shared_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x26b) [0x80455b] 5: (OSD::dequeue_op(boost::intrusive_ptr<PG>, std::tr1::shared_ptr<OpRequest>, ThreadPool::TPHandle&)+0x405) [0x6719f5] 6: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x359) [0x671f89] 7: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x828) [0xb68e88] 8: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xb6afb0] 9: (()+0x7df3) [0x7f10e6e1fdf3] 10: (clone()+0x6d) [0x7f10e58e63dd] NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. 
--- logging levels --- 0/ 5 none 0/ 1 lockdep 0/ 1 context 1/ 1 crush 1/ 5 mds 1/ 5 mds_balancer 1/ 5 mds_locker 1/ 5 mds_log 1/ 5 mds_log_expire 1/ 5 mds_migrator 0/ 1 buffer 0/ 1 timer 0/ 1 filer 0/ 1 striper 0/ 1 objecter 0/ 5 rados 0/ 5 rbd 0/ 5 rbd_replay 0/ 5 journaler 0/ 5 objectcacher 0/ 5 client 0/ 5 osd 0/ 5 optracker 0/ 5 objclass 1/ 3 filestore 1/ 3 keyvaluestore 1/ 3 journal 0/ 5 ms 1/ 5 mon 0/10 monc 1/ 5 paxos 0/ 5 tp 1/ 5 auth 1/ 5 crypto 1/ 1 finisher 1/ 5 heartbeatmap 1/ 5 perfcounter 1/ 5 rgw 1/10 civetweb 1/ 5 javaclient 1/ 5 asok 1/ 1 throttle 0/ 0 refs -2/-2 (syslog threshold) -1/-1 (stderr threshold) max_recent 10000 max_new 1000 log_file /var/log/ceph/ceph-osd.47.log --- end dump of recent events --- 2014-10-10 16:13:29.644863 7f10cf4ba700 -1 *** Caught signal (Aborted) ** in thread 7f10cf4ba700 ceph version 0.86 (97dcc0539dfa7dac3de74852305d51580b7b1f82) 1: /usr/bin/ceph-osd() [0xa86f92] 2: (()+0xf130) [0x7f10e6e27130] 3: (gsignal()+0x39) [0x7f10e5825989] 4: (abort()+0x148) [0x7f10e5827098] 5: (__gnu_cxx::__verbose_terminate_handler()+0x165) [0x7f10e61389d5] 6: (()+0x5e946) [0x7f10e6136946] 7: (()+0x5e973) [0x7f10e6136973] 8: (()+0x5eb9f) [0x7f10e6136b9f] 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x27a) [0xb7942a] 10: (ECBackend::handle_sub_read(pg_shard_t, ECSubRead&, ECSubReadReply*)+0x77e) [0x9ecdfe] 11: (ECBackend::handle_message(std::tr1::shared_ptr<OpRequest>)+0x4b5) [0x9f59b5] 12: (ReplicatedPG::do_request(std::tr1::shared_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x26b) [0x80455b] 13: (OSD::dequeue_op(boost::intrusive_ptr<PG>, std::tr1::shared_ptr<OpRequest>, ThreadPool::TPHandle&)+0x405) [0x6719f5] 14: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x359) [0x671f89] 15: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x828) [0xb68e88] 16: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xb6afb0] 17: (()+0x7df3) [0x7f10e6e1fdf3] 18: (clone()+0x6d) [0x7f10e58e63dd] 
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. --- begin dump of recent events --- 0> 2014-10-10 16:13:29.644863 7f10cf4ba700 -1 *** Caught signal (Aborted) ** in thread 7f10cf4ba700 ceph version 0.86 (97dcc0539dfa7dac3de74852305d51580b7b1f82) 1: /usr/bin/ceph-osd() [0xa86f92] 2: (()+0xf130) [0x7f10e6e27130] 3: (gsignal()+0x39) [0x7f10e5825989] 4: (abort()+0x148) [0x7f10e5827098] 5: (__gnu_cxx::__verbose_terminate_handler()+0x165) [0x7f10e61389d5] 6: (()+0x5e946) [0x7f10e6136946] 7: (()+0x5e973) [0x7f10e6136973] 8: (()+0x5eb9f) [0x7f10e6136b9f] 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x27a) [0xb7942a] 10: (ECBackend::handle_sub_read(pg_shard_t, ECSubRead&, ECSubReadReply*)+0x77e) [0x9ecdfe] 11: (ECBackend::handle_message(std::tr1::shared_ptr<OpRequest>)+0x4b5) [0x9f59b5] 12: (ReplicatedPG::do_request(std::tr1::shared_ptr<OpRequest>&, ThreadPool::TPHandle&)+0x26b) [0x80455b] 13: (OSD::dequeue_op(boost::intrusive_ptr<PG>, std::tr1::shared_ptr<OpRequest>, ThreadPool::TPHandle&)+0x405) [0x6719f5] 14: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x359) [0x671f89] 15: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x828) [0xb68e88] 16: (ShardedThreadPool::WorkThreadSharded::entry()+0x10) [0xb6afb0] 17: (()+0x7df3) [0x7f10e6e1fdf3] 18: (clone()+0x6d) [0x7f10e58e63dd] NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
Thanks!
Updated by Kenneth Waegeman over 9 years ago
Will this issue be solved in Giant?
Updated by Haomai Wang over 9 years ago
Not sure; I'm still waiting for a crash on the master branch.
Updated by Dmitry Smirnov over 9 years ago
- Severity changed from 3 - minor to 2 - major
My prematurely upgraded to "Giant" cluster is practically wrecked by this problem.
Haomai, is there any additional information you need? Thanks.
Updated by Haomai Wang over 9 years ago
Hmm, KeyValueStore still isn't suitable for major version upgrades, so I'm not sure which problem you hit.
I'm more interested in whether you hit this on the master branch.
Updated by Dmitry Smirnov over 9 years ago
Well, I can apply selective patches on top of 0.87 but I'd be reluctant to deploy master branch cluster-wide...
All OSDs are crashing, filestore and keyvaluestore so I doubt that it is related to type of store...
I have two caching pools and two EC pools where it seems to happen on backfill very often.
Sometimes I'm getting yet another OSD down within minutes after restart and as soon as one goes down next crashes soon after...
Updated by Haomai Wang over 9 years ago
OK, it seems this is not the simple test problem I initially thought. Do you have more logs? I find there is a related bug associated with the EC code.
Updated by Haomai Wang over 9 years ago
- Status changed from New to Duplicate
Updated by Kenneth Waegeman over 9 years ago
Hi,
I tried this again on the new 0.88 release.
After about 30 minutes of testing, the EC+KV OSDs started crashing again.
But I can't seem to find a stack trace this time; all the OSDs seem to fail without a warning:
2014-11-14 15:58:56.599573 7f5b6edd6700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.181:6808/29246 pipe(0x4f19000 sd=72 :6810 s=0 pgs=0 cs=0 l=0 c=0x619d120).accept connect_seq 8 vs existing 7 state standby 2014-11-14 15:59:01.689848 7f5b6fbe4700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.182:6820/33968 pipe(0x5108000 sd=93 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2eb40).accept connect_seq 6 vs existing 5 state standby 2014-11-14 15:59:10.307606 7f5b7533b700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.181:6830/38099 pipe(0x50d8000 sd=103 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2d280).accept connect_seq 6 vs existing 5 state standby 2014-11-14 15:59:18.708032 7f5b77763700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.180:6816/36202 pipe(0x601c800 sd=108 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2d120).accept connect_seq 8 vs existing 7 state standby 2014-11-14 15:59:22.523790 7f5b74830700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.182:6830/38083 pipe(0x639a000 sd=66 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2cfc0).accept connect_seq 6 vs existing 5 state standby 2014-11-14 15:59:23.789143 7f5b6fae3700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.180:6826/41035 pipe(0x5f72000 sd=63 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2ce60).accept connect_seq 6 vs existing 5 state standby 2014-11-14 15:59:27.947819 7f5b6ffe8700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.182:6824/35567 pipe(0x5eb5800 sd=71 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2cd00).accept connect_seq 6 vs existing 5 state standby 2014-11-14 15:59:36.255208 7f5b7361e700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.182:6810/30079 pipe(0x4f22000 sd=82 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e29e40).accept connect_seq 6 vs existing 5 state standby 2014-11-14 15:59:39.822154 7f5b73119700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.180:6818/37133 pipe(0x4dbd800 sd=98 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2a260).accept connect_seq 6 vs existing 5 state standby 2014-11-14 15:59:41.189383 7f5b72b13700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.181:6828/37279 pipe(0x4db0000 sd=60 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2a100).accept 
connect_seq 6 vs existing 5 state standby 2014-11-14 16:03:49.590419 7f5b6fbe4700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.182:6820/33968 pipe(0x5108000 sd=93 :6810 s=2 pgs=143 cs=7 l=0 c=0x4e7b9c0).fault, initiating reconnect 2014-11-14 16:03:49.590641 7f5b700e9700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.182:6820/33968 pipe(0x5108000 sd=93 :6810 s=1 pgs=143 cs=8 l=0 c=0x4e7b9c0).fault 2014-11-14 16:03:49.592972 7f5b7715d700 0 -- 10.143.8.181:0/30008 >> 10.141.8.182:6821/33968 pipe(0x50f0000 sd=51 :0 s=1 pgs=0 cs=0 l=1 c=0x8da5120).fault 2014-11-14 16:03:49.594262 7f5b7806c700 0 -- 10.143.8.181:0/30008 >> 10.143.8.182:6821/33968 pipe(0xa662000 sd=53 :0 s=1 pgs=0 cs=0 l=1 c=0x8da4d00).fault 2014-11-14 16:04:07.356712 7f5b8ebbf700 0 osd.21 117 check_osdmap_features enabling on-disk ERASURE CODES compat feature 2014-11-14 16:04:07.374430 7f5b68f78700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.180:6830/43053 pipe(0x61cd800 sd=58 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e2c4c0).accept connect_seq 6 vs existing 6 state connecting 2014-11-14 16:04:07.382700 7f5b6c4ad700 0 -- 10.143.8.181:6810/30008 >> 10.143.8.182:6814/31619 pipe(0x50f0000 sd=73 :6810 s=0 pgs=0 cs=0 l=0 c=0x5e282c0).accept connect_seq 2 vs existing 2 state connecting 2014-11-14 16:04:08.360638 7f5b8d3bc700 0 osd.21 118 check_osdmap_features enabling on-disk ERASURE CODES compat feature (END) [root@ceph002 ~]# ceph status cluster 82766e04-585b-49a6-a0ac-c13d9ffd0a7d health HEALTH_WARN 64 pgs down 64 pgs peering 64 pgs stuck inactive 64 pgs stuck unclean 1411 requests are blocked > 32 sec mds0: Behind on trimming (93/30) monmap e1: 3 mons at {ceph001=10.141.8.180:6789/0,ceph002=10.141.8.181:6789/0,ceph003=10.141.8.182:6789/0} election epoch 8, quorum 0,1,2 ceph001,ceph002,ceph003 mdsmap e25: 1/1/1 up {0=ceph002=up:active}, 2 up:standby osdmap e155: 48 osds: 39 up, 39 in pgmap v1648: 1344 pgs, 4 pools, 113 GB data, 53063 objects 212 GB used, 99633 GB / 99846 GB avail 1280 active+clean 64 down+peering [root@ceph002 ~]# 
ceph osd tree # id weight type name up/down reweight -12 23.44 root default-cache -9 7.812 host ceph001-cache 1 3.906 osd.1 up 1 3 3.906 osd.3 up 1 -10 7.812 host ceph002-cache 17 3.906 osd.17 up 1 19 3.906 osd.19 up 1 -11 7.812 host ceph003-cache 33 3.906 osd.33 up 1 35 3.906 osd.35 up 1 -8 140.6 root default-ec -5 46.87 host ceph001-ec 4 3.906 osd.4 up 1 5 3.906 osd.5 up 1 6 3.906 osd.6 up 1 7 3.906 osd.7 up 1 8 3.906 osd.8 up 1 9 3.906 osd.9 up 1 10 3.906 osd.10 up 1 11 3.906 osd.11 up 1 12 3.906 osd.12 up 1 13 3.906 osd.13 down 0 14 3.906 osd.14 up 1 15 3.906 osd.15 up 1 -6 46.87 host ceph002-ec 20 3.906 osd.20 up 1 21 3.906 osd.21 down 0 22 3.906 osd.22 up 1 23 3.906 osd.23 up 1 24 3.906 osd.24 up 1 25 3.906 osd.25 up 1 26 3.906 osd.26 down 0 27 3.906 osd.27 up 1 28 3.906 osd.28 up 1 29 3.906 osd.29 down 0 30 3.906 osd.30 up 1 31 3.906 osd.31 up 1 -7 46.87 host ceph003-ec 36 3.906 osd.36 down 0 37 3.906 osd.37 up 1 38 3.906 osd.38 up 1 39 3.906 osd.39 down 0 40 3.906 osd.40 up 1 41 3.906 osd.41 down 0 42 3.906 osd.42 down 0 43 3.906 osd.43 down 0 44 3.906 osd.44 up 1 45 3.906 osd.45 up 1 46 3.906 osd.46 up 1 47 3.906 osd.47 up 1 -4 23.44 root default-ssd -1 7.812 host ceph001-ssd 0 3.906 osd.0 up 1 2 3.906 osd.2 up 1 -2 7.812 host ceph002-ssd 16 3.906 osd.16 up 1 18 3.906 osd.18 up 1 -3 7.812 host ceph003-ssd 32 3.906 osd.32 up 1 34 3.906 osd.34 up 1
I don't find any dump in the logs.