Actions
Bug #2267
Closed: Ceph client crashed after shutting down one MDS and one OSD
% Done:
0%
Source:
Development
Tags:
Backport:
Regression:
Severity:
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
Ceph version: 0.44.1-1~bpo70+1
Kernel version: 3.2.12-1
Ceph config:
[global]
auth supported = cephx
keyring = /srv/ceph/keyring.admin
[mon]
mon data = /srv/ceph/mon
[mon.n3c1]
host = n3c1
mon addr = 1.1.1.1:6789
[mon.n8c1]
host = n8c1
mon addr = 2.2.2.2:6789
[mon.n4c1]
host = n4c1
mon addr = 3.3.3.3:6789
[mds]
debug mds = 1
keyring = /srv/ceph/ceph-stage2/keyring.$name
[mds.n3c1]
host = n3c1
mds standby replay = true
mds standby for name = n4c1
[mds.n4c1]
host = n4c1
[osd]
osd data = /srv/ceph/$name
osd journal = /srv/ceph/$name.journal
osd journal size = 1000
filestore btrfs snap = 0
keyring = /srv/ceph/ceph-stage2/keyring.$name
debug osd = 1
[osd.1]
host = n3c1
[osd.0]
host = n4c1
[961187.404239] libceph: osd1 1.1.1.1:6801 socket closed [961187.405540] libceph: osd1 1.1.1.1:6801 connect authorization failure [962088.110608] libceph: osd1 1.1.1.1:6801 socket closed [962088.112420] libceph: osd1 1.1.1.1:6801 connect authorization failure [962989.108250] libceph: osd1 1.1.1.1:6801 socket closed [963889.402235] libceph: osd1 1.1.1.1:6801 socket closed [963889.403419] libceph: osd1 1.1.1.1:6801 connect authorization failure [965619.703634] libceph: osd1 1.1.1.1:6801 socket closed [966519.993750] libceph: osd1 1.1.1.1:6801 socket closed [966519.995090] libceph: osd1 1.1.1.1:6801 connect authorization failure [967421.107795] libceph: osd1 1.1.1.1:6801 socket closed [968321.397794] libceph: osd1 1.1.1.1:6801 socket closed [968321.399122] libceph: osd1 1.1.1.1:6801 connect authorization failure [972358.945257] libceph: osd1 1.1.1.1:6801 socket closed [991643.078884] libceph: osd1 1.1.1.1:6801 socket closed [991643.080226] libceph: osd1 1.1.1.1:6801 connect authorization failure [1033201.439727] libceph: mon2 1.1.1.1:6804 socket closed [1033201.439737] libceph: mon2 1.1.1.1:6804 session lost, hunting for new mon [1033201.443627] libceph: mon0 2.2.2.2:6789 session established [1033203.977343] libceph: osd1 1.1.1.1:6801 socket closed [1033203.978001] libceph: osd1 1.1.1.1:6801 connection failed [1033204.816786] libceph: osd1 1.1.1.1:6801 connection failed [1033205.816636] libceph: osd1 1.1.1.1:6801 connection failed [1033207.820595] libceph: osd1 1.1.1.1:6801 connection failed [1033211.824574] libceph: osd1 1.1.1.1:6801 connection failed [1033233.190734] libceph: osd1 down [1033338.949396] libceph: osd1 up [1033503.993763] libceph: osd1 1.1.1.1:6801 socket closed [1033503.994352] libceph: osd1 1.1.1.1:6801 connection failed [1033504.816523] libceph: osd1 1.1.1.1:6801 connection failed [1033505.816596] libceph: osd1 1.1.1.1:6801 connection failed [1033507.820580] libceph: osd1 1.1.1.1:6801 connection failed [1033511.824589] libceph: osd1 1.1.1.1:6801 
connection failed [1033534.730898] libceph: osd1 down [1033540.401857] libceph: get_reply unknown tid 31773 from osd0 [1034382.461077] libceph: osd1 up [1034382.768772] BUG: unable to handle kernel NULL pointer dereference at 0000000000000048 [1034382.768785] IP: [<ffffffffa02a1ec8>] con_work+0x14da/0x1d21 [libceph] [1034382.768798] PGD 2df5067 PUD 2ca0067 PMD 0 [1034382.768805] Oops: 0000 [#1] SMP [1034382.768811] CPU 0 [1034382.768813] Modules linked in: rmd160 sha1_ssse3 sha1_generic hmac crypto_null camellia lzo cast6 cast5 deflate zlib_deflate cts ctr gcm ccm serpent blowfish_generic blowfish_x86_64 blowfish_common twofish_generic twofish_x86_64_3way twofish_x86_64 twofish_common ecb xcbc sha256_generic sha512_generic des_generic xfrm_user ah6 ah4 esp6 esp4 xfrm4_mode_beet xfrm4_tunnel tunnel4 xfrm4_mode_tunnel xfrm4_mode_transport xfrm6_mode_transport xfrm6_mode_ro xfrm6_mode_beet xfrm6_mode_tunnel ipcomp ipcomp6 xfrm_ipcomp xfrm6_tunnel tunnel6 rng_core af_key ip6table_filter ip6_tables iptable_filter ip_tables x_tables xfs cryptd aes_x86_64 aes_generic cbc rbd libceph crc32c libcrc32c loop ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi scsi_mod nfsd nfs lockd fscache auth_rpcgss nfs_acl sunrpc ext3 jbd fuse evdev snd_pcm snd_timer snd soundcore snd_page_alloc pcspkr ext4 mbcache jbd2 crc16 xen_blkfront xen_netfront [1034382.768933] [1034382.768937] Pid: 26554, comm: kworker/0:0 Not tainted 3.2.0-2-amd64 #1 [1034382.768943] RIP: e030:[<ffffffffa02a1ec8>] [<ffffffffa02a1ec8>] con_work+0x14da/0x1d21 [libceph] [1034382.768952] RSP: e02b:ffff88007c15bdb0 EFLAGS: 00010246 [1034382.768956] RAX: 0000000000000000 RBX: ffff88007bf9a030 RCX: 0000000000000080 [1034382.768962] RDX: 000000000007f000 RSI: ffff88007b800620 RDI: ffff88007b800618 [1034382.768967] RBP: ffffea00017fe410 R08: 0000000000000000 R09: 0000000000000258 [1034382.768971] R10: 96206c705f02e221 R11: 96206c705f02e221 R12: 
ffff88007b800580 [1034382.768976] R13: 0000000000000000 R14: 0000000000000000 R15: ffff88007bf9a420 [1034382.768985] FS: 00007fa421c4b7a0(0000) GS:ffff88007ff8b000(0000) knlGS:0000000000000000 [1034382.768990] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b [1034382.772572] CR2: 0000000000000048 CR3: 000000000306a000 CR4: 0000000000002660 [1034382.772572] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1034382.772572] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [1034382.772572] Process kworker/0:0 (pid: 26554, threadinfo ffff88007c15a000, task ffff88007c836f60) [1034382.772572] Stack: [1034382.772572] 0000000000000000 ffffffff81003129 ffffffff81006c3f ffff88007c836f60 [1034382.772572] ffff88007bf9a058 ffffffff00080000 0000000000001000 0000000000080000 [1034382.772572] ffff88007bf9a1a8 ffff88007bf9a308 ffff88007a9d3e00 ffff880000080000 [1034382.772572] Call Trace: [1034382.772572] [<ffffffff81003129>] ? xen_end_context_switch+0xe/0x1c [1034382.772572] [<ffffffff81006c3f>] ? xen_restore_fl_direct_reloc+0x4/0x4 [1034382.772572] [<ffffffffa02a09ee>] ? read_partial_message_section.isra.17+0x74/0x74 [libceph] [1034382.772572] [<ffffffff8105ae4d>] ? process_one_work+0x163/0x284 [1034382.772572] [<ffffffff8105be15>] ? worker_thread+0xc2/0x145 [1034382.772572] [<ffffffff8105bd53>] ? manage_workers.isra.23+0x15b/0x15b [1034382.772572] [<ffffffff8105ef51>] ? kthread+0x76/0x7e [1034382.772572] [<ffffffff81348834>] ? kernel_thread_helper+0x4/0x10 [1034382.772572] [<ffffffff813468f3>] ? int_ret_from_sys_call+0x7/0x1b [1034382.772572] [<ffffffff81341a3c>] ? retint_restore_args+0x5/0x6 [1034382.772572] [<ffffffff81348830>] ? 
gs_change+0x13/0x13 [1034382.772572] Code: 89 ef e8 d7 df ff ff e9 97 00 00 00 49 83 bc 24 90 00 00 00 00 74 4c 4d 63 84 24 a0 00 00 00 49 8b 84 24 98 00 00 00 49 c1 e0 04 <4c> 03 40 48 31 c0 45 85 ed 49 8b 28 41 8b 48 0c 74 20 48 89 ef [1034382.772572] RIP [<ffffffffa02a1ec8>] con_work+0x14da/0x1d21 [libceph] [1034382.772572] RSP <ffff88007c15bdb0> [1034382.772572] CR2: 0000000000000048 [1034382.772572] ---[ end trace 3a0fd4b4c9915722 ]--- [1034382.780594] BUG: unable to handle kernel paging request at fffffffffffffff8 [1034382.780603] IP: [<ffffffff8105f16a>] kthread_data+0x7/0xc [1034382.780610] PGD 1607067 PUD 1608067 PMD 0 [1034382.780616] Oops: 0000 [#2] SMP [1034382.780621] CPU 0 [1034382.780623] Modules linked in: rmd160 sha1_ssse3 sha1_generic hmac crypto_null camellia lzo cast6 cast5 deflate zlib_deflate cts ctr gcm ccm serpent blowfish_generic blowfish_x86_64 blowfish_common twofish_generic twofish_x86_64_3way twofish_x86_64 twofish_common ecb xcbc sha256_generic sha512_generic des_generic xfrm_user ah6 ah4 esp6 esp4 xfrm4_mode_beet xfrm4_tunnel tunnel4 xfrm4_mode_tunnel xfrm4_mode_transport xfrm6_mode_transport xfrm6_mode_ro xfrm6_mode_beet xfrm6_mode_tunnel ipcomp ipcomp6 xfrm_ipcomp xfrm6_tunnel tunnel6 rng_core af_key ip6table_filter ip6_tables iptable_filter ip_tables x_tables xfs cryptd aes_x86_64 aes_generic cbc rbd libceph crc32c libcrc32c loop ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi scsi_mod nfsd nfs lockd fscache auth_rpcgss nfs_acl sunrpc ext3 jbd fuse evdev snd_pcm snd_timer snd soundcore snd_page_alloc pcspkr ext4 mbcache jbd2 crc16 xen_blkfront xen_netfront [1034382.780744] [1034382.780748] Pid: 26554, comm: kworker/0:0 Tainted: G D 3.2.0-2-amd64 #1 [1034382.780754] RIP: e030:[<ffffffff8105f16a>] [<ffffffff8105f16a>] kthread_data+0x7/0xc [1034382.780761] RSP: e02b:ffff88007c15ba80 EFLAGS: 00010002 [1034382.780765] RAX: 0000000000000000 RBX: ffff88007ff9e540 RCX: 
0000000000000000 [1034382.780771] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88007c836f60 [1034382.780777] RBP: 0000000000000000 R08: 0000000000000400 R09: 0720072007200720 [1034382.780782] R10: dead000000200200 R11: 0720072007200720 R12: ffff88007c15bb40 [1034382.780787] R13: ffff88007d367510 R14: 0000000000000000 R15: ffff88007c837258 [1034382.780795] FS: 00007fa421c4b7a0(0000) GS:ffff88007ff8b000(0000) knlGS:0000000000000000 [1034382.780800] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b [1034382.780805] CR2: fffffffffffffff8 CR3: 000000000306a000 CR4: 0000000000002660 [1034382.780811] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1034382.780817] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [1034382.780822] Process kworker/0:0 (pid: 26554, threadinfo ffff88007c15a000, task ffff88007c836f60) [1034382.780827] Stack: [1034382.780830] ffffffff8105c141 ffff88007ff9e540 ffff88007c836f60 ffff88007c15bb40 [1034382.780838] ffffffff8133ff50 0000000000000200 ffff88007ff88b18 0000000000013540 [1034382.780847] ffff88007c15bfd8 ffff88007c15bfd8 ffff88007c836f60 ffffffff8109457f [1034382.780855] Call Trace: [1034382.780860] [<ffffffff8105c141>] ? wq_worker_sleeping+0xb/0x6f [1034382.784413] [<ffffffff8133ff50>] ? __schedule+0x138/0x5c3 [1034382.784413] [<ffffffff8109457f>] ? arch_local_irq_restore+0x7/0x8 [1034382.784413] [<ffffffff81048850>] ? release_task+0x31b/0x331 [1034382.784413] [<ffffffff8103642f>] ? should_resched+0x5/0x23 [1034382.784413] [<ffffffff81049fa2>] ? do_exit+0x730/0x732 [1034382.784413] [<ffffffff81070747>] ? arch_local_irq_restore+0x7/0x8 [1034382.784413] [<ffffffff8134254e>] ? oops_end+0xb1/0xb6 [1034382.784413] [<ffffffff8133a958>] ? no_context+0x1ff/0x20e [1034382.784413] [<ffffffff81040e60>] ? find_busiest_group+0x1f5/0x805 [1034382.784413] [<ffffffff81344559>] ? do_page_fault+0x1a8/0x337 [1034382.784413] [<ffffffff8127602a>] ? sock_no_sendpage+0x88/0x95 [1034382.784413] [<ffffffff812b62fb>] ? 
tcp_sendpage+0x47/0x418 [1034382.784413] [<ffffffff81341cb5>] ? page_fault+0x25/0x30 [1034382.784413] [<ffffffffa02a1ec8>] ? con_work+0x14da/0x1d21 [libceph] [1034382.784413] [<ffffffff81003129>] ? xen_end_context_switch+0xe/0x1c [1034382.784413] [<ffffffff81006c3f>] ? xen_restore_fl_direct_reloc+0x4/0x4 [1034382.784413] [<ffffffffa02a09ee>] ? read_partial_message_section.isra.17+0x74/0x74 [libceph] [1034382.784413] [<ffffffff8105ae4d>] ? process_one_work+0x163/0x284 [1034382.784413] [<ffffffff8105be15>] ? worker_thread+0xc2/0x145 [1034382.784413] [<ffffffff8105bd53>] ? manage_workers.isra.23+0x15b/0x15b [1034382.784413] [<ffffffff8105ef51>] ? kthread+0x76/0x7e [1034382.784413] [<ffffffff81348834>] ? kernel_thread_helper+0x4/0x10 [1034382.784413] [<ffffffff813468f3>] ? int_ret_from_sys_call+0x7/0x1b [1034382.784413] [<ffffffff81341a3c>] ? retint_restore_args+0x5/0x6 [1034382.784413] [<ffffffff81348830>] ? gs_change+0x13/0x13 [1034382.784413] Code: 3f 48 c1 e5 03 48 c1 e0 06 48 8d b0 80 5f 40 81 48 29 ee e8 ec 35 fe ff 81 4b 14 00 00 00 04 41 59 5b 5d c3 48 8b 87 a0 02 00 00 <48> 8b 40 f8 c3 48 3b 3d 72 8f 72 00 75 08 0f bf 87 6a 06 00 00 [1034382.784413] RIP [<ffffffff8105f16a>] kthread_data+0x7/0xc [1034382.784413] RSP <ffff88007c15ba80> [1034382.784413] CR2: fffffffffffffff8 [1034382.784413] ---[ end trace 3a0fd4b4c9915723 ]--- [1034382.784413] Fixing recursive fault but reboot is needed!
Files
Actions