Actions
Bug #3539
closednull deref at con_work+0x257d/0x2f40 after osd reset under ffsb workload
Status:
Rejected
Priority:
Urgent
Assignee:
-
Category:
libceph
Target version:
-
% Done:
0%
Source:
Development
Tags:
Backport:
Regression:
Severity:
Reviewed:
Affected Versions:
ceph-qa-suite:
Crash signature (v1):
Crash signature (v2):
Description
... <3>[ 3410.997126] libceph: osd0 10.214.132.35:6800 socket closed <6>[ 3433.257736] libceph: osd5 weight 0x10000 (in) <6>[ 3454.910103] libceph: osd3 weight 0x0 (out) <6>[ 3454.937907] libceph: osd4 weight 0x0 (out) <6>[ 3454.964732] libceph: osd1 weight 0x10000 (in) <3>[ 3454.992591] libceph: osd1 10.214.132.35:6806 socket closed <3>[ 3455.154736] libceph: osd0 10.214.132.35:6800 socket closed <6>[ 3456.933836] libceph: osd0 weight 0x0 (out) <6>[ 3465.723826] libceph: osd1 weight 0x0 (out) <3>[ 3465.875335] libceph: osd5 10.214.131.7:6806 socket closed <6>[ 3471.967466] libceph: osd4 weight 0x10000 (in) <3>[ 3472.026662] libceph: osd4 10.214.131.7:6803 socket closed <3>[ 3472.116364] libceph: osd2 10.214.132.35:6803 socket closed <3>[ 3476.738895] libceph: osd5 10.214.131.7:6806 socket closed <6>[ 3478.016208] libceph: osd0 weight 0x10000 (in) <3>[ 3478.139397] libceph: osd5 10.214.131.7:6806 socket closed <3>[ 3478.182630] libceph: osd0 10.214.132.35:6800 socket closed <6>[ 3494.220851] libceph: osd3 weight 0x10000 (in) <3>[ 3494.275514] libceph: osd5 10.214.131.7:6806 socket closed <3>[ 3494.397314] libceph: osd4 10.214.131.7:6803 socket closed <3>[ 3494.447277] libceph: osd3 10.214.131.7:6800 socket closed <6>[ 3506.998261] libceph: osd1 weight 0x10000 (in) <3>[ 3507.146202] libceph: osd4 10.214.131.7:6803 socket closed <3>[ 3507.195774] libceph: osd2 10.214.132.35:6803 socket closed <3>[ 3507.276069] libceph: osd1 10.214.132.35:6806 socket closed <6>[ 3510.964155] libceph: osd0 weight 0x0 (out) <3>[ 3511.099348] libceph: osd4 10.214.131.7:6803 socket closed <6>[ 3520.690627] libceph: osd0 weight 0x10000 (in) <3>[ 3520.831057] libceph: osd0 10.214.132.35:6800 socket closed <3>[ 3520.876162] libceph: osd3 10.214.131.7:6800 socket closed <3>[ 3520.922352] libceph: osd4 10.214.131.7:6803 socket closed <3>[ 3521.009618] libceph: osd5 10.214.131.7:6806 socket closed <6>[ 3531.668446] libceph: osd5 weight 0x0 (out) <3>[ 3531.852570] libceph: osd0 10.214.132.35:6800 socket closed <3>[ 3531.928636] libceph: osd2 10.214.132.35:6803 socket closed <4>[ 3684.939045] libceph: tid 52546 timed out on osd0, will reset osd <4>[ 3684.971008] libceph: tid 126504 timed out on osd1, will reset osd <4>[ 3690.000269] libceph: tid 151354 timed out on osd3, will reset osd <4>[ 3690.029860] libceph: tid 216257 timed out on osd4, will reset osd <4>[ 3690.055857] libceph: tid 247646 timed out on osd2, will reset osd <4>[ 3750.125794] libceph: tid 234232 timed out on osd1, will reset osd <4>[ 3750.151366] libceph: tid 97066 timed out on osd0, will reset osd <4>[ 3760.178332] libceph: tid 303684 timed out on osd3, will reset osd <6>[ 3815.064352] libceph: osd0 weight 0x0 (out) <3>[ 3817.570458] libceph: osd4 10.214.131.7:6803 socket closed <6>[ 3829.814102] libceph: osd4 weight 0x0 (out) <3>[ 3836.974889] INFO: task ffsb:30832 blocked for more than 120 seconds. <3>[ 3836.999808] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3837.046152] ffsb D ffffffff8112a200 0 30832 29158 0x00000000 <4>[ 3837.074848] ffff880215a618e8 0000000000000046 ffff880215a618b8 0000000000000046 <4>[ 3837.125544] ffff88020b611f60 ffff880215a61fd8 ffff880215a61fd8 ffff880215a61fd8 <4>[ 3837.180312] ffff88020b869f60 ffff88020b611f60 ffff880215a618e8 ffff880227274e50 <4>[ 3837.238382] Call Trace: <4>[ 3837.265963] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3837.297354] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3837.328309] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3837.359964] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3837.392205] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3837.425116] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3837.457288] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3837.491780] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3837.524943] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3837.559554] [<ffffffffa04ffeba>] ? __touch_cap+0x4a/0x100 [ceph] <4>[ 3837.594185] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3837.629619] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3837.666331] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3837.703173] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3837.739450] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3837.775485] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3837.811498] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3837.847415] [<ffffffff810848a8>] ? finish_task_switch+0x48/0x110 <4>[ 3837.883400] [<ffffffff810b393d>] ? trace_hardirqs_on+0xd/0x10 <4>[ 3837.918677] [<ffffffff810848e5>] ? finish_task_switch+0x85/0x110 <4>[ 3837.953805] [<ffffffff810848a8>] ? finish_task_switch+0x48/0x110 <4>[ 3837.987692] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3838.020451] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3838.052375] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3838.083079] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3838.114194] INFO: lockdep is turned off. <3>[ 3838.142584] INFO: task ffsb:30833 blocked for more than 120 seconds. <3>[ 3838.173271] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3838.229233] ffsb D ffffffff8112a200 0 30833 29158 0x00000000 <4>[ 3838.261756] ffff88020d0538e8 0000000000000046 ffff88020d0538b8 0000000000000046 <4>[ 3838.320603] ffff88020b613ec0 ffff88020d053fd8 ffff88020d053fd8 ffff88020d053fd8 <4>[ 3838.382697] ffff880225d29f60 ffff88020b613ec0 ffff88020d0538e8 ffff8802272d4e50 <4>[ 3838.447444] Call Trace: <4>[ 3838.478090] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3838.512414] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3838.545889] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3838.579439] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3838.612691] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3838.646625] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3838.679889] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3838.714829] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3838.748537] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3838.783673] [<ffffffffa04ffeba>] ? __touch_cap+0x4a/0x100 [ceph] <4>[ 3838.818298] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3838.853215] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3838.888925] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3838.925020] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3838.960585] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3838.996143] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3839.032253] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3839.068284] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3839.103273] [<ffffffff8132bfbd>] ? do_raw_spin_unlock+0x5d/0xb0 <4>[ 3839.138058] [<ffffffff816358bb>] ? _raw_spin_unlock+0x2b/0x40 <4>[ 3839.172124] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3839.204965] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3839.236965] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3839.268010] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3839.299472] INFO: lockdep is turned off. <3>[ 3839.328058] INFO: task ffsb:30836 blocked for more than 120 seconds. <3>[ 3839.359106] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3839.414836] ffsb D ffffffff8112a200 0 30836 29158 0x00000000 <4>[ 3839.447187] ffff8802227b78e8 0000000000000046 ffff8802227b78b8 0000000000000046 <4>[ 3839.504996] ffff88020cb75e20 ffff8802227b7fd8 ffff8802227b7fd8 ffff8802227b7fd8 <4>[ 3839.566294] ffff88020b869f60 ffff88020cb75e20 ffff8802227b78e8 ffff880227274e50 <4>[ 3839.630575] Call Trace: <4>[ 3839.661232] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3839.695386] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3839.728755] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3839.762073] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3839.795325] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3839.829006] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3839.861744] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3839.896594] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3839.930254] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3839.965243] [<ffffffff811a5a4d>] ? __mark_inode_dirty+0x14d/0x230 <4>[ 3839.999746] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3840.035102] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3840.071325] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3840.107402] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3840.143254] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3840.178943] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3840.215269] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3840.251470] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3840.286673] [<ffffffff8132bfbd>] ? do_raw_spin_unlock+0x5d/0xb0 <4>[ 3840.321612] [<ffffffff816358bb>] ? _raw_spin_unlock+0x2b/0x40 <4>[ 3840.355806] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3840.388758] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3840.420880] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3840.452108] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3840.483763] INFO: lockdep is turned off. <3>[ 3840.512535] INFO: task ffsb:30837 blocked for more than 120 seconds. <3>[ 3840.543764] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3840.599839] ffsb D 00000000ffffffff 0 30837 29158 0x00000000 <4>[ 3840.632360] ffff880222727e28 0000000000000046 ffff880222727e48 0000000000000246 <4>[ 3840.690424] ffff88020cb71f60 ffff880222727fd8 ffff880222727fd8 ffff880222727fd8 <4>[ 3840.751738] ffff880225d19f60 ffff88020cb71f60 0000000001007120 ffff8801efe34d38 <4>[ 3840.815977] Call Trace: <4>[ 3840.846459] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3840.879980] [<ffffffff816348ee>] schedule_preempt_disabled+0xe/0x10 <4>[ 3840.914999] [<ffffffff81632641>] mutex_lock_nested+0x151/0x320 <4>[ 3840.949379] [<ffffffffa04f9fc5>] ? ceph_llseek+0x45/0x150 [ceph] <4>[ 3840.983942] [<ffffffff8117d51c>] ? fget_light+0x3c/0x130 <4>[ 3841.017403] [<ffffffffa04f9fc5>] ceph_llseek+0x45/0x150 [ceph] <4>[ 3841.051475] [<ffffffff8117b4b3>] vfs_llseek+0x33/0x40 <4>[ 3841.084877] [<ffffffff8117ba36>] sys_lseek+0x76/0x90 <4>[ 3841.117845] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3841.151969] INFO: lockdep is turned off. <3>[ 3841.183703] INFO: task ffsb:30840 blocked for more than 120 seconds. <3>[ 3841.219225] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3841.285969] ffsb D ffffffff8112a200 0 30840 29158 0x00000000 <4>[ 3841.323940] ffff8802184118e8 0000000000000046 ffff8802184118b8 0000000000000046 <4>[ 3841.394041] ffff880107e50000 ffff880218411fd8 ffff880218411fd8 ffff880218411fd8 <4>[ 3841.467022] ffff880225d19f60 ffff880107e50000 ffff8802184118e8 ffff880227254e50 <4>[ 3841.541468] Call Trace: <4>[ 3841.576260] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3841.614459] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3841.651928] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3841.688653] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3841.724694] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3841.760673] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3841.795107] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3841.830609] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3841.865253] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3841.901283] [<ffffffffa04ffeba>] ? __touch_cap+0x4a/0x100 [ceph] <4>[ 3841.936912] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3841.972773] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3842.009404] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3842.045896] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3842.081845] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3842.117645] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3842.154001] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3842.190054] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3842.225063] [<ffffffff8132bfbd>] ? do_raw_spin_unlock+0x5d/0xb0 <4>[ 3842.259905] [<ffffffff816358bb>] ? _raw_spin_unlock+0x2b/0x40 <4>[ 3842.293948] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3842.326727] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3842.358706] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3842.389726] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3842.421147] INFO: lockdep is turned off. <3>[ 3842.449732] INFO: task ffsb:30848 blocked for more than 120 seconds. <3>[ 3842.480763] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3842.536411] ffsb D ffffffff8112a200 0 30848 29158 0x00000000 <4>[ 3842.568678] ffff8802184078e8 0000000000000046 ffff8802184078b8 0000000000000046 <4>[ 3842.626427] ffff880221ff0000 ffff880218407fd8 ffff880218407fd8 ffff880218407fd8 <4>[ 3842.687596] ffff88020b869f60 ffff880221ff0000 ffff8802184078e8 ffff880227274e50 <4>[ 3842.751803] Call Trace: <4>[ 3842.782257] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3842.816351] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3842.849681] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3842.882879] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3842.916026] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3842.949602] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3842.982169] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3843.016839] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3843.050375] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3843.085272] [<ffffffffa04ffeba>] ? __touch_cap+0x4a/0x100 [ceph] <4>[ 3843.119646] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3843.155103] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3843.191238] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3843.227226] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3843.262825] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3843.298400] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3843.334554] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3843.370642] [<ffffffff810848a8>] ? finish_task_switch+0x48/0x110 <4>[ 3843.406449] [<ffffffff810b393d>] ? trace_hardirqs_on+0xd/0x10 <4>[ 3843.441733] [<ffffffff810848e5>] ? finish_task_switch+0x85/0x110 <4>[ 3843.476864] [<ffffffff810848a8>] ? finish_task_switch+0x48/0x110 <4>[ 3843.510762] [<ffffffff81325d5e>] ? trace_hardirqs_on_thunk+0x3a/0x3f <4>[ 3843.544831] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3843.577365] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3843.608700] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3843.638952] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3843.669473] INFO: lockdep is turned off. <3>[ 3843.697118] INFO: task ffsb:30849 blocked for more than 120 seconds. <3>[ 3843.727205] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3843.783525] ffsb D ffffffff8112a200 0 30849 29158 0x00000000 <4>[ 3843.816293] ffff88012b4038e8 0000000000000046 ffff88012b4038b8 0000000000000046 <4>[ 3843.875734] ffff88022466de20 ffff88012b403fd8 ffff88012b403fd8 ffff88012b403fd8 <4>[ 3843.938200] ffff8801bd2a8000 ffff88022466de20 ffff88012b4038e8 ffff880227234e50 <4>[ 3844.002871] Call Trace: <4>[ 3844.033552] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3844.067852] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3844.101336] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3844.134900] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3844.168119] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3844.202379] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3844.235753] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3844.270754] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3844.304528] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3844.339702] [<ffffffffa04ffeba>] ? __touch_cap+0x4a/0x100 [ceph] <4>[ 3844.374439] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3844.409397] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3844.445787] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3844.481987] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3844.517565] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3844.553173] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3844.589312] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3844.625365] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3844.660439] [<ffffffff8132bfbd>] ? do_raw_spin_unlock+0x5d/0xb0 <4>[ 3844.695278] [<ffffffff816358bb>] ? _raw_spin_unlock+0x2b/0x40 <4>[ 3844.729361] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3844.762182] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3844.794219] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3844.825313] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3844.856838] INFO: lockdep is turned off. <3>[ 3844.885450] INFO: task ffsb:30855 blocked for more than 120 seconds. <3>[ 3844.916482] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3844.972298] ffsb D ffffffff8112a200 0 30855 29158 0x00000000 <4>[ 3845.004657] ffff88020b9cf8e8 0000000000000046 ffff88020b9cf8b8 0000000000000046 <4>[ 3845.062674] ffff880223850000 ffff88020b9cffd8 ffff88020b9cffd8 ffff88020b9cffd8 <4>[ 3845.124131] ffff880225d1bec0 ffff880223850000 ffff88020b9cf8e8 ffff880227274e50 <4>[ 3845.188600] Call Trace: <4>[ 3845.219130] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3845.253381] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3845.286789] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3845.320097] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3845.353450] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3845.387151] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3845.419884] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3845.454805] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3845.488500] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3845.523526] [<ffffffffa04ffeba>] ? __touch_cap+0x4a/0x100 [ceph] <4>[ 3845.558027] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3845.593398] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3845.629728] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3845.665775] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3845.701363] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3845.736980] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3845.773178] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3845.809253] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3845.844364] [<ffffffff8132bfbd>] ? do_raw_spin_unlock+0x5d/0xb0 <4>[ 3845.879219] [<ffffffff816358bb>] ? _raw_spin_unlock+0x2b/0x40 <4>[ 3845.913336] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3845.946217] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3845.978261] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3846.009350] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3846.040915] INFO: lockdep is turned off. <3>[ 3846.069586] INFO: task ffsb:30863 blocked for more than 120 seconds. <3>[ 3846.100707] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. <6>[ 3846.156799] ffsb D ffffffff8112a200 0 30863 29158 0x00000000 <4>[ 3846.189287] ffff8801d68598e8 0000000000000046 ffff8801d68598b8 0000000000000046 <4>[ 3846.247252] ffff880225e40000 ffff8801d6859fd8 ffff8801d6859fd8 ffff8801d6859fd8 <4>[ 3846.308750] ffff880225d18000 ffff880225e40000 ffff8801d68598e8 ffff880227234e50 <4>[ 3846.373164] Call Trace: <4>[ 3846.403711] [<ffffffff8112a200>] ? __lock_page+0x70/0x70 <4>[ 3846.437911] [<ffffffff816345c9>] schedule+0x29/0x70 <4>[ 3846.471328] [<ffffffff8163469f>] io_schedule+0x8f/0xd0 <4>[ 3846.504668] [<ffffffff8112a20e>] sleep_on_page+0xe/0x20 <4>[ 3846.537980] [<ffffffff8163213a>] __wait_on_bit_lock+0x5a/0xc0 <4>[ 3846.571696] [<ffffffff8112a1f7>] __lock_page+0x67/0x70 <4>[ 3846.604439] [<ffffffff810798a0>] ? autoremove_wake_function+0x40/0x40 <4>[ 3846.639491] [<ffffffff8112b1a7>] find_lock_page+0x67/0x80 <4>[ 3846.673170] [<ffffffff8112b22f>] grab_cache_page_write_begin+0x6f/0xf0 <4>[ 3846.708183] [<ffffffffa04ffeba>] ? __touch_cap+0x4a/0x100 [ceph] <4>[ 3846.742688] [<ffffffffa04fea1e>] ceph_write_begin+0x4e/0xd0 [ceph] <4>[ 3846.778052] [<ffffffff8112a6fa>] generic_file_buffered_write+0x11a/0x2a0 <4>[ 3846.814216] [<ffffffff8112beb6>] __generic_file_aio_write+0x1b6/0x3b0 <4>[ 3846.850275] [<ffffffff8112c113>] ? generic_file_aio_write+0x63/0x100 <4>[ 3846.885829] [<ffffffff8112c131>] generic_file_aio_write+0x81/0x100 <4>[ 3846.921398] [<ffffffffa04faf53>] ceph_aio_write+0x883/0xaa0 [ceph] <4>[ 3846.957575] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3846.993584] [<ffffffffa04ffe9f>] ? __touch_cap+0x2f/0x100 [ceph] <4>[ 3847.028567] [<ffffffff8132bfbd>] ? do_raw_spin_unlock+0x5d/0xb0 <4>[ 3847.063353] [<ffffffff816358bb>] ? _raw_spin_unlock+0x2b/0x40 <4>[ 3847.097370] [<ffffffff8117b623>] do_sync_write+0xa3/0xe0 <4>[ 3847.130288] [<ffffffff8117bee3>] vfs_write+0xb3/0x180 <4>[ 3847.162270] [<ffffffff8117c20a>] sys_write+0x4a/0x90 <4>[ 3847.193318] [<ffffffff8163e1e9>] system_call_fastpath+0x16/0x1b <4>[ 3847.224795] INFO: lockdep is turned off. <4>[ 3880.449333] libceph: tid 320266 timed out on osd4, will reset osd <4>[ 3890.481907] libceph: tid 256933 timed out on osd1, will reset osd <1>[ 3890.521849] BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 <1>[ 3890.577501] IP: [<ffffffffa04c3b5d>] con_work+0x257d/0x2f40 [libceph] <4>[ 3890.609137] PGD 0 <4>[ 3890.635983] Oops: 0002 [#1] SMP [0]kdb> bt Stack traceback for pid 30867 0xffff88021d715e20 30867 2 1 0 R 0xffff88021d716280 *kworker/0:3 ffff880108fbbc00 0000000000000018 ffff880108fbbd1c ffff880227214580 0000000000000000 ffffffff81092a73 0000000000000002 ffff88021d715e20 ffff880108fbbcc0 0000000000000046 0000000000000000 ffff880227214580 Call Trace: [<ffffffff81092a73>] ? idle_balance+0x133/0x180 [<ffffffff810848a8>] ? finish_task_switch+0x48/0x110 [<ffffffff810717e0>] ? process_one_work+0x1a0/0x5f0 [<ffffffff81071776>] ? process_one_work+0x136/0x5f0 [<ffffffffa04c15e0>] ? ceph_msg_new+0x2e0/0x2e0 [libceph] [<ffffffff810735fd>] ? worker_thread+0x18d/0x4f0 [<ffffffff81073470>] ? manage_workers+0x320/0x320 [<ffffffff810791fe>] ? kthread+0xae/0xc0 [<ffffffff810b393d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff8163f3c4>] ? kernel_thread_helper+0x4/0x10 [<ffffffff81635880>] ? _raw_spin_unlock_irq+0x30/0x40 [<ffffffff816360b0>] ? retint_restore_args+0x13/0x13 [<ffffffff81079150>] ? flush_kthread_work+0x1a0/0x1a0 [<ffffffff8163f3c0>] ? gs_change+0x13/0x13 [0]kdb>
reproduced ~ 2/15 runs of
kernel: branch: master kdb: true nuke-on-error: true overrides: ceph: branch: wip-mds-ls conf: mds: debug mds: 1/20 fs: btrfs log-whitelist: - slow request roles: - - mon.a - mon.c - osd.0 - osd.1 - osd.2 - - mon.b - mds.a - osd.3 - osd.4 - osd.5 - - client.0 tasks: - chef: null - clock: null - ceph: log-whitelist: - wrongly marked me down - objects unfound and apparently lost - thrashosds: null - kclient: null - workunit: clients: all: - suites/ffsb.sh
Updated by Sage Weil over 11 years ago
if (!msg->middle) return -ENOMEM; 5b44: 41 be f4 ff ff ff mov $0xfffffff4,%r14d 5b4a: e9 b9 dd ff ff jmpq 3908 <con_work+0x328> int middle_len = le32_to_cpu(msg->hdr.middle_len); dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, ceph_msg_type_name(type), middle_len); BUG_ON(!middle_len); BUG_ON(msg->middle); 5b4f: 0f 0b ud2 int type = le16_to_cpu(msg->hdr.type); int middle_len = le32_to_cpu(msg->hdr.middle_len); dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, ceph_msg_type_name(type), middle_len); BUG_ON(!middle_len); 5b51: 0f 0b ud2 ceph_msg_put(msg); return -EAGAIN; } con->in_msg = msg; if (con->in_msg) { con->in_msg->con = con->ops->get(con); 5b53: 0f 0b ud2 static inline int kref_sub(struct kref *kref, unsigned int count, void (*release)(struct kref *kref)) { WARN_ON(release == NULL); if (atomic_sub_and_test((int) count, &kref->refcount)) { 5b55: 49 8d bc 24 90 00 00 lea 0x90(%r12),%rdi 5b5c: 00 5b5d: f0 41 83 ac 24 90 00 lock subl $0x1,0x90(%r12) 5b64: 00 00 01 5b67: 0f 94 c0 sete %al 5b6a: 84 c0 test %al,%al mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); if (con->state != CON_STATE_OPEN) { if (msg) ceph_msg_put(msg); 5b6c: 41 be f5 ff ff ff mov $0xfffffff5,%r14d 5b72: 0f 84 90 dd ff ff je 3908 <con_work+0x328> release(kref); 5b78: e8 00 00 00 00 callq 5b7d <con_work+0x259d> 5b79: R_X86_64_PC32 ceph_msg_last_put+0xfffffffffffffffc 5b7d: e9 86 dd ff ff jmpq 3908 <con_work+0x328> return -EBADMSG; } if (do_datacrc && (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { pr_err("read_partial_message %p data crc %u != exp. %u\n", m, 5b82: 4c 89 e6 mov %r12,%rsi 5b85: 48 c7 c7 00 00 00 00 mov $0x0,%rdi 5b88: R_X86_64_32S .rodata.str1.8+0x1260 5b8c: 31 c0 xor %eax,%eax 5b8e: e8 00 00 00 00 callq 5b93 <con_work+0x25b3> 5b8f: R_X86_64_PC32 printk+0xfffffffffffffffc if (con->in_tag == CEPH_MSGR_TAG_MSG) { ret = read_partial_message(con);
IP is at 0x5b5d (0x257d + 0x35e0)
Updated by Sage Weil over 11 years ago
- Status changed from 12 to Rejected
commit:7246240c7c186542f73af4fadc744d66440f616f is not in master. false alarm!
Actions