Bug #58670
closedsegfault due to race condition between timeout handler calling close() and the call of async_shutdown()
100%
Description
We experienced segfault crashes with radosgw with the following crash report that contains the backtrace:
- ceph crash info 2023-02-08T10:02:13.175703Z_320a8451-ecde-4083-9b48-8b243e914d66 {
"archived": "2023-02-08 13:50:38.168408",
"backtrace": [
"(()+0x43090) [0x7f584efb8090]",
"(boost::asio::detail::reactive_socket_service_base::start_op(boost::asio::detail::reactive_socket_service_base::base_implementation_type&, int, boost::asio::detail::reactor_op*, bool, bool, bool)+0x125) [0x7f584f4e27b5]",
"(void boost::asio::detail::reactive_socket_service_base::async_receive<boost::asio::mutable_buffers_1, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> >(boost::asio::detail::reactive_socket_service_base::base_implementation_type&, boost::asio::mutable_buffers_1 const&, int, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >&, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> const&)+0x1a3) [0x7f584f4f4123]",
"(boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >::operator()(boost::system::error_code, unsigned long, int)+0x495) [0x7f584f4fb8a5]",
"(boost::asio::detail::executor_op<boost::asio::detail::binder2<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::system::error_code, unsigned long>, std::allocator<void>, boost::asio::detail::scheduler_operation>::do_complete(void*, boost::asio::detail::scheduler_operation*, boost::system::error_code const&, unsigned long)+0xec) [0x7f584f4fc26c]",
"(boost::asio::detail::strand_executor_service::invoker<boost::asio::io_context::executor_type const>::operator()()+0x8d) [0x7f584f4f2d9d]",
"(void boost::asio::io_context::executor_type::dispatch<boost::asio::detail::strand_executor_service::invoker<boost::asio::io_context::executor_type const>, std::allocator<void> >(boost::asio::detail::strand_executor_service::invoker<boost::asio::io_context::executor_type const>&&, std::allocator<void> const&) const+0x9c) [0x7f584f4f2f9c]",
"(void boost::asio::detail::strand_executor_service::dispatch<boost::asio::io_context::executor_type const, boost::asio::detail::binder2<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::system::error_code, unsigned long>, std::allocator<void> >(std::shared_ptr<boost::asio::detail::strand_executor_service::strand_impl> const&, boost::asio::io_context::executor_type const&, boost::asio::detail::binder2<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::system::error_code, unsigned long>&&, std::allocator<void> const&)+0x197) [0x7f584f4fbee7]",
"(boost::asio::detail::reactive_socket_recv_op<boost::asio::mutable_buffers_1, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> >::do_complete(void, boost::asio::detail::scheduler_operation*, boost::system::error_code const&, unsigned long)+0xff) [0x7f584f4fc09f]",
"(boost::asio::detail::scheduler::run(boost::system::error_code&)+0x4f6) [0x7f584f4e8676]",
"(()+0x361ac6) [0x7f584f4c8ac6]",
"(()+0xd6de4) [0x7f5845a2bde4]",
"(()+0x8609) [0x7f5845e17609]",
"(clone()+0x43) [0x7f584f094133]"
],
"ceph_version": "15.2.17",
"crash_id": "2023-02-08T10:02:13.175703Z_320a8451-ecde-4083-9b48-8b243e914d66",
"entity_name": "client.rgw.rgw12-atl1",
"os_id": "ubuntu",
"os_name": "Ubuntu",
"os_version": "20.04.5 LTS (Focal Fossa)",
"os_version_id": "20.04",
"process_name": "radosgw",
"stack_sig": "edd084a0c95d1274f15927122091c26292df3954d550aca6fe3176a2d095ee93",
"timestamp": "2023-02-08T10:02:13.175703Z",
"utsname_machine": "x86_64",
"utsname_release": "5.4.0-80-generic",
"utsname_sysname": "Linux",
"utsname_version": "#90-Ubuntu SMP Fri Jul 9 22:49:44 UTC 2021"
}
Investigation through gdb on the core file shows more detailed stack info:
#0 __GI_raise (sig=sig@entry=11) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007f63c0c4758e in reraise_fatal (signum=11) at ./src/global/signal_handler.cc:326
#2 handle_fatal_signal (signum=11) at ./src/global/signal_handler.cc:326
#3 <signal handler called>
#4 boost::asio::detail::epoll_reactor::start_op (allow_speculative=<optimized out>, is_continuation=true, op=0x7f63646056e0, descriptor_data=@0x7f6367419148: 0x0, descriptor=<optimized out>, op_type=0, this=0x564c33642090)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/impl/epoll_reactor.ipp:245
#5 boost::asio::detail::reactive_socket_service_base::start_op (noop=false, is_non_blocking=<optimized out>, is_continuation=true, op=0x7f63646056e0, op_type=0, impl=..., this=<optimized out>)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/impl/reactive_socket_service_base.ipp:246
#6 boost::asio::detail::reactive_socket_service_base::start_op (this=<optimized out>, impl=..., op_type=0, op=0x7f63646056e0, is_continuation=<optimized out>, is_non_blocking=<optimized out>, noop=false)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/impl/reactive_socket_service_base.ipp:235
#7 0x00007f63c04f3123 in boost::asio::detail::reactive_socket_service_base::async_receive<boost::asio::mutable_buffers_1, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> > (this=this@entry=0x564c338a5bb8, impl=..., buffers=..., flags=flags@entry=0, handler=..., io_ex=...) at ./obj-x86_64-linux-gnu/boost/include/boost/asio/buffer.hpp:136
#8 0x00007f63c04fa8a5 in boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>::initiate_async_receive::operator()<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::mutable_buffers_1> (this=<optimized out>, flags=0, buffers=..., handler=...) at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/io_object_impl.hpp:144
#9 boost::asio::async_result<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, void (boost::system::error_code, unsigned long)>::initiate<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>::initiate_async_receive, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::mutable_buffers_1 const&, int>(boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>::initiate_async_receive&&, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void (*)(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >&&, boost::asio::mutable_buffers_1 const&, int&&) (token=..., initiation=...)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/async_result.hpp:151
The request that causes the crash is always a request that just experienced a write/read timeout. Further investigation shows the timeout handler is calling close() in a different thread than the request coroutine that calls async_shutdown() after handle_connection(). It causes the process to access the same socket from different threads, which is always dangerous.
The crash can be reproduced with the following steps:- Reduce the request timeout from 65s to 2-5s to trigger timeout more easily.
- Reduce the bandwidth of HTTPS traffic and increase its latency and loss to make download slower.
- Make the client to use "Connection: close" header.
Then, a flood of download requests from the client could trigger the segfault about 1 hour (I used 300 s3cmd running in parallel and downloading the same 3MB object repetitively via HTTPS).
Updated by Yixin Jin about 1 year ago
I found out that by replacing the socket close() call in the timeout handler with socket cancel() followed by socket shutdown(), this crash goes away. Probably because both calls don't reset descriptor_data pointer in boost::asio, which was the cause of the segfault (accessing null pointer). It also properly wakes up the pending ops and disables the network functionality so async_shutdown() can exit gracefully.
Updated by Casey Bodley about 1 year ago
- Priority changed from Normal to High
- Tags set to ssl beast timeout
- Backport set to pacific quincy
Updated by Casey Bodley about 1 year ago
- Status changed from New to Triaged
Yixin Jin wrote:
I found out that by replacing the socket close() call in the timeout handler with socket cancel() followed by socket shutdown(), this crash goes away. Probably because both calls don't reset descriptor_data pointer in boost::asio, which was the cause of the segfault (accessing null pointer). It also properly wakes up the pending ops and disables the network functionality so async_shutdown() can exit gracefully.
@Yixin can you please submit a pull request with this fix?
Updated by Yixin Jin about 1 year ago
Casey Bodley wrote:
Yixin Jin wrote:
I found out that by replacing the socket close() call in the timeout handler with socket cancel() followed by socket shutdown(), this crash goes away. Probably because both calls don't reset descriptor_data pointer in boost::asio, which was the cause of the segfault (accessing null pointer). It also properly wakes up the pending ops and disables the network functionality so async_shutdown() can exit gracefully.
@Yixin can you please submit a pull request with this fix?
I just created a PR for it. https://github.com/ceph/ceph/pull/50131. Please help review it.
Updated by Casey Bodley about 1 year ago
- Status changed from Triaged to Fix Under Review
- Pull request ID set to 50131
Updated by J. Eric Ivancich about 1 year ago
- Status changed from Fix Under Review to Pending Backport
Updated by Backport Bot about 1 year ago
- Copied to Backport #58771: pacific: segfault due to race condition between timeout handler calling close() and the call of async_shutdown() added
Updated by Backport Bot about 1 year ago
- Copied to Backport #58772: quincy: segfault due to race condition between timeout handler calling close() and the call of async_shutdown() added
Updated by Backport Bot about 1 year ago
- Tags changed from ssl beast timeout to ssl beast timeout backport_processed
Updated by Konstantin Shalygin about 1 year ago
- Status changed from Pending Backport to Resolved
- Target version set to v18.0.0
- % Done changed from 0 to 100