Bug #58670
closedsegfault due to race condition between timeout handler calling close() and the call of async_shutdown()
100%
Description
We experienced segfault crashes with radosgw with the following crash report that contains the backtrace:
- ceph crash info 2023-02-08T10:02:13.175703Z_320a8451-ecde-4083-9b48-8b243e914d66 {
"archived": "2023-02-08 13:50:38.168408",
"backtrace": [
"(()+0x43090) [0x7f584efb8090]",
"(boost::asio::detail::reactive_socket_service_base::start_op(boost::asio::detail::reactive_socket_service_base::base_implementation_type&, int, boost::asio::detail::reactor_op*, bool, bool, bool)+0x125) [0x7f584f4e27b5]",
"(void boost::asio::detail::reactive_socket_service_base::async_receive<boost::asio::mutable_buffers_1, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> >(boost::asio::detail::reactive_socket_service_base::base_implementation_type&, boost::asio::mutable_buffers_1 const&, int, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >&, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> const&)+0x1a3) [0x7f584f4f4123]",
"(boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >::operator()(boost::system::error_code, unsigned long, int)+0x495) [0x7f584f4fb8a5]",
"(boost::asio::detail::executor_op<boost::asio::detail::binder2<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::system::error_code, unsigned long>, std::allocator<void>, boost::asio::detail::scheduler_operation>::do_complete(void*, boost::asio::detail::scheduler_operation*, boost::system::error_code const&, unsigned long)+0xec) [0x7f584f4fc26c]",
"(boost::asio::detail::strand_executor_service::invoker<boost::asio::io_context::executor_type const>::operator()()+0x8d) [0x7f584f4f2d9d]",
"(void boost::asio::io_context::executor_type::dispatch<boost::asio::detail::strand_executor_service::invoker<boost::asio::io_context::executor_type const>, std::allocator<void> >(boost::asio::detail::strand_executor_service::invoker<boost::asio::io_context::executor_type const>&&, std::allocator<void> const&) const+0x9c) [0x7f584f4f2f9c]",
"(void boost::asio::detail::strand_executor_service::dispatch<boost::asio::io_context::executor_type const, boost::asio::detail::binder2<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::system::error_code, unsigned long>, std::allocator<void> >(std::shared_ptr<boost::asio::detail::strand_executor_service::strand_impl> const&, boost::asio::io_context::executor_type const&, boost::asio::detail::binder2<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::system::error_code, unsigned long>&&, std::allocator<void> const&)+0x197) [0x7f584f4fbee7]",
"(boost::asio::detail::reactive_socket_recv_op<boost::asio::mutable_buffers_1, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> >::do_complete(void, boost::asio::detail::scheduler_operation*, boost::system::error_code const&, unsigned long)+0xff) [0x7f584f4fc09f]",
"(boost::asio::detail::scheduler::run(boost::system::error_code&)+0x4f6) [0x7f584f4e8676]",
"(()+0x361ac6) [0x7f584f4c8ac6]",
"(()+0xd6de4) [0x7f5845a2bde4]",
"(()+0x8609) [0x7f5845e17609]",
"(clone()+0x43) [0x7f584f094133]"
],
"ceph_version": "15.2.17",
"crash_id": "2023-02-08T10:02:13.175703Z_320a8451-ecde-4083-9b48-8b243e914d66",
"entity_name": "client.rgw.rgw12-atl1",
"os_id": "ubuntu",
"os_name": "Ubuntu",
"os_version": "20.04.5 LTS (Focal Fossa)",
"os_version_id": "20.04",
"process_name": "radosgw",
"stack_sig": "edd084a0c95d1274f15927122091c26292df3954d550aca6fe3176a2d095ee93",
"timestamp": "2023-02-08T10:02:13.175703Z",
"utsname_machine": "x86_64",
"utsname_release": "5.4.0-80-generic",
"utsname_sysname": "Linux",
"utsname_version": "#90-Ubuntu SMP Fri Jul 9 22:49:44 UTC 2021"
}
Investigation through gdb on the core file shows more detailed stack info:
#0 __GI_raise (sig=sig@entry=11) at ../sysdeps/unix/sysv/linux/raise.c:50
#1 0x00007f63c0c4758e in reraise_fatal (signum=11) at ./src/global/signal_handler.cc:326
#2 handle_fatal_signal (signum=11) at ./src/global/signal_handler.cc:326
#3 <signal handler called>
#4 boost::asio::detail::epoll_reactor::start_op (allow_speculative=<optimized out>, is_continuation=true, op=0x7f63646056e0, descriptor_data=@0x7f6367419148: 0x0, descriptor=<optimized out>, op_type=0, this=0x564c33642090)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/impl/epoll_reactor.ipp:245
#5 boost::asio::detail::reactive_socket_service_base::start_op (noop=false, is_non_blocking=<optimized out>, is_continuation=true, op=0x7f63646056e0, op_type=0, impl=..., this=<optimized out>)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/impl/reactive_socket_service_base.ipp:246
#6 boost::asio::detail::reactive_socket_service_base::start_op (this=<optimized out>, impl=..., op_type=0, op=0x7f63646056e0, is_continuation=<optimized out>, is_non_blocking=<optimized out>, noop=false)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/impl/reactive_socket_service_base.ipp:235
#7 0x00007f63c04f3123 in boost::asio::detail::reactive_socket_service_base::async_receive<boost::asio::mutable_buffers_1, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::detail::io_object_executor<boost::asio::io_context::executor_type> > (this=this@entry=0x564c338a5bb8, impl=..., buffers=..., flags=flags@entry=0, handler=..., io_ex=...) at ./obj-x86_64-linux-gnu/boost/include/boost/asio/buffer.hpp:136
#8 0x00007f63c04fa8a5 in boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>::initiate_async_receive::operator()<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::mutable_buffers_1> (this=<optimized out>, flags=0, buffers=..., handler=...) at ./obj-x86_64-linux-gnu/boost/include/boost/asio/detail/io_object_impl.hpp:144
#9 boost::asio::async_result<boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, void (boost::system::error_code, unsigned long)>::initiate<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>::initiate_async_receive, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void ()(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >, boost::asio::mutable_buffers_1 const&, int>(boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>::initiate_async_receive&&, boost::asio::ssl::detail::io_op<boost::asio::basic_stream_socket<boost::asio::ip::tcp, boost::asio::io_context::executor_type>, boost::asio::ssl::detail::shutdown_op, spawn::detail::coro_handler<boost::asio::executor_binder<void (*)(), boost::asio::strand<boost::asio::io_context::executor_type> >, void> >&&, boost::asio::mutable_buffers_1 const&, int&&) (token=..., initiation=...)
at ./obj-x86_64-linux-gnu/boost/include/boost/asio/async_result.hpp:151
The request that causes the crash is always a request that just experienced a write/read timeout. Further investigation shows the timeout handler is calling close() in a different thread than the request coroutine that calls async_shutdown() after handle_connection(). It causes the process to access the same socket from different threads, which is always dangerous.
The crash can be reproduced with the following steps:- Reduce the request timeout from 65s to 2-5s to trigger timeout more easily.
- Reduce the bandwidth of HTTPS traffic and increase its latency and loss to make download slower.
- Make the client to use "Connection: close" header.
Then, a flood of download requests from the client could trigger the segfault about 1 hour (I used 300 s3cmd running in parallel and downloading the same 3MB object repetitively via HTTPS).