Actions
Bug #44247
opencreate_queue_pair failed may lead to Segmentation fault in RDMAConnectedSocketImpl
Status:
New
Priority:
Normal
Assignee:
-
Category:
msgr
Target version:
-
% Done:
0%
Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
I run ceph_perf_msgr_server/client with RoCE, when create_queue_pair failed, the constructor of RDMAConnectedSocketImpl may occur Segmentation fault.
2020-01-14T08:59:09.088+0800 ffff672f1e80 10 Processor -- accept listen_fd=27
2020-01-14T08:59:09.088+0800 ffff672f1e80 15 RDMAServerSocketImpl accept
2020-01-14T08:59:09.088+0800 ffff672f1e80 20 Infiniband init started.
hr_qp->port_num= 0x1
2020-01-14T08:59:09.088+0800 ffff672f1e80 20 Infiniband modify_qp_to_init successfully switch to INIT state Queue Pair, qp number: 24
2020-01-14T08:59:09.088+0800 ffff672f1e80 20 Infiniband init successfully create queue pair: qp=0xaaab0c52e280
2020-01-14T08:59:09.088+0800 ffff672f1e80 -1 Infiniband can_alloc WARNING: OUT OF RX BUFFERS: allocated: 24576 requested: 32768 limit: 32768
2020-01-14T08:59:09.088+0800 ffff672f1e80 -1 Infiniband can_alloc WARNING: OUT OF RX BUFFERS: allocated: 24576 requested: 16384 limit: 32768
2020-01-14T08:59:09.088+0800 ffff672f1e80 -1 Infiniband post_chunks_to_rq WARNING: out of memory. Request 4096 rx buffers. Only get 0 rx buffers.
2020-01-14T08:59:09.088+0800 ffff672f1e80 -1 Infiniband init intialize no SRQ Queue Pair, qp number: 24 fatal error: can't post SQ WR
2020-01-14T08:59:09.088+0800 ffff672f1e80 20 Infiniband ~QueuePair destroy Queue Pair, qp number: 24 left SQ WR 0
2020-01-14T08:59:09.088+0800 ffff672f1e80 20 Infiniband ~QueuePair destroy qp=0xaaab0c52e280
*** Caught signal (Segmentation fault) **
in thread ffff672f1e80 thread_name:msgr-worker-0
ceph version 15.0.0-8506-g0277d9184e (0277d9184ee3f681fad7812b4275e8d97353353d) octopus (dev)
1: (__kernel_rt_sigreturn()+0) [0xffffa8c315c0]
2: (RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext*, std::shared_ptr<Infiniband>&, std::shared_ptr<RDMADispatcher>&, RDMAWorker*)+0x18c) [0xaaaac62ec874]
3: (RDMAServerSocketImpl::accept(ConnectedSocket*, SocketOptions const&, entity_addr_t*, Worker*)+0x124) [0xaaaac62f4dcc]
4: (Processor::accept()+0x11c) [0xaaaac60396b4]
5: (EventCenter::process_events(unsigned int, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >*)+0x51c) [0xaaaac604167c]
6: (()+0x469a20) [0xaaaac6047a20]
7: (()+0xc9ed4) [0xffffa8719ed4]
8: (()+0x7088) [0xffffa8bcd088]
2020-01-14T08:59:09.100+0800 ffff672f1e80 -1 *** Caught signal (Segmentation fault) **
in thread ffff672f1e80 thread_name:msgr-worker-0
ceph version 15.0.0-8506-g0277d9184e (0277d9184ee3f681fad7812b4275e8d97353353d) octopus (dev)
1: (__kernel_rt_sigreturn()+0) [0xffffa8c315c0]
2: (RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext*, std::shared_ptr<Infiniband>&, std::shared_ptr<RDMADispatcher>&, RDMAWorker*)+0x18c) [0xaaaac62ec874]
3: (RDMAServerSocketImpl::accept(ConnectedSocket*, SocketOptions const&, entity_addr_t*, Worker*)+0x124) [0xaaaac62f4dcc]
4: (Processor::accept()+0x11c) [0xaaaac60396b4]
5: (EventCenter::process_events(unsigned int, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >*)+0x51c) [0xaaaac604167c]
6: (()+0x469a20) [0xaaaac6047a20]
7: (()+0xc9ed4) [0xffffa8719ed4]
8: (()+0x7088) [0xffffa8bcd088]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
here are the code of create_queue_pair and RDMAConnectedSocketImpl constructor in msg/async/rdma/Infiniband.cc
Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx,
CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id)
{
Infiniband::QueuePair *qp = new QueuePair(
cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id);
if (qp->init()) {
delete qp;
return NULL;
}
return qp;
}
...
RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, shared_ptr<Infiniband> &ib,
shared_ptr<RDMADispatcher>& rdma_dispatcher,
RDMAWorker *w)
: cct(cct), connected(0), error(0), ib(ib),
dispatcher(rdma_dispatcher), worker(w),
is_server(false), con_handler(new C_handle_connection(this)),
active(false), pending(false)
{
if (!cct->_conf->ms_async_rdma_cm) {
qp = ib->create_queue_pair(cct, dispatcher->get_tx_cq(), dispatcher->get_rx_cq(), IBV_QPT_RC, NULL);
local_qpn = qp->get_local_qp_number();
notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
dispatcher->register_qp(qp, this);
dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
}
}
Files
No data to display
Actions