Support #22471
Status: Closed — Subject: osd op
0%
Description
[root@localhost ~]# ceph daemon osd.106 dump_historic_slow_ops
{
"num to keep": 20,
"threshold to keep": 10,
"Ops": [
{
"description": "osd_op(client.25433.0:1582472 15.e6s0 15:6707d682:::rbd_data.19.5eab74b0dc51.0000000000006afe:head [set-alloc-hint object_size 4194304 write_size 4194304,write 0~3997696] snapc 0=[] ondisk+write+known_if_redirected e1360)",
"initiated_at": "2017-12-17 08:52:01.049025",
"age": 91045.294053,
"duration": 106.969826,
"type_data": {
"flag_point": "commit sent; apply or cleanup",
"client_info": {
"client": "client.25433",
"client_addr": "10.193.56.234:0/2975911039",
"tid": 1582472
},
"events": [
{
"time": "2017-12-17 08:52:01.049025",
"event": "initiated"
},
{
"time": "2017-12-17 08:53:47.750308",
"event": "queued_for_pg"
},
{
"time": "2017-12-17 08:53:47.750330",
"event": "reached_pg"
},
{
"time": "2017-12-17 08:53:47.764229",
"event": "started"
},
{
"time": "2017-12-17 08:53:47.768730",
"event": "started"
},
{
"time": "2017-12-17 08:53:47.806282",
"event": "sub_op_committed"
},
{
"time": "2017-12-17 08:53:47.806286",
"event": "sub_op_applied"
},
{
"time": "2017-12-17 08:53:48.018778",
"event": "commit_sent"
},
{
"time": "2017-12-17 08:53:48.018852",
"event": "done"
}
]
}
}
]
}
But ceph.log shows nothing abnormal, such as slow request warnings.
2017-12-17 08:46:03.882734 mon.node1 mon.0 10.193.56.230:6789/0 16138 : cluster [INF] mon.1 10.193.56.231:6789/0
2017-12-17 08:46:03.883054 mon.node1 mon.0 10.193.56.230:6789/0 16139 : cluster [INF] mon.2 10.193.56.232:6789/0
2017-12-17 08:46:03.884719 mon.node1 mon.0 10.193.56.230:6789/0 16140 : cluster [INF] mon.3 10.193.56.233:6789/0
2017-12-17 08:51:03.882178 mon.node1 mon.0 10.193.56.230:6789/0 16208 : cluster [INF] mon.2 10.193.56.232:6789/0
2017-12-17 08:51:03.882245 mon.node1 mon.0 10.193.56.230:6789/0 16209 : cluster [INF] mon.3 10.193.56.233:6789/0
2017-12-17 08:51:03.882273 mon.node1 mon.0 10.193.56.230:6789/0 16210 : cluster [INF] mon.4 10.193.56.234:6789/0
2017-12-17 08:51:03.882320 mon.node1 mon.0 10.193.56.230:6789/0 16211 : cluster [INF] mon.1 10.193.56.231:6789/0
2017-12-17 08:56:03.884224 mon.node1 mon.0 10.193.56.230:6789/0 16275 : cluster [INF] mon.1 10.193.56.231:6789/0
2017-12-17 08:56:03.885126 mon.node1 mon.0 10.193.56.230:6789/0 16276 : cluster [INF] mon.2 10.193.56.232:6789/0
2017-12-17 08:56:03.887934 mon.node1 mon.0 10.193.56.230:6789/0 16277 : cluster [INF] mon.4 10.193.56.234:6789/0
2017-12-17 08:56:03.888850 mon.node1 mon.0 10.193.56.230:6789/0 16278 : cluster [INF] mon.3 10.193.56.233:6789/0
2017-12-17 09:00:00.000090 mon.node1 mon.0 10.193.56.230:6789/0 16314 : cluster [WRN] overall HEALTH_WARN noscrub,nodeep-scrub flag(s) set; application not enabled on 10 pool(s)
2017-12-17 09:01:03.882950 mon.node1 mon.0 10.193.56.230:6789/0 16330 : cluster [INF] mon.2 10.193.56.232:6789/0
2017-12-17 09:01:03.883001 mon.node1 mon.0 10.193.56.230:6789/0 16331 : cluster [INF] mon.4 10.193.56.234:6789/0
Files
Updated by zhou yang over 6 years ago
- File ceph.log ceph.log added
- File ceph-osd.106.log.2.bz2 ceph-osd.106.log.2.bz2 added
ceph 12.2.1
rbd ec-overwrite.
Sometimes an osd op was blocked for a long time (such as 106 seconds) between "initiated" and "queued_for_pg". Has anyone encountered a similar problem?
Updated by Nathan Cutler over 6 years ago
- Tracker changed from Tasks to Bug
- Project changed from Stable releases to Ceph
- Regression set to No
- Severity set to 3 - minor
Updated by zhou yang over 6 years ago
This problem seems to be network related. We added debug output in AsyncConnection::process and found that it spends a long time in the state STATE_OPEN_MESSAGE_READ_FRONT.
case STATE_OPEN_MESSAGE_READ_FRONT:
{
// read front
unsigned front_len = current_header.front_len;
if (front_len) {
if (!front.length())
front.push_back(buffer::create(front_len));
r = read_until(front_len, front.c_str());
if (r < 0) {
ldout(async_msgr->cct, 0) << func << " read message front failed" << dendl;
goto fail;
} else if (r > 0) {
{
utime_t t1=ceph_clock_now() - recv_stamp;
if(t1 > utime_t(1,0)) {
ldout(async_msgr->cct, 0) << "asyncconnection_process" << " pthread_id=" << pthread_self() << " time=" << t1 << dendl;
}
}
}@
debug output:
2017-12-19 19:03:20.690473 7f527ee20700 0 -- 10.193.56.233:6829/5275 >> 10.193.56.234:6849/26811 conn(0x7f529e624000 :6829 s=STATE_OPEN_MESSAGE_READ_FRONT pgs=7 cs=1 l=0).asyncconnection_process pthread_id=139992292787968 front_len time=1.519262
2017-12-19 19:03:22.356598 7f527ee20700 0 -- 10.193.56.233:6829/5275 >> 10.193.56.234:6849/26811 conn(0x7f529e624000 :6829 s=STATE_OPEN_MESSAGE_READ_FRONT pgs=7 cs=1 l=0).asyncconnection_process pthread_id=139992292787968 front_len time=3.185389
2017-12-19 19:03:25.692588 7f527ee20700 0 -- 10.193.56.233:6829/5275 >> 10.193.56.234:6849/26811 conn(0x7f529e624000 :6829 s=STATE_OPEN_MESSAGE_READ_FRONT pgs=7 cs=1 l=0).asyncconnection_process pthread_id=139992292787968 front_len time=6.521378
2017-12-19 19:03:32.372666 7f527ee20700 0 -- 10.193.56.233:6829/5275 >> 10.193.56.234:6849/26811 conn(0x7f529e624000 :6829 s=STATE_OPEN_MESSAGE_READ_FRONT pgs=7 cs=1 l=0).asyncconnection_process pthread_id=139992292787968 front_len time=13.201456
2017-12-19 19:03:45.716051 7f527ee20700 0 -- 10.193.56.233:6829/5275 >> 10.193.56.234:6849/26811 conn(0x7f529e624000 :6829 s=STATE_OPEN_MESSAGE_READ_FRONT pgs=7 cs=1 l=0).asyncconnection_process pthread_id=139992292787968 front_len time=26.544841
2017-12-19 19:04:12.403907 7f527ee20700 0 -- 10.193.56.233:6829/5275 >> 10.193.56.234:6849/26811 conn(0x7f529e624000 :6829 s=STATE_OPEN_MESSAGE_READ_FRONT pgs=7 cs=1 l=0).asyncconnection_process pthread_id=139992292787968 front_len time=53.232698
{
utime_t t1=ceph_clock_now() - recv_stamp;
if(t1 > utime_t(1,0)) {
ldout(async_msgr->cct, 0) << "asyncconnection_process" << " pthread_id=" << pthread_self() << " front_len time=" << t1 << dendl;
}
}
break;
}
ldout(async_msgr->cct, 20) << func << " got front " << front.length() << dendl;
}
state = STATE_OPEN_MESSAGE_READ_MIDDLE;
It looks like the socket read was blocked and returned EAGAIN:
/* return -1 means `fd` occurs error or closed, it should be closed
 * return 0 means EAGAIN or EINTR */
ssize_t AsyncConnection::read_bulk(char *buf, unsigned len)
{
  ssize_t nread;
 again:
  nread = cs.read(buf, len);
  if (nread < 0) {
    if (nread == -EAGAIN) {
      nread = 0;
    } else if (nread == -EINTR) {
      goto again;
    } else {
      ldout(async_msgr->cct, 1) << __func__ << " reading from fd=" << cs.fd()
                                << " : " << strerror(nread) << dendl;
      return -1;
    }
  } else if (nread == 0) {
    ldout(async_msgr->cct, 1) << __func__ << " peer close file descriptor "
                              << cs.fd() << dendl;
    return -1;
  }
  return nread;
}
Updated by Greg Farnum over 6 years ago
- Tracker changed from Bug to Support
- Status changed from New to Closed
Sounds like the network was slow or something?
Updated by zhou yang over 6 years ago
Greg Farnum wrote:
Sounds like the network was slow or something?
Yes,we changed the switch, the problem was solved.