Actions
Bug #18798
Closed
FS activity hung, MDS reports client "failing to respond to capability release"
Status:
Resolved
Priority:
Normal
Assignee:
-
Category:
-
Target version:
-
% Done:
0%
Source:
Tags:
cephfs, mds
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(FS):
Labels (FS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
I've had two occurrences in the past 3 weeks where filesystem activity hangs, with the MDS reporting a client "failing to respond to capability release". I'm using the kernel CephFS client. Ceph is version 10.2.5 and the kernel is version 4.9.0. I've since downgraded the kernel to 4.7.5, as I was running that kernel previously and did not have any issues (which could be just luck, or the bug was introduced somewhere between 4.7.5 and 4.9.0).
More details below:
[root@kb-ceph03 ~]# ceph status cluster 3bd52056-9ed8-4ff6-9359-6a3ea437d1e9 health HEALTH_WARN mds0: Client kb-ceph03.knc.local: failing to respond to capability release monmap e2: 3 mons at {kb-ceph01=10.100.212.2:6789/0,kb-ceph02=10.100.212.3:6789/0,kb-ceph03=10.100.212.4:6789/0} election epoch 362, quorum 0,1,2 kb-ceph01,kb-ceph02,kb-ceph03 fsmap e215: 1/1/1 up {0=kb-ceph03=up:active}, 2 up:standby osdmap e1293: 24 osds: 24 up, 24 in flags sortbitwise,require_jewel_osds pgmap v2612120: 1296 pgs, 4 pools, 5782 GB data, 1446 kobjects 15314 GB used, 115 TB / 130 TB avail 1296 active+clean client io 8127 kB/s wr, 0 op/s rd, 1 op/s wr [root@kb-ceph03 ~]# ceph mds stat e215: 1/1/1 up {0=kb-ceph03=up:active}, 2 up:standby [root@kb-ceph03 ~]# ceph daemon mds.kb-ceph03 dump_ops_in_flight { "ops": [ { "description": "client_request(client.48718333:240331824 getattr pAsLsXsFs #1000000498c 2017-02-01 14:03:38.913579)", "initiated_at": "2017-02-01 14:03:38.914845", "age": 1446.955546, "duration": 1446.955708, "type_data": [ "failed to rdlock, waiting", "client.48718333:240331824", "client_request", { "client": "client.48718333", "tid": 240331824 }, [ { "time": "2017-02-01 14:03:38.914845", "event": "initiated" }, { "time": "2017-02-01 14:03:38.914958", "event": "failed to rdlock, waiting" } ] ] }, { "description": "client_request(client.48718333:240322739 getattr pAsLsXsFs #1000000498c 2017-02-01 14:01:27.049810)", "initiated_at": "2017-02-01 14:01:27.049678", "age": 1578.820714, "duration": 1578.820904, "type_data": [ "failed to rdlock, waiting", "client.48718333:240322739", "client_request", { "client": "client.48718333", "tid": 240322739 }, [ { "time": "2017-02-01 14:01:27.049678", "event": "initiated" }, { "time": "2017-02-01 14:01:27.049869", "event": "failed to rdlock, waiting" } ] ] } ], "num_ops": 2 } [root@kb-ceph03 ~]# ceph daemon mds.kb-ceph03 session ls [ { "id": 68523984, "num_leases": 1, "num_caps": 820, "state": "open", "replay_requests": 0, 
"completed_requests": 0, "reconnecting": false, "inst": "client.68523984 10.100.212.3:0\/2863784256", "client_metadata": { "entity_id": "admin", "hostname": "kb-ceph02.knc.local", "kernel_version": "4.9.0-1.el7.elrepo.x86_64", "root": "\/" } }, { "id": 48805081, "num_leases": 0, "num_caps": 843, "state": "open", "replay_requests": 0, "completed_requests": 0, "reconnecting": false, "inst": "client.48805081 10.100.212.4:0\/1660598714", "client_metadata": { "entity_id": "admin", "hostname": "kb-ceph03.knc.local", "kernel_version": "4.9.0-1.el7.elrepo.x86_64", "root": "\/" } }, { "id": 48718333, "num_leases": 0, "num_caps": 840, "state": "open", "replay_requests": 0, "completed_requests": 2, "reconnecting": false, "inst": "client.48718333 10.100.212.2:0\/2586033108", "client_metadata": { "entity_id": "admin", "hostname": "kb-ceph01.knc.local", "kernel_version": "4.9.0-1.el7.elrepo.x86_64", "root": "\/" } } ]
Actions