
Bug #18579

Updated by John Spray over 7 years ago

<pre>
mds_sessions
{
    "id": 4125,
    "sessions": [
        {
            "mds": 0,
            "addr": "194.12.182.59:6812\/3223369410",
            "seq": 11847,
            "cap_gen": 0,
            "cap_ttl": "2017-01-18 10:03:38.879099",
            "last_cap_renew_request": "2017-01-18 10:02:38.879099",
            "cap_renew_seq": 20,
            "num_caps": 4782,
            "state": "open"
        },
        {
            "mds": 1,
            "addr": "194.12.182.59:6813\/197224361",
            "seq": 0,
            "cap_gen": 0,
            "cap_ttl": "0.000000",
            "last_cap_renew_request": "0.000000",
            "cap_renew_seq": 0,
            "num_caps": 0,
            "state": "opening"
        }
    ],
    "mdsmap_epoch": 21
}
</pre>

<pre>
mds_requests
{
    "request": {
        "tid": 5837,
        "op": "mkdir",
        "path": "#20000000274\/fssnap.d",
        "path2": "",
        "ino": "20000000274",
        "dentry": "fssnap.d",
        "hint_ino": "0",
        "sent_stamp": "2017-01-18 09:59:38.913936",
        "mds": -1,
        "resend_mds": -1,
        "send_to_auth": 0,
        "sent_on_mseq": 0,
        "retry_attempt": 0,
        "got_unsafe": 0,
        "uid": 1000,
        "gid": 1000,
        "oldest_client_tid": 5837,
        "mdsmap_epoch": 0,
        "flags": 0,
        "num_retry": 0,
        "num_fwd": 0,
        "num_releases": 0,
        "abort_rc": 0
    }
}
</pre>
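Dumps like the two above can be collected from the client's admin socket. A minimal sketch, assuming a ceph-fuse client with an admin socket under /var/run/ceph/ (the socket path below is a placeholder; adjust it to the actual .asok file on the client node):

<pre>
# Hypothetical admin socket path for a ceph-fuse client.
ASOK=/var/run/ceph/ceph-client.admin.asok

# Client's view of its sessions with each MDS rank.
ceph daemon $ASOK mds_sessions

# Any MDS requests still in flight from this client.
ceph daemon $ASOK mds_requests
</pre>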

With a file-creation workload running, initially with one MDS rank, I created MDS rank 1 and then deactivated it, and the client got stuck. No client logs, but I find it likely that this is reproducible.
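The reproduction sequence described above would look roughly like the following sketch. It assumes a filesystem named "cephfs" (a placeholder) and the `ceph mds deactivate` command that was current at the time of this report; exact commands vary by release:

<pre>
# Start with a single active rank, then let a standby take rank 1.
ceph fs set cephfs max_mds 2

# ... run a file-creation workload against a client mount ...

# Deactivate rank 1 again while the workload is still running.
ceph mds deactivate 1    # on newer releases: ceph fs set cephfs max_mds 1

# If the bug reproduces, the client's requests stall (see the
# mds_requests dump above, stuck with "mds": -1).
</pre>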

 This kind of bug should be reproducible by the cluster size thrasher: http://tracker.ceph.com/issues/10792 
