Project

General

Profile

Actions

Bug #63646

open

mds: incorrectly issued the Fc caps in LOCK_EXCL_XSYN state for filelock

Added by Xiubo Li 5 months ago. Updated 5 months ago.

Status:
Pending Backport
Priority:
Normal
Assignee:
Category:
Correctness/Safety
Target version:
% Done:

0%

Source:
Tags:
backport_processed
Backport:
quincy, reef
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(FS):
MDS
Labels (FS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

# sudo ceph tell mds.a dump inode 1099928481958
{
    "path": "/volumes/csi/csi-vol-8/c1/user/08905",
    "ino": 1099928481958,
    "rdev": 0,
    "ctime": "2023-11-22T17:43:18.475402+0000",
    "btime": "2023-11-22T17:43:18.471402+0000",
    "mode": 33188,
    "uid": 1000,
    "gid": 1000,
    "nlink": 1,
    "dir_layout": {
        "dir_hash": 0,
        "unused1": 0,
        "unused2": 0,
        "unused3": 0
    },
    "layout": {
        "stripe_unit": 4194304,
        "stripe_count": 1,
        "object_size": 4194304,
        "pool_id": 3,
        "pool_ns": "" 
    },
    "old_pools": [],
    "size": 57944,
    "truncate_seq": 1,
    "truncate_size": 18446744073709551615,
    "truncate_from": 0,
    "truncate_pending": 0,
    "mtime": "2023-11-22T17:43:18.474402+0000",
    "atime": "2023-11-22T17:43:18.471402+0000",
    "time_warp_seq": 1,
    "change_attr": 8,
    "export_pin": -1,
    "export_ephemeral_random_pin": 0,
    "export_ephemeral_distributed_pin": false,
    "client_ranges": [
        {
            "client": 292645041,
            "byte range": {
                "first": 0,
                "last": 4194304
            },
            "follows": 27828
        }
    ],
    "dirstat": {
        "version": 0,
        "mtime": "0.000000",
        "num_files": 0,
        "num_subdirs": 0,
        "change_attr": 0
    },
    "rstat": {
        "version": 0,
        "rbytes": 57944,
        "rfiles": 1,
        "rsubdirs": 0,
        "rsnaps": 0,
        "rctime": "2023-11-22T17:43:18.475402+0000" 
    },
    "accounted_rstat": {
        "version": 0,
        "rbytes": 57944,
        "rfiles": 1,
        "rsubdirs": 0,
        "rsnaps": 0,
        "rctime": "2023-11-22T17:43:18.475402+0000" 
    },
    "version": 42583120,
    "file_data_version": 0,
    "xattr_version": 1,
    "backtrace_version": 42574155,
    "stray_prior_path": "",
    "max_size_ever": 0,
    "quota": {
        "max_bytes": 0,
        "max_files": 0
    },
    "last_scrub_stamp": "0.000000",
    "last_scrub_version": 0,
    "symlink": "",
    "xattrs": [
        {
            "key": "security.selinux",
            "val": "system_u:object_r:container_file_t:s0:c14,c27\u0000" 
        }
    ],
    "dirfragtree": {
        "splits": []
    },
    "old_inodes": [],
    "oldest_snap": 18446744073709551614,
    "damage_flags": 0,
    "is_auth": true,
    "auth_state": {
        "replicas": {}
    },
    "replica_state": {
        "authority": [
            0,
            -2
        ],
        "replica_nonce": 0
    },
    "auth_pins": 3,
    "is_frozen": false,
    "is_freezing": false,
    "pins": {
        "ptrwaiter": 13,
        "request": 240,
        "lock": 1,
        "caps": 1,
        "dirtyrstat": 0,
        "dirty": 0,
        "waiter": 1,
        "authpin": 1
    },
    "nref": 257,
    "versionlock": {
        "gather_set": [],
        "state": "lock",
        "is_leased": false,
        "num_rdlocks": 0,
        "num_wrlocks": 0,
        "num_xlocks": 0,
        "xlock_by": {}
    },
    "authlock": {},
    "linklock": {},
    "dirfragtreelock": {},
    "filelock": {
        "gather_set": [],
        "state": "excl->xsyn",
        "is_leased": false,
        "num_rdlocks": 0,
        "num_wrlocks": 0,
        "num_xlocks": 0,
        "xlock_by": {}
    },
    "xattrlock": {},
    "snaplock": {
        "gather_set": [],
        "state": "sync",
        "is_leased": false,
        "num_rdlocks": 2,
        "num_wrlocks": 0,
        "num_xlocks": 0,
        "xlock_by": {}
    },
    "nestlock": {
        "gather_set": [],
        "state": "lock",
        "is_leased": false,
        "num_rdlocks": 0,
        "num_wrlocks": 0,
        "num_xlocks": 0,
        "xlock_by": {}
    },
    "flocklock": {},
    "policylock": {},
    "states": [
        "auth" 
    ],
    "client_caps": [
        {
            "client_id": 292644945,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFc",
            "wanted": "-",
            "last_sent": 26
        },
        {
            "client_id": 292645008,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFc",
            "wanted": "-",
            "last_sent": 3
        },
        {
            "client_id": 292645041,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFcr",
            "wanted": "-",
            "last_sent": 25
        },
        {
            "client_id": 294423859,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFc",
            "wanted": "-",
            "last_sent": 184
        },
        {
            "client_id": 294423874,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFc",
            "wanted": "-",
            "last_sent": 10
        },
        {
            "client_id": 294423919,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFc",
            "wanted": "-",
            "last_sent": 9
        },
        {
            "client_id": 294423985,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFc",
            "wanted": "-",
            "last_sent": 12
        },
        {
            "client_id": 294424000,
            "pending": "pAsLsXsFc",
            "issued": "pAsLsXsFc",
            "wanted": "-",
            "last_sent": 3
        }
    ],
    "loner": 292645041,
    "want_loner": -1,
    "mds_caps_wanted": []
}

Related issues 3 (1 open2 closed)

Copied to CephFS - Backport #63808: quincy: mds: incorrectly issued the Fc caps in LOCK_EXCL_XSYN state for filelockResolvedXiubo LiActions
Copied to CephFS - Backport #63809: reef: mds: incorrectly issued the Fc caps in LOCK_EXCL_XSYN state for filelockIn ProgressXiubo LiActions
Copied to CephFS - Backport #63833: pacific: mds: incorrectly issued the Fc caps in LOCK_EXCL_XSYN state for filelockResolvedXiubo LiActions
Actions #1

Updated by Xiubo Li 5 months ago

  • Status changed from New to Fix Under Review
  • Backport set to pacific, quincy, reef
  • Pull request ID set to 54669
Actions #2

Updated by Venky Shankar 5 months ago

Discussion from another channel:

The above issued/pending caps in all the clients look so strange to me. While when the 'filelock' is in 'excl->xsyn' state:

120 [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },

For the 'any' it's '0'. And only the 'loner' has the 'Fcb' caps. That means excepting the 'loner' client '292645041' all the other clients shouldn't have the 'Fc' caps in issued/pending.

There seems buggy in the 'states[LOCK_EXCL_XSYN].loner' as above, which is 'false', IMO it should be 'true' since it's like all the others which has loner caps only, such as:

88     [LOCK_EXCL_SYNC] = { LOCK_SYNC, true,  LOCK_LOCK, 0,    0,   0,   0,   XCL, 0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
>  ...
92     [LOCK_XSYN_SYNC] = { LOCK_SYNC, true,  LOCK_LOCK, AUTH, 0,   AUTH,0,   0,   0,   0,   0,CEPH_CAP_GCACHE,0,0 },
93    
    ...
99     [LOCK_XSYN_LOCK] = { LOCK_LOCK, true,  LOCK_LOCK, AUTH, 0,   0,   XCL, 0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
110     [LOCK_EXCL_MIX]  = { LOCK_MIX,  true,  LOCK_LOCK, 0,    0,   0,   XCL, 0,   0,   0,   0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 },  
111     [LOCK_XSYN_MIX]  = { LOCK_MIX,  true,  LOCK_LOCK, 0,    0,   0,   XCL, 0,   0,   0,   0,0,0,0 }, 
112     
113     [LOCK_EXCL]      = { 0,         true,  LOCK_LOCK, 0,    0,   XCL, XCL, 0,   0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GEXCL|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GBUFFER,0,0 },
114     [LOCK_SYNC_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, ANY,  0,   0,   0,   0,   0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
115     [LOCK_MIX_EXCL]  = { LOCK_EXCL, true,  LOCK_LOCK, 0,    0,   0,   XCL, 0,   0,   0,   0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 },  
116     [LOCK_LOCK_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, AUTH, 0,   0,   0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
117     [LOCK_XSYN_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, AUTH, 0,   XCL, 0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
118 
119     [LOCK_XSYN]      = { 0,         true,  LOCK_LOCK, AUTH, AUTH,AUTH,XCL, 0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },         

Else it will always set the 'allowed' to 'Fcb' in 'Locker::get_allowed_caps()'.

Actions #3

Updated by Venky Shankar 5 months ago

Venky Shankar wrote:

Discussion from another channel:

The above issued/pending caps in all the clients look so strange to me. While when the 'filelock' is in 'excl->xsyn' state:

120 [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },

For the 'any' it's '0'. And only the 'loner' has the 'Fcb' caps.

Yeh, I initially though only Fb caps were allowed to a loner client in XSYN state, but I guess its fine for a loner client to read from its cache too.

That means excepting the 'loner' client '292645041' all the other clients shouldn't have the 'Fc' caps in issued/pending.

That sounds right. Those should be under revocation by the MDS when transitioning from EXCL to XSYN which is the state the lock is in. Its strange that the conflicting caps hasn't been revoked for any of the clients be the MDS.

There seems buggy in the 'states[LOCK_EXCL_XSYN].loner' as above, which is 'false', IMO it should be 'true' since it's like all the others which has loner caps only, such as:

[...]

Else it will always set the 'allowed' to 'Fcb' in 'Locker::get_allowed_caps()'.

That code in the state machine is really really old (2011). So, you suspect that since @loner is unset in the state machine for excl->xsyn, the MDS isn't revoking the caps from other clients?

Actions #4

Updated by Xiubo Li 5 months ago

Venky Shankar wrote:

Venky Shankar wrote:

Discussion from another channel:

The above issued/pending caps in all the clients look so strange to me. While when the 'filelock' is in 'excl->xsyn' state:

120 [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },

For the 'any' it's '0'. And only the 'loner' has the 'Fcb' caps.

Yeh, I initially though only Fb caps were allowed to a loner client in XSYN state, but I guess its fine for a loner client to read from its cache too.

That means excepting the 'loner' client '292645041' all the other clients shouldn't have the 'Fc' caps in issued/pending.

That sounds right. Those should be under revocation by the MDS when transitioning from EXCL to XSYN which is the state the lock is in. Its strange that the conflicting caps hasn't been revoked for any of the clients be the MDS.

There seems buggy in the 'states[LOCK_EXCL_XSYN].loner' as above, which is 'false', IMO it should be 'true' since it's like all the others which has loner caps only, such as:

[...]

Else it will always set the 'allowed' to 'Fcb' in 'Locker::get_allowed_caps()'.

That code in the state machine is really really old (2011). So, you suspect that since @loner is unset in the state machine for excl->xsyn, the MDS isn't revoking the caps from other clients?

It seems this won't trigger that.

This tracker is just fixing the confusing output for the non-loner clients in excl->xsyn state. Which is incorrect IMO.

Let me explain more:

Just after Locker::file_xsyn() setting the lock state to LOCK_EXCL_XSYN, it will try to revoke/issue the caps under this state:

5669 void Locker::file_xsyn(SimpleLock *lock, bool *need_issue)
5670 { 
5671   dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl;
5672   CInode *in = static_cast<CInode *>(lock->get_parent());
5673   ceph_assert(in->is_auth());
5674   ceph_assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()); 
5675          
5676   switch (lock->get_state()) {
5677   case LOCK_EXCL: lock->set_state(LOCK_EXCL_XSYN); break;                    =====> set the state to LOCK_EXCL_XSYN                                                                                                                                                                     
5678   default: ceph_abort();
5679   }
...
5687     
5688   if (in->is_head() &&
5689       in->issued_caps_need_gather(lock)) {
5690     if (need_issue)
5691       *need_issue = true;
5692     else
5693       issue_caps(in);                            ====> issue or revoke caps
5694     gather++;
5695   }
...

And then in Locker::issue_caps(), for the non-loner clients it will always return Fcb from get_allowed_caps(), please see the all_allowed and return value allowed:

2312 int Locker::issue_caps(CInode *in, Capability *only_cap)
2313 {
2314   // count conflicts with
2315   int nissued = 0;
2316   int all_allowed = -1, loner_allowed = -1, xlocker_allowed = -1;
2317 
2318   ceph_assert(in->is_head());
2319 
2320   // client caps
2321   map<client_t, Capability>::iterator it;
2322   if (only_cap)
2323     it = in->client_caps.find(only_cap->get_client());
2324   else
2325     it = in->client_caps.begin();
2326   for (; it != in->client_caps.end(); ++it) {
2327     Capability *cap = &it->second;
2328     int allowed = get_allowed_caps(in, cap, all_allowed, loner_allowed,
2329                                    xlocker_allowed);
2330     int pending = cap->pending();
2331     int wanted = cap->wanted();
2332 
...

But from the state machine when the filelock is in LOCK_EXCL_XSYN state, only the loner client could have the Fcb caps, and all the other clients' caps the allowed caps should be empty, please see the any below:

85                        // stable     loner  rep state  r     rp   rd   wr   fwr  l    x    caps(any,loner,xlocker,replica)
120     [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0,    0,   XCL, 0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },

And also from the above machine state it means to be a @loner state, just like LOCK_EXCL and LOCK_{X}_EXCL, etc:

113     [LOCK_EXCL]      = { 0,         true,  LOCK_LOCK, 0,    0,   XCL, XCL, 0,   0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GEXCL|CEPH_CAP_GCACHE|CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GBUFFER,0,0 },                                      
114     [LOCK_SYNC_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, ANY,  0,   0,   0,   0,   0,   0,   0,CEPH_CAP_GSHARED|CEPH_CAP_GCACHE|CEPH_CAP_GRD,0,0 },
115     [LOCK_MIX_EXCL]  = { LOCK_EXCL, true,  LOCK_LOCK, 0,    0,   0,   XCL, 0,   0,   0,   0,CEPH_CAP_GRD|CEPH_CAP_GWR,0,0 },  
116     [LOCK_LOCK_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, AUTH, 0,   0,   0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },
117     [LOCK_XSYN_EXCL] = { LOCK_EXCL, true,  LOCK_LOCK, AUTH, 0,   XCL, 0,   0,   0,   0,   0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },

It will miss the state machine when transiting from LOCK_EXCL --> LOCK_EXCL_XSYN --> LOCK_XSYN. Both the LOCK_EXCL and LOCK_XSYN won't issue any caps, including the Fcb, to non-loner clients, while the intermediate state, the LOCK_EXCL_XSYN, will. Is this intermediate state special from all the others and what we mean? Why ?

I am still not sure whether will this introduce potential bugs.

Actions #5

Updated by Venky Shankar 5 months ago

Xiubo Li wrote:

Venky Shankar wrote:

Venky Shankar wrote:

Discussion from another channel:

The above issued/pending caps in all the clients look so strange to me. While when the 'filelock' is in 'excl->xsyn' state:

120 [LOCK_EXCL_XSYN] = { LOCK_XSYN, false, LOCK_LOCK, 0, 0, XCL, 0, 0, 0, 0, 0,CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0 },

For the 'any' it's '0'. And only the 'loner' has the 'Fcb' caps.

Yeh, I initially though only Fb caps were allowed to a loner client in XSYN state, but I guess its fine for a loner client to read from its cache too.

That means excepting the 'loner' client '292645041' all the other clients shouldn't have the 'Fc' caps in issued/pending.

That sounds right. Those should be under revocation by the MDS when transitioning from EXCL to XSYN which is the state the lock is in. Its strange that the conflicting caps hasn't been revoked for any of the clients be the MDS.

There seems buggy in the 'states[LOCK_EXCL_XSYN].loner' as above, which is 'false', IMO it should be 'true' since it's like all the others which has loner caps only, such as:

[...]

Else it will always set the 'allowed' to 'Fcb' in 'Locker::get_allowed_caps()'.

That code in the state machine is really really old (2011). So, you suspect that since @loner is unset in the state machine for excl->xsyn, the MDS isn't revoking the caps from other clients?

It seems this won't trigger that.

This tracker is just fixing the confusing output for the non-loner clients in excl->xsyn state. Which is incorrect IMO.

Let me explain more:

Just after Locker::file_xsyn() setting the lock state to LOCK_EXCL_XSYN, it will try to revoke/issue the caps under this state:

[...]

And then in Locker::issue_caps(), for the non-loner clients it will always return Fcb from get_allowed_caps(), please see the all_allowed and return value allowed:

[...]

But from the state machine when the filelock is in LOCK_EXCL_XSYN state, only the loner client could have the Fcb caps, and all the other clients' caps the allowed caps should be empty, please see the any below:

[...]

And also from the above machine state it means to be a @loner state, just like LOCK_EXCL and LOCK_{X}_EXCL, etc:

[...]

It will miss the state machine when transiting from LOCK_EXCL --> LOCK_EXCL_XSYN --> LOCK_XSYN. Both the LOCK_EXCL and LOCK_XSYN won't issue any caps, including the Fcb, to non-loner clients, while the intermediate state, the LOCK_EXCL_XSYN, will. Is this intermediate state special from all the others and what we mean? Why ?

Your explanation makes sense. That bit (@loner = false) looks incorrect. This has been like this even since XSYN was introduced.

I am still not sure whether will this introduce potential bugs.

I guess only running this through fs suite will tell.

Actions #6

Updated by Xiubo Li 5 months ago

Venky Shankar wrote:

Xiubo Li wrote:

Venky Shankar wrote:

[...]

It will miss the state machine when transiting from LOCK_EXCL --> LOCK_EXCL_XSYN --> LOCK_XSYN. Both the LOCK_EXCL and LOCK_XSYN won't issue any caps, including the Fcb, to non-loner clients, while the intermediate state, the LOCK_EXCL_XSYN, will. Is this intermediate state special from all the others and what we mean? Why ?

Your explanation makes sense. That bit (@loner = false) looks incorrect. This has been like this even since XSYN was introduced.

I am still not sure whether will this introduce potential bugs.

I guess only running this through fs suite will tell.

Okay, sounds good.

Actions #7

Updated by Patrick Donnelly 5 months ago

  • Target version set to v19.0.0
Actions #8

Updated by Venky Shankar 5 months ago

  • Category set to Correctness/Safety
  • Status changed from Fix Under Review to Pending Backport
  • Backport changed from pacific, quincy, reef to quincy, reef
  • Component(FS) MDS added
Actions #9

Updated by Backport Bot 5 months ago

  • Copied to Backport #63808: quincy: mds: incorrectly issued the Fc caps in LOCK_EXCL_XSYN state for filelock added
Actions #10

Updated by Backport Bot 5 months ago

  • Copied to Backport #63809: reef: mds: incorrectly issued the Fc caps in LOCK_EXCL_XSYN state for filelock added
Actions #11

Updated by Backport Bot 5 months ago

  • Tags set to backport_processed
Actions #12

Updated by Xiubo Li 5 months ago

  • Copied to Backport #63833: pacific: mds: incorrectly issued the Fc caps in LOCK_EXCL_XSYN state for filelock added
Actions

Also available in: Atom PDF