Actions
Bug #64390
openclient: async I/O stalls if the data pool gets full
Status:
New
Priority:
Normal
Assignee:
Category:
Correctness/Safety
Target version:
% Done:
0%
Source:
Tags:
Backport:
Regression:
No
Severity:
3 - minor
Reviewed:
Affected Versions:
ceph-qa-suite:
Component(FS):
Client
Labels (FS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
test case:
TEST_F(TestClient, LlreadvLlwritevDataPoolFull) {
  /* Test performing async I/O after filling the fs and make sure it handles
     the read/write gracefully: the async write on a full pool must complete
     with -CEPHFS_ENOSPC, not stall or report a bogus byte count. */
  int mypid = getpid();
  char filename[256];

  // Remount so the test starts from a clean client state.
  client->unmount();
  TearDown();
  SetUp();

  // snprintf instead of sprintf: bound the write to the buffer size.
  snprintf(filename, sizeof(filename),
           "test_llreadvllwritevdatapoolfullfile%u", mypid);

  Inode *root, *file;
  root = client->get_root();
  ASSERT_NE(root, (Inode *)NULL);

  Fh *fh;
  struct ceph_statx stx;
  ASSERT_EQ(0, client->ll_createx(root, filename, 0666,
                                  O_RDWR | O_CREAT | O_TRUNC,
                                  &file, &fh, &stx, 0, 0, myperm));

  // Query free space so we can fill the pool completely before the async
  // write below.
  struct statvfs stbuf;
  int64_t rc = client->ll_statfs(root, &stbuf, myperm);
  ASSERT_EQ(rc, 0);
  // Cast before multiplying: f_bfree/f_bsize are unsigned and the product
  // can exceed 32 bits.
  int64_t fs_available_space = (int64_t)stbuf.f_bfree * (int64_t)stbuf.f_bsize;
  ASSERT_GT(fs_available_space, 0);

  const int64_t BUFSIZE = 1024 * 1024 * 1024;  // fill in 1 GiB chunks
  int64_t bytes_written = 0, offset = 0;

  // unique_ptr<char[]> instead of raw new[]: gtest ASSERT_* returns from
  // the test body early on failure, which would leak these 1 GiB buffers.
  std::unique_ptr<char[]> buf(new char[BUFSIZE]);
  memset(buf.get(), 0xCC, BUFSIZE);

  // Write until every remaining byte of the pool is consumed.
  while (fs_available_space > 0) {
    if (fs_available_space >= BUFSIZE) {
      bytes_written = client->ll_write(fh, offset, BUFSIZE, buf.get());
      ASSERT_GT(bytes_written, 0);
      offset += BUFSIZE;
      fs_available_space -= BUFSIZE;
    } else {
      // Final partial chunk covering whatever is left.
      std::unique_ptr<char[]> small_buf(new char[fs_available_space]);
      memset(small_buf.get(), 0xDD, fs_available_space);
      bytes_written = client->ll_write(fh, offset, fs_available_space,
                                       small_buf.get());
      ASSERT_GT(bytes_written, 0);
      break;
    }
  }

  std::unique_ptr<C_SaferCond> writefinish(
      new C_SaferCond("test-nonblocking-writefinish-datapool-full"));

  // Six 1 GiB buffers for the nonblocking writev. The first is filled with
  // 0xDD and the rest with 0xFF, matching the original repro pattern.
  constexpr int NUM_BUFS = 6;
  std::unique_ptr<char[]> out_bufs[NUM_BUFS];
  struct iovec iov_out[NUM_BUFS];
  for (int i = 0; i < NUM_BUFS; ++i) {
    out_bufs[i].reset(new char[BUFSIZE]);
    memset(out_bufs[i].get(), i == 0 ? 0xDD : 0xFF, BUFSIZE);
    iov_out[i] = {out_bufs[i].get(), (size_t)BUFSIZE};
  }

  // The pool is full, so the async write must complete with ENOSPC.
  rc = client->ll_preadv_pwritev(fh, iov_out, NUM_BUFS, 0, true,
                                 writefinish.get(), nullptr);
  ASSERT_EQ(rc, 0);
  bytes_written = writefinish->wait();
  ASSERT_EQ(bytes_written, -CEPHFS_ENOSPC);

  client->ll_release(fh);
  ASSERT_EQ(0, client->ll_unlink(root, filename, myperm));
}
First, the assertion after the async write call fails:
2024-02-12T19:09:43.795+0530 7f84bac686c0 19 client.4304 C_Write_Finisher::try_complete this 0x5594b702bcc0 onuninlinefinished 1 iofinished 1 iofinished_r 2147483647 fsync_finished 1
2024-02-12T19:09:43.795+0530 7f84bac686c0 19 client.4304 complete with iofinished_r 2147483647
/home/dparmar/CephRepoForRunningTestsLocally/ceph/src/test/client/nonblocking.cc:800: Failure
Expected equality of these values:
bytes_written
Which is: 2147483647
-28
I expected the API to return ENOSPC, but it returned 2147483647 (~2 GiB), i.e. it claims 33% of the data was written — which shouldn't happen, since I had already filled up all the available space in the first place.
We do get the ENOSPC error when releasing the file handle after this:
2024-02-12T19:09:43.795+0530 7f84bf65c9c0 1 client.4304 _release_fh 0x5594b6f277f0 on inode 0x10000000000.head(faked_ino=0 nref=8 ll_ref=1 cap_refs={4=0,1024=0,4096=0,8192=0} open={3=0} mode=100666 size=106287857664/110582824960 nlink=1 btime=2024-02-12T18:42:52.646736+0530 mtime=2024-02-12T19:09:43.796040+0530 ctime=2024-02-12T19:09:43.796040+0530 change_attr=100 caps=p(0=p) flushing_caps=Fw objectset[0x10000000000 ts 0/0 objects 1000 dirty_or_tx 0] parents=0x1.head["test_llreadvllwritevdatapoolfullfile1269955"] 0x7f84900088e0) caught async_err = (28) No space left on device
and then after this, the call stalls:
2024-02-12T19:09:43.976+0530 7f84977fe6c0 20 client.4304 upkeep thread waiting interval 1.000000000s
2024-02-12T19:09:44.614+0530 7f84b1ffb6c0 1 client.4304 _handle_full_flag: FULL: cancelling outstanding operations on 1
2024-02-12T19:09:44.614+0530 7f84b1ffb6c0 1 client.4304 _handle_full_flag: FULL: cancelling outstanding operations on 2
2024-02-12T19:09:44.614+0530 7f84b1ffb6c0 1 client.4304 _handle_full_flag: FULL: cancelling outstanding operations on 3
2024-02-12T19:09:44.614+0530 7f84b1ffb6c0 10 client.4304 unmounting: trim pass, size was 0+2
2024-02-12T19:09:44.614+0530 7f84b1ffb6c0 20 client.4304 trim_cache size 0 max 16384
2024-02-12T19:09:44.614+0530 7f84b1ffb6c0 10 client.4304 unmounting: trim pass, size still 0+2
2024-02-12T19:09:44.977+0530 7f84977fe6c0 20 client.4304 tick
2024-02-12T19:09:44.977+0530 7f84977fe6c0 20 client.4304 collect_and_send_metrics
2024-02-12T19:09:44.977+0530 7f84977fe6c0 20 client.4304 collect_and_send_global_metrics
2024-02-12T19:09:44.977+0530 7f84977fe6c0 10 client.4304 _put_inode on 0x1.head(faked_ino=0 nref=2 ll_ref=0 cap_refs={1024=0} open={} mode=40755 size=0/0 nlink=1 btime=2024-02-12T18:41:58.976066+0530 mtime=2024-02-12T18:42:52.646736+0530 ctime=2024-02-12T18:42:52.646736+0530 change_attr=1 caps=pAsLsXs(0=pAsLsXs) has_dir_layout 0x7f84900081e0) n = 1
2024-02-12T19:09:44.977+0530 7f84977fe6c0 10 client.4304 remove_cap mds.0 on 0x1.head(faked_ino=0 nref=1 ll_ref=0 cap_refs={1024=0} open={} mode=40755 size=0/0 nlink=1 btime=2024-02-12T18:41:58.976066+0530 mtime=2024-02-12T18:42:52.646736+0530 ctime=2024-02-12T18:42:52.646736+0530 change_attr=1 caps=pAsLsXs(0=pAsLsXs) has_dir_layout 0x7f84900081e0)
2024-02-12T19:09:44.977+0530 7f84977fe6c0 15 client.4304 remove_cap last one, closing snaprealm 0x7f84900080f0
2024-02-12T19:09:44.977+0530 7f84977fe6c0 20 client.4304 put_snap_realm 0x1 0x7f84900080f0 2 -> 1
2024-02-12T19:09:44.977+0530 7f84977fe6c0 10 client.4304 _put_inode deleting 0x1.head(faked_ino=0 nref=1 ll_ref=0 cap_refs={1024=0} open={} mode=40755 size=0/0 nlink=1 btime=2024-02-12T18:41:58.976066+0530 mtime=2024-02-12T18:42:52.646736+0530 ctime=2024-02-12T18:42:52.646736+0530 change_attr=1 caps=- has_dir_layout 0x7f84900081e0)
2024-02-12T19:09:44.977+0530 7f84977fe6c0 10 client.4304 _put_inode on 0x10000000000.head(faked_ino=0 nref=4 ll_ref=0 cap_refs={4=0,1024=0,4096=0,8192=0} open={3=0} mode=100666 size=106287857664/110582824960 nlink=1 btime=2024-02-12T18:42:52.646736+0530 mtime=2024-02-12T19:09:43.796040+0530 ctime=2024-02-12T19:09:43.796040+0530 change_attr=100 caps=p(0=p) flushing_caps=Fw objectset[0x10000000000 ts 0/0 objects 332 dirty_or_tx 0] 0x7f84900088e0) n = 2
2024-02-12T19:09:44.977+0530 7f84977fe6c0 20 client.4304 trim_cache size 0 max 16384
2024-02-12T19:09:44.977+0530 7f84977fe6c0 20 client.4304 upkeep thread waiting interval 1.000000000s
2024-02-12T19:09:45.682+0530 7f84b1ffb6c0 1 client.4304 _handle_full_flag: FULL: cancelling outstanding operations on 1
2024-02-12T19:09:45.682+0530 7f84b1ffb6c0 1 client.4304 _handle_full_flag: FULL: cancelling outstanding operations on 2
2024-02-12T19:09:45.682+0530 7f84b1ffb6c0 1 client.4304 _handle_full_flag: FULL: cancelling outstanding operations on 3
2024-02-12T19:09:45.682+0530 7f84b1ffb6c0 10 client.4304 unmounting: trim pass, size was 0+1
2024-02-12T19:09:45.682+0530 7f84b1ffb6c0 20 client.4304 trim_cache size 0 max 16384
2024-02-12T19:09:45.682+0530 7f84b1ffb6c0 10 client.4304 unmounting: trim pass, size still 0+1
2024-02-12T19:09:45.977+0530 7f84977fe6c0 20 client.4304 tick
2024-02-12T19:09:45.977+0530 7f84977fe6c0 20 client.4304 collect_and_send_metrics
2024-02-12T19:09:45.977+0530 7f84977fe6c0 20 client.4304 collect_and_send_global_metrics
2024-02-12T19:09:45.977+0530 7f84977fe6c0 20 client.4304 trim_cache size 0 max 16384
2024-02-12T19:09:45.977+0530 7f84977fe6c0 20 client.4304 upkeep thread waiting interval 1.000000000s
2024-02-12T19:09:46.977+0530 7f84977fe6c0 20 client.4304 tick
Updated by Venky Shankar 3 months ago
- Category set to Correctness/Safety
- Assignee set to Dhairya Parmar
- Target version set to v19.0.0
Actions