Actions
Bug #62293
Closed — osd mclock QoS : osd_mclock_scheduler_client_lim is not limited
% Done:
0%
Description
First of all, I want to confirm whether there is a problem with my understanding:
(1) reservation / weight / limit
(2) the limit is indeed limiting the maximum resource usage of IOPS/Bandwidth that a certain type of client can use
- osd_mclock_max_capacity_iops_hdd 180
- osd_mclock_scheduler_client_lim 0.25 ==> 180*0.25 = 45 iops
- osd_mclock_max_sequential_bandwidth_hdd 157286400 ==> 150M*0.25= 37.5 MiB/s
for example,
I limit the client's iops to a maximum of 100 iops,
Then no matter how many rados benches I run, it will never exceed 100 iops,
The iops/util of the HDD disk should always be stable at the same water level
If my understanding of limit is correct,
then the pressure on osd should not increase with the increase of rados bench
Problem:
But the actual situation is that the more clients there are in rados bench,
the greater the pressure on osd, and the more iops/bandwidth consumption of hdd
WHY?
reproduce:
ceph cluster:
# ceph -s
cluster:
id: dcc749b4-c686-453b-8f25-6b965cdb360f
health: HEALTH_OK
services:
mon: 1 daemons, quorum a (age 22h)
mgr: x(active, since 22h)
osd: 1 osds: 1 up (since 20m), 1 in (since 23h)
data:
pools: 2 pools, 129 pgs
objects: 17.80M objects, 1.7 TiB
usage: 2.0 TiB used, 7.4 TiB / 9.4 TiB avail
pgs: 129 active+clean
# ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-1 9.39259 root default
-3 9.39259 host SZJD-YFQ-PM-OS01-BCONEST-06
0 hdd 9.39259 osd.0 up 1.00000 1.00000
# ceph osd df
ID CLASS WEIGHT REWEIGHT SIZE RAW USE DATA OMAP META AVAIL %USE VAR PGS STATUS
0 hdd 9.39259 1.00000 9.4 TiB 2.0 TiB 1.7 TiB 1 KiB 12 GiB 7.4 TiB 21.52 1.00 129 up
TOTAL 9.4 TiB 2.0 TiB 1.7 TiB 1.2 KiB 12 GiB 7.4 TiB 21.52
MIN/MAX VAR: 1.00/1.00 STDDEV: 0
# ceph df
--- RAW STORAGE ---
CLASS SIZE AVAIL USED RAW USED %RAW USED
hdd 9.4 TiB 7.4 TiB 2.0 TiB 2.0 TiB 21.52
TOTAL 9.4 TiB 7.4 TiB 2.0 TiB 2.0 TiB 21.52
--- POOLS ---
POOL ID PGS STORED OBJECTS USED %USED MAX AVAIL
.mgr 1 1 577 KiB 2 580 KiB 0 6.9 TiB
test-pool 2 128 1.7 TiB 17.80M 1.7 TiB 19.99 6.9 TiB
# ceph osd pool ls detail
pool 1 '.mgr' replicated size 1 min_size 1 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 8 flags hashpspool stripe_width 0 pg_num_max 32 pg_num_min 1 application mgr read_balance_score 1.00
pool 2 'test-pool' replicated size 1 min_size 1 crush_rule 0 object_hash rjenkins pg_num 128 pgp_num 128 autoscale_mode off last_change 20 flags hashpspool stripe_width 0 application rgw read_balance_score 1.00
# ceph osd crush rule dump
[
{
"rule_id": 0,
"rule_name": "replicated_rule",
"type": 1,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "choose_firstn",
"num": 0,
"type": "osd"
},
{
"op": "emit"
}
]
}
]
# ls -l dev/osd0/
total 72
-rw------- 1 root root 11 Aug 2 17:43 bfm_blocks
-rw------- 1 root root 4 Aug 2 17:43 bfm_blocks_per_key
-rw------- 1 root root 5 Aug 2 17:43 bfm_bytes_per_block
-rw------- 1 root root 15 Aug 2 17:43 bfm_size
lrwxrwxrwx 1 root root 8 Aug 2 17:43 block -> /dev/sdh
lrwxrwxrwx 1 root root 10 Aug 2 17:43 block.db -> /dev/sdag1
lrwxrwxrwx 1 root root 10 Aug 2 17:43 block.wal -> /dev/sdag2
-rw------- 1 root root 2 Aug 2 17:43 bluefs
-rw------- 1 root root 37 Aug 2 17:43 ceph_fsid
-rw------- 1 root root 75 Aug 2 17:43 ceph_version_when_created
-rw------- 1 root root 28 Aug 2 17:43 created_at
-rw-r--r-- 1 root root 37 Aug 2 17:43 fsid
-rw-r--r-- 1 root root 63 Aug 2 17:43 keyring
-rw------- 1 root root 8 Aug 2 17:43 kv_backend
-rw------- 1 root root 21 Aug 2 17:43 magic
-rw------- 1 root root 4 Aug 2 17:43 mkfs_done
-rw------- 1 root root 41 Aug 2 17:43 osd_key
-rw------- 1 root root 6 Aug 2 17:43 ready
-rw------- 1 root root 3 Aug 2 17:44 require_osd_release
-rw------- 1 root root 10 Aug 2 17:43 type
-rw------- 1 root root 2 Aug 2 17:43 whoami
[root@SZJD-YFQ-PM-OS01-BCONEST-06 b]# lsblk /dev/sdh /dev/sdag
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
sdh 8:112 0 9.1T 0 disk //hdd block
sdag 66:0 0 447.1G 0 disk //ssd
|-sdag1 66:1 0 304.1G 0 part //db
`-sdag2 66:2 0 102.9G 0 part //wal
Thu Aug 3 14:57:21 UTC 2023
"debug_mclock": "1/5",
"osd_mclock_force_run_benchmark_on_init": "false",
"osd_mclock_iops_capacity_threshold_hdd": "180.000000",
"osd_mclock_iops_capacity_threshold_ssd": "80000.000000",
"osd_mclock_max_capacity_iops_hdd": "180.000000",
"osd_mclock_max_capacity_iops_ssd": "21500.000000",
"osd_mclock_max_sequential_bandwidth_hdd": "157286400",
"osd_mclock_max_sequential_bandwidth_ssd": "1258291200",
"osd_mclock_override_recovery_settings": "false",
"osd_mclock_profile": "custom",
"osd_mclock_scheduler_anticipation_timeout": "0.000000",
"osd_mclock_scheduler_background_best_effort_lim": "0.100000",
"osd_mclock_scheduler_background_best_effort_res": "0.100000",
"osd_mclock_scheduler_background_best_effort_wgt": "1",
"osd_mclock_scheduler_background_recovery_lim": "1.000000",
"osd_mclock_scheduler_background_recovery_res": "0.480000",
"osd_mclock_scheduler_background_recovery_wgt": "17",
"osd_mclock_scheduler_client_lim": "0.450000", //lim = 180*0.45 = 81 iops
"osd_mclock_scheduler_client_res": "0.250000", //res = 180*0.25 = 45 iops
"osd_mclock_scheduler_client_wgt": "3",
"osd_mclock_skip_benchmark": "true",
"osd_op_queue": "mclock_scheduler",
# cat write-name4.txt
89b8a2e4-cdbc-4a96-8432-0a0c28ebe847%25.779912.1__shadow_.TOh6_uWIYrPYx5bqObztEFip9uFKxz1_000000000000000000000000000000009d76cf627ab3a_WX_YCY0801_475279953_0
f5e1cec2-3027-49f6-8299-b0334e633178%25.780012.1__shadow_.TOh6_uWIYrPYx5bqObztEFip9uFKxz1_000000000000000000000000000000003d0fbf2875c33_WX_YCY0801_714510436_0
b29bd483-dac7-48ae-be13-ac28351ccdd1%25.780112.1__shadow_.TOh6_uWIYrPYx5bqObztEFip9uFKxz1_000000000000000000000000000000004870c438a5c82_WX_YCY0801_010795134_0
6daeab64-417e-4117-b5be-293e862255c2%25.780212.1__shadow_.TOh6_uWIYrPYx5bqObztEFip9uFKxz1_000000000000000000000000000000003a3b6c4cbd4a8_WX_YCY0801_853521873_0
# cat test-4read.sh
# Launch one background "rados bench" reader per object name listed in
# write-name4.txt, all appending to a shared readlog file.
#
# Read the file line-by-line instead of `for name in $(cat …)`: the
# command-substitution form word-splits and glob-expands each name, which
# silently corrupts the loop if a name ever contains whitespace or glob
# characters. IFS= and -r preserve each line exactly (no backslash mangling).
while IFS= read -r name || [[ -n "$name" ]]; do
  # NOTE(review): multiple background jobs appending to the same readlog
  # may interleave output; kept as in the original reproduction script.
  rados bench 3600 rand -t 2 -p test-pool --osd_client_op_priority 47 --show-time --run-name "$name" >> readlog &
done < ./write-name4.txt
test-case-1: one-rados-bench : Disk utilization 40%
iostat -xtm 1 -d /dev/sdag /dev/sdh
08/03/23 15:06:31
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 52.00 0.00 1.80 0.00 0.00 0.00 0.00 0.00 0.35 0.00 0.02 35.38 0.00 0.27 1.40
sdh 45.00 0.00 4.57 0.00 89.00 0.00 66.42 0.00 8.89 0.00 0.40 104.00 0.00 7.69 34.60
08/03/23 15:06:32
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 52.00 0.00 1.31 0.00 0.00 0.00 0.00 0.00 0.38 0.00 0.02 25.85 0.00 0.35 1.80
sdh 48.00 0.00 4.88 0.00 95.00 0.00 66.43 0.00 11.56 0.00 0.56 104.00 0.00 8.98 43.10
08/03/23 15:06:33
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 51.00 0.00 0.84 0.00 0.00 0.00 0.00 0.00 0.20 0.00 0.01 16.94 0.00 0.20 1.00
sdh 49.00 0.00 4.98 0.00 94.00 0.00 65.73 0.00 9.78 0.00 0.48 104.00 0.00 8.65 42.40
08/03/23 15:06:34
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 53.00 0.00 0.74 0.00 0.00 0.00 0.00 0.00 0.25 0.00 0.01 14.26 0.00 0.25 1.30
sdh 51.00 0.00 5.18 0.00 100.00 0.00 66.23 0.00 9.33 0.00 0.48 104.00 0.00 8.25 42.10
08/03/23 15:06:35
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 49.00 0.00 0.39 0.00 0.00 0.00 0.00 0.00 0.22 0.00 0.01 8.08 0.00 0.18 0.90
sdh 49.00 0.00 4.98 0.00 98.00 0.00 66.67 0.00 10.02 0.00 0.49 104.00 0.00 8.73 42.80
08/03/23 15:06:50
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 46.00 0.00 0.36 0.00 0.00 0.00 0.00 0.00 0.17 0.00 0.01 8.09 0.00 0.17 0.80
sdh 44.00 0.00 4.47 0.00 90.00 0.00 67.16 0.00 10.11 0.00 0.46 104.00 0.00 8.93 39.30
08/03/23 15:06:51
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 48.00 0.00 0.38 0.00 0.00 0.00 0.00 0.00 0.12 0.00 0.01 8.08 0.00 0.12 0.60
sdh 48.00 0.00 4.88 0.00 93.00 0.00 65.96 0.00 10.04 0.00 0.48 104.00 0.00 8.50 40.80
08/03/23 15:06:52
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 48.00 0.00 0.38 0.00 0.00 0.00 0.00 0.00 0.27 0.00 0.01 8.17 0.00 0.27 1.30
sdh 49.00 0.00 4.98 0.00 92.00 0.00 65.25 0.00 8.78 0.00 0.42 104.00 0.00 7.67 37.60
08/03/23 15:06:53
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 50.00 0.00 0.40 0.00 0.00 0.00 0.00 0.00 0.10 0.00 0.01 8.16 0.00 0.10 0.50
sdh 51.00 0.00 5.18 0.00 99.00 0.00 66.00 0.00 10.51 0.00 0.53 104.00 0.00 8.31 42.40
test-case-2: two-rados-bench : Disk utilization 70%~80%
iostat -xtm 1 -d /dev/sdag /dev/sdh
08/03/23 15:07:16
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 97.00 0.00 0.76 0.00 0.00 0.00 0.00 0.00 0.19 0.00 0.02 8.04 0.00 0.19 1.80
sdh 99.00 0.00 10.05 0.00 191.00 0.00 65.86 0.00 11.63 0.00 1.13 104.00 0.00 7.13 70.60
08/03/23 15:07:17
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 98.02 0.00 0.78 0.00 0.00 0.00 0.00 0.00 0.13 0.00 0.01 8.16 0.00 0.13 1.29
sdh 97.03 0.00 9.85 0.00 191.09 0.00 66.32 0.00 12.73 0.00 1.24 104.00 0.00 8.12 78.81
08/03/23 15:07:18
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 100.00 0.00 0.80 0.00 0.00 0.00 0.00 0.00 0.20 0.00 0.02 8.16 0.00 0.20 2.00
sdh 101.00 0.00 10.26 0.00 195.00 0.00 65.88 0.00 12.27 0.00 1.23 104.00 0.00 7.54 76.20
08/03/23 15:07:19
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 102.00 0.00 0.82 0.00 0.00 0.00 0.00 0.00 0.22 0.00 0.02 8.24 0.00 0.22 2.20
sdh 100.00 0.00 10.16 0.00 200.00 0.00 66.67 0.00 13.50 0.00 1.37 104.00 0.00 8.15 81.50
08/03/23 15:07:20
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 93.00 0.00 0.74 0.00 0.00 0.00 0.00 0.00 0.14 0.00 0.01 8.13 0.00 0.13 1.20
sdh 95.00 0.00 9.65 0.00 183.00 0.00 65.83 0.00 13.46 0.00 1.26 104.00 0.00 7.98 75.80
08/03/23 15:07:21
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 95.00 0.00 0.76 0.00 0.00 0.00 0.00 0.00 0.13 0.00 0.01 8.21 0.00 0.12 1.10
sdh 96.00 0.00 9.75 0.00 187.00 0.00 66.08 0.00 13.05 0.00 1.25 104.00 0.00 8.19 78.60
08/03/23 15:07:22
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 101.00 0.00 0.82 0.00 0.00 0.00 0.00 0.00 0.17 0.00 0.02 8.28 0.00 0.17 1.70
sdh 100.00 0.00 10.16 0.00 193.00 0.00 65.87 0.00 10.55 0.00 1.06 104.00 0.00 7.90 79.00
test-case-3: three-rados-bench : Disk utilization 95%~99%
iostat -xtm 1 -d /dev/sdag /dev/sdh
08/03/23 15:08:03
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 127.00 0.00 1.01 0.00 0.00 0.00 0.00 0.00 0.18 0.00 0.02 8.16 0.00 0.17 2.20
sdh 126.00 0.00 12.80 0.00 248.00 0.00 66.31 0.00 16.83 0.00 2.14 104.00 0.00 7.52 94.70
08/03/23 15:08:04
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 127.00 0.00 1.01 0.00 0.00 0.00 0.00 0.00 0.17 0.00 0.02 8.16 0.00 0.17 2.20
sdh 130.00 0.00 13.20 0.00 250.00 0.00 65.79 0.00 17.62 0.00 2.29 104.00 0.00 7.58 98.50
08/03/23 15:08:05
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 135.00 0.00 1.08 0.00 0.00 0.00 0.00 0.00 0.19 0.00 0.02 8.18 0.00 0.18 2.40
sdh 136.00 0.00 13.81 0.00 261.00 0.00 65.74 0.00 14.79 0.00 2.00 104.00 0.00 7.08 96.30
08/03/23 15:08:06
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 144.00 0.00 1.14 0.00 0.00 0.00 0.00 0.00 0.22 0.00 0.03 8.08 0.00 0.22 3.10
sdh 144.00 0.00 14.62 0.00 280.00 0.00 66.04 0.00 17.28 0.00 2.49 104.00 0.00 6.83 98.40
08/03/23 15:08:07
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 136.00 0.00 1.07 0.00 0.00 0.00 0.00 0.00 0.15 0.00 0.02 8.06 0.00 0.15 2.00
sdh 137.00 0.00 13.91 0.00 269.00 0.00 66.26 0.00 17.08 0.00 2.35 104.00 0.00 7.22 98.90
test-case-4: four-rados-bench : Disk utilization 99%~100%
iostat -xtm 1 -d /dev/sdag /dev/sdh
08/03/23 15:08:55
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 140.00 0.00 1.11 0.00 0.00 0.00 0.00 0.00 0.24 0.00 0.03 8.09 0.00 0.24 3.30
sdh 145.00 0.00 14.73 0.00 279.00 0.00 65.80 0.00 20.74 0.00 3.00 104.00 0.00 6.88 99.80
08/03/23 15:08:56
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 143.00 0.00 1.14 0.00 0.00 0.00 0.00 0.00 0.16 0.00 0.02 8.17 0.00 0.16 2.30
sdh 146.00 0.00 14.83 0.00 285.00 0.00 66.13 0.00 21.55 0.00 3.11 104.00 0.00 6.84 99.80
08/03/23 15:08:57
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 151.00 0.00 1.20 0.00 0.00 0.00 0.00 0.00 0.17 0.00 0.03 8.11 0.00 0.18 2.70
sdh 153.00 0.00 15.54 0.00 296.00 0.00 65.92 0.00 20.97 0.00 3.24 104.00 0.00 6.54 100.10
08/03/23 15:08:58
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 155.00 0.00 1.22 0.00 0.00 0.00 0.00 0.00 0.17 0.00 0.03 8.05 0.00 0.17 2.60
sdh 160.00 0.00 16.25 0.00 310.00 0.00 65.96 0.00 20.99 0.00 3.33 104.00 0.00 6.24 99.90
08/03/23 15:08:59
Device r/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdag 151.00 0.00 1.20 0.00 0.00 0.00 0.00 0.00 0.22 0.00 0.03 8.11 0.00 0.22 3.30
sdh 154.00 0.00 15.64 0.00 298.00 0.00 65.93 0.00 21.37 0.00 3.27 104.00 0.00 6.49 100.00
Actions