Feature #63889
Updated by Matan Breizman 5 months ago
<pre><code class="text">
The OSDMap trimming lower bound originates from the monitor's `osdmap_first_committed` epoch.
This value is shared with the OSDs through OSDMap messages (MOSDMap).
In a scenario with no OSDMap-related activity, the monitor may advance its `osdmap_first_committed` epoch without informing the OSDs.
As a result, OSDSuperblock::cluster_osdmap_trim_lower_bound (the mon's `osdmap_first_committed`)
remains stale until the next MOSDMap is shared by the monitor. Since the trim lower bound is stale, trim_maps calls won't actually trim any maps.
To resolve this, we can enhance the monitor to share an MOSDMap carrying the new cluster_osdmap_trim_lower_bound
regardless of any other MOSDMap activity.
</code></pre>
ref-PR: osd: fix scheduling of OSD::trim_maps is not timely (https://github.com/ceph/ceph/pull/54686)
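As a rough illustration of the approach described above (not the code from the ref-PR; every name below is an invented mock), the monitor would push a map-less MOSDMap to its osdmap subscribers whenever its own trimming advances the lower bound:
<pre><code class="cpp">
#include <cstdint>
#include <vector>

using epoch_t = uint32_t;

// Mock of the message; the real MOSDMap lives in src/messages/MOSDMap.h.
struct MOSDMap {
  epoch_t cluster_osdmap_trim_lower_bound = 0;  // mon's osdmap_first_committed
  epoch_t newest_map = 0;                       // mon's osdmap_last_committed
  // Note: no full/incremental maps attached; this message only refreshes the bound.
};

// Mock connection; stands in for the subscriber session's Connection.
struct Connection { void send_message(const MOSDMap&) { /* wire send */ } };

// Hypothetical hook (invented name): called after the monitor trims its own
// osdmap range, so OSDs learn the new lower bound even when no new epochs
// were committed since the last MOSDMap they received.
void notify_subscribers_of_trim(std::vector<Connection*>& osdmap_subscribers,
                                epoch_t first_committed,
                                epoch_t last_committed) {
  MOSDMap m;
  m.cluster_osdmap_trim_lower_bound = first_committed;
  m.newest_map = last_committed;
  for (auto* con : osdmap_subscribers)
    con->send_message(m);
}
</code></pre>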
Problem Description
1. `ceph -s` reports HEALTH_OK, and all PGs are active+clean
2. `ceph report | grep osdmap_` shows the latest osdmap epoch is 25307 and the oldest is 24807, a difference of 500 epochs
3. `ceph tell osd.0 status` shows:
- oldest_map is 21594
- superblock.cluster_osdmap_trim_lower_bound = 21594; it should be 24807, not as expected
- newest_map is 25307
- so the OSD still holds `3714` osdmaps (epochs 21594..25307), far more than expected
4. `ceph -s --debug_ms=1 > aa 2>&1` shows that the osd_map message from the mon carries cluster_osdmap_trim_lower_bound = 24807
<pre>
[root@zjw-cmain-dev build]# ceph -s
  cluster:
    id:     713551e4-fec5-453b-ac9e-ae0716d235cb
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum a (age 5d)
    mgr: x(active, since 5d)
    osd: 2 osds: 2 up (since 2d), 2 in (since 3d)
         flags noout,nobackfill,norebalance,norecover,noscrub,nodeep-scrub

  data:
    pools:   2 pools, 256 pgs
    objects: 10.26k objects, 10 GiB
    usage:   10 GiB used, 9.2 TiB / 9.2 TiB avail
    pgs:     256 active+clean
[root@zjw-cmain-dev build]# ceph osd dump | head
epoch 25307
[root@zjw-cmain-dev build]# ceph report | grep osdmap_
report 974124877
"osdmap_clean_epochs": {
"osdmap_first_committed": 24807,
"osdmap_last_committed": 25307,
25307 - 24807 = 500 epoch osdmap
[root@zjw-cmain-dev build]# ceph report | jq .osdmap_clean_epochs
report 1902116976
{
  "min_last_epoch_clean": 25306,
  "last_epoch_clean": {
    "per_pool": [
      {
        "poolid": 1,
        "floor": 25307
      },
      {
        "poolid": 2,
        "floor": 25306
      }
    ]
  },
  "osd_epochs": [
    {
      "id": 0,
      "epoch": 25307
    },
    {
      "id": 1,
      "epoch": 25307
    }
  ]
}
</pre>
osd.0.log:
<pre>
[root@zjw-cmain-dev build]# tail -n 1000000 out/osd.0.log |grep trim_maps | tail -n 10
2023-12-25T15:23:02.055+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:24:04.020+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:25:05.060+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:26:06.941+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:27:08.033+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:28:09.170+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:29:11.127+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:30:12.387+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:31:14.312+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
2023-12-25T15:32:16.066+0800 7f9d5dfa3700 5 osd.0 25307 trim_maps: min=21594 oldest_map=21594 superblock.cluster_osdmap_trim_lower_bound=21594 service.map_cache.cached_key_lower_bound=25258
</pre>
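Each of those log lines already encodes why nothing gets trimmed. A minimal model of the decision (simplified, not the verbatim OSD::trim_maps code), plugging in the values from the log:
<pre><code class="cpp">
#include <algorithm>
#include <cstdint>
#include <iostream>

using epoch_t = uint32_t;

int main() {
  // Values taken from the trim_maps log lines above.
  epoch_t oldest_map = 21594;  // first epoch the OSD still stores
  epoch_t trim_lb    = 21594;  // superblock.cluster_osdmap_trim_lower_bound (stale)
  epoch_t cache_lb   = 25258;  // service.map_cache.cached_key_lower_bound
  // Simplified model: the OSD may only trim epochs below both bounds.
  epoch_t trim_to = std::min(trim_lb, cache_lb);
  std::cout << "removable epochs: [" << oldest_map << ", " << trim_to << ") = "
            << (trim_to - oldest_map) << " maps\n";
  // Prints 0: every trim_maps call is a no-op until a new MOSDMap raises trim_lb.
}
</code></pre>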
<pre>
[root@zjw-cmain-dev build]# ceph tell osd.0 status
{
    "cluster_fsid": "713551e4-fec5-453b-ac9e-ae0716d235cb",
    "osd_fsid": "bc068df8-0acb-4164-8262-3813b40ce736",
    "whoami": 0,
    "state": "active",
    "maps": "[21594~3714]",
    "cluster_osdmap_trim_lower_bound": 21594,
    "num_pgs": 128
}
[root@zjw-cmain-dev build]# ceph tell osd.1 status
{
    "cluster_fsid": "713551e4-fec5-453b-ac9e-ae0716d235cb",
    "osd_fsid": "a62eb046-59cb-40f7-bf63-82c228ef5e30",
    "whoami": 1,
    "state": "active",
    "maps": "[21594~3714]",
    "cluster_osdmap_trim_lower_bound": 21594,
    "num_pgs": 128
}
</pre>
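The "maps" field uses interval_set-style `start~length` notation, so both OSDs still hold every epoch from 21594 through the newest map. A quick check of the arithmetic:
<pre><code class="cpp">
#include <cstdint>
#include <iostream>

int main() {
  // "maps": "[21594~3714]" means 3714 consecutive epochs starting at 21594.
  uint32_t first = 21594, count = 3714;
  uint32_t last = first + count - 1;
  std::cout << last << "\n";  // 25307, matching newest_map and the osd dump epoch
}
</code></pre>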
<pre>
[root@zjw-cmain-dev build]# ceph -s --debug_ms=1 > aa 2>&1
[root@zjw-cmain-dev build]# grep osdmap aa
2023-12-25T16:10:19.628+0800 7ffb2a3fc700 1 -- 127.0.0.1:0/810641284 --> v2:127.0.0.1:40815/0 -- mon_subscribe({osdmap=0}) v3 -- 0x7ffb24111330 con 0x7ffb240fef90
[root@zjw-cmain-dev build]# grep osd_map aa
2023-12-25T16:10:19.630+0800 7ffb20ff9700 1 -- 127.0.0.1:0/810641284 <== mon.0 v2:127.0.0.1:40815/0 5 ==== osd_map(25307..25307 src has 24807..25307) v4 ==== 2651+0+0 (crc 0 0 0) 0x7ffb14032bb0 con 0x7ffb240fef90
The reply is built via this call path on the monitor:
Monitor::handle_subscribe
  -> OSDMonitor::check_osdmap_sub
       sub->session->con->send_message(build_latest_full(sub->session->con_features));
  -> OSDMonitor::build_latest_full
// from src/messages/MOSDMap.h:
class MOSDMap final : public Message {
  void print(std::ostream& out) const override {
    out << "osd_map(" << get_first() << ".." << get_last();
    if (cluster_osdmap_trim_lower_bound || newest_map)
      out << " src has " << cluster_osdmap_trim_lower_bound
          << ".." << newest_map;
    out << ")";
  }
};
So in the captured message above: cluster_osdmap_trim_lower_bound = 24807, newest_map = 25307.
</pre>
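In other words, the monitor already advertises the correct bound (24807) to a freshly subscribed client like `ceph -s`; the running OSDs are simply never sent such a message. On the consumer side, the bound only moves when an MOSDMap arrives. A simplified sketch with mock types (not the verbatim OSD::handle_osd_map):
<pre><code class="cpp">
#include <cstdint>

using epoch_t = uint32_t;

// Minimal mocks for illustration; the real definitions live in
// src/messages/MOSDMap.h and the OSD superblock type.
struct MOSDMap       { epoch_t cluster_osdmap_trim_lower_bound = 0; };
struct OSDSuperblock { epoch_t cluster_osdmap_trim_lower_bound = 0; };

OSDSuperblock superblock;

// Receiving an MOSDMap is the only way the OSD learns a newer bound;
// with no OSDMap traffic, this never runs and trim_maps stays a no-op.
void handle_osd_map(const MOSDMap& m) {
  if (m.cluster_osdmap_trim_lower_bound >
      superblock.cluster_osdmap_trim_lower_bound) {
    superblock.cluster_osdmap_trim_lower_bound =
        m.cluster_osdmap_trim_lower_bound;  // persisted; later read by trim_maps
  }
  // ... apply any attached full/incremental maps ...
}

int main() {
  MOSDMap m;
  m.cluster_osdmap_trim_lower_bound = 24807;  // as advertised by the mon above
  handle_osd_map(m);
  // superblock.cluster_osdmap_trim_lower_bound is now 24807; trimming can proceed.
}
</code></pre>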
Reproduce
<pre>
1. # cat start-cluster.sh
#!/bin/sh
set -x
rm -rf dev ceph.conf out
MON=1 MGR=1 OSD=1 FS=0 MDS=0 RGW=0 ../src/vstart.sh -n -l -X -b --msgr2 \
--bluestore-devs /dev/disk/by-id/ata-ST10000NM0196-2AA131_ZA29A049 \
--bluestore-db-devs /dev/disk/by-id/ata-INTEL_SSDSC2KG480G8_BTYG909204YN480BGN-part3 \
--bluestore-wal-devs /dev/disk/by-id/ata-INTEL_SSDSC2KG480G8_BTYG909204YN480BGN-part1
2. # ../src/vstart.sh --inc-osd
3. # crush tree: place each OSD in its own root
osd.0 in root default-root-0
osd.1 in root default-root-1
4. # create pools
create test-pool-0 with root default-root-0 (single replica)
create test-pool-1 with root default-root-1 (single replica)
5. # kill -9 <osd.0.pid>
6. # change the osdmap repeatedly while osd.0 is down
# each rm/set pair commits new osdmap epochs that osd.0 misses
for i in {1..10000}; do
    ceph osd pool application rm "test-pool-1" rgw "abc"
    ceph osd pool application set "test-pool-1" rgw "abc" "efg"
    ceph osd dump | head -n 1
done
7. # record osdmap info
ceph osd dump | head
ceph report | grep osdmap
ceph report | jq .osdmap_clean_epochs
ceph tell osd.1 status
8. # start osd.0
init-ceph start osd.0
9. # record osdmap info
ceph tell osd.0 status
ceph tell osd.1 status
ceph osd dump | head
ceph report | grep osdmap
ceph report | jq .osdmap_clean_epochs
10. # ceph daemon osd.0 config set debug_osd 20
ceph daemon osd.1 config set debug_osd 20
ceph daemon osd.0 config set osd_map_trim_min_interval 60
ceph daemon osd.1 config set osd_map_trim_min_interval 60
11. # tail -f out/osd.0.log | grep trim_maps
</pre>