Bug #63520
openthe usage of osd_pg_stat_report_interval_max is not uniform
0%
Description
Scenario 1: osd_pg_stat_report_interval_max is in epoch units
boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
{
... ...
// if we haven't reported our PG stats in a long time, do so now.
if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max //500 epoch
< advmap.osdmap->get_epoch()) {
psdout(20) << "reporting stats to osd after "
<< (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch) << " epochs" << dendl;
need_publish = true;
}
if (need_publish)
pl->publish_stats_to_osd();
... ...
}
Scenario 2: osd_pg_stat_report_interval_max in seconds
std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
bool pg_stats_publish_valid,
const pg_stat_t &pg_stats_publish,
const object_stat_collection_t &unstable_stats)
{
... ...
utime_t cutoff = now;
cutoff -= cct->_conf->osd_pg_stat_report_interval_max; //500
// 500s, report pg stats for any given pg at least this often
if (pg_stats_publish_valid && pre_publish == pg_stats_publish && info.stats.last_fresh > cutoff) {
psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
<< ": no change since " << info.stats.last_fresh << dendl;
return std::nullopt;
} else {
// update our stat summary and timestamps
info.stats.reported_epoch = get_osdmap_epoch();
++info.stats.reported_seq;
info.stats.last_fresh = now;
if (info.stats.state & PG_STATE_CLEAN)
info.stats.last_clean = now;
if (info.stats.state & PG_STATE_ACTIVE)
info.stats.last_active = now;
if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
info.stats.last_peered = now;
info.stats.last_unstale = now;
if ((info.stats.state & PG_STATE_DEGRADED) == 0)
info.stats.last_undegraded = now;
if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
info.stats.last_fullsized = now;
psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
<< ":" << pg_stats_publish.reported_seq << dendl;
return std::make_optional(std::move(pre_publish));
}
}
And prepare_stats_for_publish is not called periodically according to osd_pg_stat_report_interval_max to update info.stats.reported_epoch = get_osdmap_epoch(),
This will affect the update of pg's last_epoch_clean, which will in turn affect mon's acquisition of min_last_epoch_clean, ultimately causing osdmap to not be trimmed as expected.
epoch_t pg_stat_t::get_effective_last_epoch_clean() const
{
if (state & PG_STATE_CLEAN) {
// we are clean as of this report, and should thus take the
// reported epoch
return reported_epoch;
} else {
return last_epoch_clean;
}
}
version_t OSDMonitor::get_trim_to() const
{
if (mon.get_quorum().empty()) {
dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
return 0;
}
{
std::lock_guard<std::mutex> l(creating_pgs_lock);
if (!creating_pgs.pgs.empty()) {
dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
return 0;
}
}
if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) { //false
dout(0) << __func__
<< " blocking osdmap trim"
<< " ('mon_debug_block_osdmap_trim' set to 'true')"
<< " trim_to = 0" << dendl;
return 0;
}
{
epoch_t floor = get_min_last_epoch_clean();
dout(10) << " min_last_epoch_clean " << floor << dendl;
if (g_conf()->mon_osd_force_trim_to > 0 && // = 0
g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
floor = g_conf()->mon_osd_force_trim_to;
dout(10) << __func__ << " explicit mon_osd_force_trim_to = " << floor << dendl;
}
unsigned min = g_conf()->mon_min_osdmap_epochs; //500
if (floor + min > get_last_committed()) {
if (min < get_last_committed())
floor = get_last_committed() - min;
else
floor = 0;
}
if (floor > get_first_committed()) {
dout(10) << __func__ << " trim_to = " << floor << dendl;
return floor;
}
}
dout(10) << __func__ << " trim_to = 0" << dendl;
return 0;
}
epoch_t OSDMonitor::get_min_last_epoch_clean() const
{
auto floor = last_epoch_clean.get_lower_bound(osdmap);
// also scan osd epochs
// don't trim past the oldest reported osd epoch
for (auto [osd, epoch] : osd_epochs) {
if (epoch < floor) {
floor = epoch;
}
}
return floor;
}
Updated by jianwei zhang 6 months ago
Disadvantages of non-uniform units: After 500 osdmaps are reached, it may not be reached within 500 seconds.
Updated by jianwei zhang 6 months ago
solution:
main idea:
1. Compatible with the judgment of epoch and seconds at the same time
2. Reason 1: 500 epoch osdmaps are also the minimum number of osdmaps that mon must retain. When PG Active State receives advmap, if info.stats.reported_epoch exceeds 500 osdmaps, it can be considered that the one reported by pgid is too old and needs to be updated. to facilitate mon trim osdmaps
3. Reason 2: When there is no change in osdmap, update info.stats.reported_epoch by the number of seconds.
Updated by Radoslaw Zarzynski 6 months ago
- Status changed from New to Fix Under Review