Actions
Bug #59291
openpg_pool_t version compatibility issue
% Done:
0%
Source:
Tags:
Backport:
Regression:
No
Severity:
1 - critical
Reviewed:
04/03/2023
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):
Description
How is pg_pool_t version forward compatible? For example, if I want to add a new field, how should I modify it?
!is_stretch_pool() condition is met, it will be reduced from v=30 to v=29
Why does the version go backwards? @Greg Farnum Farnum <gfarnum@redhat.com>
For example, if I want to add a new field XXX, how should I design it for version compatibility?
hope someone can help me, thank you!
/*
* pg_pool
*/
struct pg_pool_t {
...
utime_t create_time;
uint64_t flags = 0; ///< FLAG_*
__u8 type = 0; ///< TYPE_*
__u8 size = 0, min_size = 0; ///< number of osds in each pg
__u8 crush_rule = 0; ///< crush placement rule
__u8 object_hash = 0; ///< hash mapping object name to ps
pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;
...
};
void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
using ceph::encode;
if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
// this encoding matches the old struct ceph_pg_pool
__u8 struct_v = 2;
encode(struct_v, bl);
encode(type, bl);
encode(size, bl);
encode(crush_rule, bl);
encode(object_hash, bl);
encode(pg_num, bl);
encode(pgp_num, bl);
__u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
encode(lpg_num, bl);
encode(lpgp_num, bl);
encode(last_change, bl);
encode(snap_seq, bl);
encode(snap_epoch, bl);
__u32 n = snaps.size();
encode(n, bl);
n = removed_snaps.num_intervals();
encode(n, bl);
encode(auid, bl);
encode_nohead(snaps, bl, features);
encode_nohead(removed_snaps, bl);
return;
}
if ((features & CEPH_FEATURE_OSDENC) == 0) {
__u8 struct_v = 4;
encode(struct_v, bl);
encode(type, bl);
encode(size, bl);
encode(crush_rule, bl);
encode(object_hash, bl);
encode(pg_num, bl);
encode(pgp_num, bl);
__u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
encode(lpg_num, bl);
encode(lpgp_num, bl);
encode(last_change, bl);
encode(snap_seq, bl);
encode(snap_epoch, bl);
encode(snaps, bl, features);
encode(removed_snaps, bl);
encode(auid, bl);
encode(flags, bl);
encode((uint32_t)0, bl); // crash_replay_interval
return;
}
if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
// we simply added last_force_op_resend here, which is a fully
// backward compatible change. however, encoding the same map
// differently between monitors triggers scrub noise (even though
// they are decodable without the feature), so let's be pendantic
// about it.
ENCODE_START(14, 5, bl);
encode(type, bl);
encode(size, bl);
encode(crush_rule, bl);
encode(object_hash, bl);
encode(pg_num, bl);
encode(pgp_num, bl);
__u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
encode(lpg_num, bl);
encode(lpgp_num, bl);
encode(last_change, bl);
encode(snap_seq, bl);
encode(snap_epoch, bl);
encode(snaps, bl, features);
encode(removed_snaps, bl);
encode(auid, bl);
encode(flags, bl);
encode((uint32_t)0, bl); // crash_replay_interval
encode(min_size, bl);
encode(quota_max_bytes, bl);
encode(quota_max_objects, bl);
encode(tiers, bl);
encode(tier_of, bl);
__u8 c = cache_mode;
encode(c, bl);
encode(read_tier, bl);
encode(write_tier, bl);
encode(properties, bl);
encode(hit_set_params, bl);
encode(hit_set_period, bl);
encode(hit_set_count, bl);
encode(stripe_width, bl);
encode(target_max_bytes, bl);
encode(target_max_objects, bl);
encode(cache_target_dirty_ratio_micro, bl);
encode(cache_target_full_ratio_micro, bl);
encode(cache_min_flush_age, bl);
encode(cache_min_evict_age, bl);
encode(erasure_code_profile, bl);
ENCODE_FINISH(bl);
return;
}
uint8_t v = 30;
// NOTE: any new encoding dependencies must be reflected by
// SIGNIFICANT_FEATURES
if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
// this was the first post-hammer thing we added; if it's missing, encode
// like hammer.
v = 21;
} else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
v = 24;
} else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
v = 26;
} else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
v = 27;
} else if (!is_stretch_pool()) {
v = 29;
}
ENCODE_START(v, 5, bl);
encode(type, bl);
encode(size, bl);
encode(crush_rule, bl);
encode(object_hash, bl);
encode(pg_num, bl);
encode(pgp_num, bl);
__u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
encode(lpg_num, bl);
encode(lpgp_num, bl);
encode(last_change, bl);
encode(snap_seq, bl);
encode(snap_epoch, bl);
encode(snaps, bl, features);
encode(removed_snaps, bl);
encode(auid, bl);
if (v >= 27) {
encode(flags, bl);
} else {
auto tmp = flags;
tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
encode(tmp, bl);
}
encode((uint32_t)0, bl); // crash_replay_interval
encode(min_size, bl);
encode(quota_max_bytes, bl);
encode(quota_max_objects, bl);
encode(tiers, bl);
encode(tier_of, bl);
__u8 c = cache_mode;
encode(c, bl);
encode(read_tier, bl);
encode(write_tier, bl);
encode(properties, bl);
encode(hit_set_params, bl);
encode(hit_set_period, bl);
encode(hit_set_count, bl);
encode(stripe_width, bl);
encode(target_max_bytes, bl);
encode(target_max_objects, bl);
encode(cache_target_dirty_ratio_micro, bl);
encode(cache_target_full_ratio_micro, bl);
encode(cache_min_flush_age, bl);
encode(cache_min_evict_age, bl);
encode(erasure_code_profile, bl);
encode(last_force_op_resend_preluminous, bl);
encode(min_read_recency_for_promote, bl);
encode(expected_num_objects, bl);
if (v >= 19) {
encode(cache_target_dirty_high_ratio_micro, bl);
}
if (v >= 20) {
encode(min_write_recency_for_promote, bl);
}
if (v >= 21) {
encode(use_gmt_hitset, bl);
}
if (v >= 22) {
encode(fast_read, bl);
}
if (v >= 23) {
encode(hit_set_grade_decay_rate, bl);
encode(hit_set_search_last_n, bl);
}
if (v >= 24) {
encode(opts, bl, features);
}
if (v >= 25) {
encode(last_force_op_resend_prenautilus, bl);
}
if (v >= 26) {
encode(application_metadata, bl);
}
if (v >= 27) {
encode(create_time, bl);
}
if (v >= 28) {
encode(pg_num_target, bl);
encode(pgp_num_target, bl);
encode(pg_num_pending, bl);
encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01]
encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01]
encode(last_force_op_resend, bl);
encode(pg_autoscale_mode, bl);
}
if (v >= 29) {
encode(last_pg_merge_meta, bl);
}
if (v >= 30) {
encode(peering_crush_bucket_count, bl);
encode(peering_crush_bucket_target, bl);
encode(peering_crush_bucket_barrier, bl);
encode(peering_crush_mandatory_member, bl);
}
ENCODE_FINISH(bl);
}
void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
{
DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
decode(type, bl);
decode(size, bl);
decode(crush_rule, bl);
decode(object_hash, bl);
decode(pg_num, bl);
decode(pgp_num, bl);
{
__u32 lpg_num, lpgp_num;
decode(lpg_num, bl);
decode(lpgp_num, bl);
}
decode(last_change, bl);
decode(snap_seq, bl);
decode(snap_epoch, bl);
if (struct_v >= 3) {
decode(snaps, bl);
decode(removed_snaps, bl);
decode(auid, bl);
} else {
__u32 n, m;
decode(n, bl);
decode(m, bl);
decode(auid, bl);
decode_nohead(n, snaps, bl);
decode_nohead(m, removed_snaps, bl);
}
if (struct_v >= 4) {
decode(flags, bl);
uint32_t crash_replay_interval;
decode(crash_replay_interval, bl);
} else {
flags = 0;
}
// upgrade path for selfmanaged vs pool snaps
if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
if (!removed_snaps.empty()) {
flags |= FLAG_SELFMANAGED_SNAPS;
} else {
flags |= FLAG_POOL_SNAPS;
}
}
if (struct_v >= 7) {
decode(min_size, bl);
} else {
min_size = size - size/2;
}
if (struct_v >= 8) {
decode(quota_max_bytes, bl);
decode(quota_max_objects, bl);
}
if (struct_v >= 9) {
decode(tiers, bl);
decode(tier_of, bl);
__u8 v;
decode(v, bl);
cache_mode = (cache_mode_t)v;
decode(read_tier, bl);
decode(write_tier, bl);
}
if (struct_v >= 10) {
decode(properties, bl);
}
if (struct_v >= 11) {
decode(hit_set_params, bl);
decode(hit_set_period, bl);
decode(hit_set_count, bl);
} else {
pg_pool_t def;
hit_set_period = def.hit_set_period;
hit_set_count = def.hit_set_count;
}
if (struct_v >= 12) {
decode(stripe_width, bl);
} else {
set_stripe_width(0);
}
if (struct_v >= 13) {
decode(target_max_bytes, bl);
decode(target_max_objects, bl);
decode(cache_target_dirty_ratio_micro, bl);
decode(cache_target_full_ratio_micro, bl);
decode(cache_min_flush_age, bl);
decode(cache_min_evict_age, bl);
} else {
target_max_bytes = 0;
target_max_objects = 0;
cache_target_dirty_ratio_micro = 0;
cache_target_full_ratio_micro = 0;
cache_min_flush_age = 0;
cache_min_evict_age = 0;
}
if (struct_v >= 14) {
decode(erasure_code_profile, bl);
}
if (struct_v >= 15) {
decode(last_force_op_resend_preluminous, bl);
} else {
last_force_op_resend_preluminous = 0;
}
if (struct_v >= 16) {
decode(min_read_recency_for_promote, bl);
} else {
min_read_recency_for_promote = 1;
}
if (struct_v >= 17) {
decode(expected_num_objects, bl);
} else {
expected_num_objects = 0;
}
if (struct_v >= 19) {
decode(cache_target_dirty_high_ratio_micro, bl);
} else {
cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
}
if (struct_v >= 20) {
decode(min_write_recency_for_promote, bl);
} else {
min_write_recency_for_promote = 1;
}
if (struct_v >= 21) {
decode(use_gmt_hitset, bl);
} else {
use_gmt_hitset = false;
}
if (struct_v >= 22) {
decode(fast_read, bl);
} else {
fast_read = false;
}
if (struct_v >= 23) {
decode(hit_set_grade_decay_rate, bl);
decode(hit_set_search_last_n, bl);
} else {
hit_set_grade_decay_rate = 0;
hit_set_search_last_n = 1;
}
if (struct_v >= 24) {
decode(opts, bl);
}
if (struct_v >= 25) {
decode(last_force_op_resend_prenautilus, bl);
} else {
last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
}
if (struct_v >= 26) {
decode(application_metadata, bl);
}
if (struct_v >= 27) {
decode(create_time, bl);
}
if (struct_v >= 28) {
decode(pg_num_target, bl);
decode(pgp_num_target, bl);
decode(pg_num_pending, bl);
epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
decode(old_merge_last_epoch_started, bl);
decode(old_merge_last_epoch_clean, bl);
decode(last_force_op_resend, bl);
decode(pg_autoscale_mode, bl);
if (struct_v >= 29) {
decode(last_pg_merge_meta, bl);
} else {
last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
}
} else {
pg_num_target = pg_num;
pgp_num_target = pgp_num;
pg_num_pending = pg_num;
last_force_op_resend = last_force_op_resend_prenautilus;
pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade
}
if (struct_v >= 30) {
decode(peering_crush_bucket_count, bl);
decode(peering_crush_bucket_target, bl);
decode(peering_crush_bucket_barrier, bl);
decode(peering_crush_mandatory_member, bl);
}
DECODE_FINISH(bl);
calc_pg_masks();
calc_grade_table();
}
commit ddfda9274c45e9dce1b2b3a58cac91fba488419b
Author: Greg Farnum <gfarnum@redhat.com>
Date: Tue Jul 21 17:04:32 2020 +0000
Date: Tue Jul 21 17:04:32 2020 +0000
osd: conditionally encode stretch state in pg_pool_t, do not require new compat
This struct gets sent out to clients as well as OSDs, and we need them
to be able to decode it without crashing/failing. So we can't require it,
and happily the OSDs which might act on it will be gated by the OSDMap.
Additionally, we don't want older servers to fail OSDMap crc checks, so don't
encode the stretch data members if they aren't in use.
Signed-off-by: Greg Farnum <gfarnum@redhat.com>
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 198fe89d78..a6cd635e9a 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1942,9 +1942,10 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
v = 26;
} else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
v = 27;
+ } else if (!is_stretch_pool()) {
+ v = 29;
}
- uint8_t new_compat = 0;
ENCODE_START(v, 5, bl);
encode(type, bl);
encode(size, bl);
@@ -2033,20 +2034,13 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
if (v >= 29) {
encode(last_pg_merge_meta, bl);
}
- if (peering_crush_bucket_barrier != 0 ||
- peering_crush_bucket_target != 0 ||
- peering_crush_bucket_count !=0 ||
- peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
- ceph_assert(v >= 30);
- new_compat = 30;
- }
if (v >= 30) {
encode(peering_crush_bucket_count, bl);
encode(peering_crush_bucket_target, bl);
encode(peering_crush_bucket_barrier, bl);
encode(peering_crush_mandatory_member, bl);
}
- ENCODE_FINISH_NEW_COMPAT(bl, new_compat);
+ ENCODE_FINISH(bl);
}
void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
Actions