Project

General

Profile

Actions

Bug #59291

open

pg_pool_t version compatibility issue

Added by jianwei zhang about 1 year ago. Updated 8 months ago.

Status:
New
Priority:
High
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
1 - critical
Reviewed:
04/03/2023
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

How is the pg_pool_t encoding kept forward compatible across versions? For example, if I want to add a new field, how should I modify the code?

When the !is_stretch_pool() condition is met, the encoded struct version is reduced from v=30 to v=29.

Why does the version go backwards? @Greg Farnum

For example, if I want to add a new field XXX, how should I design it for version compatibility?

hope someone can help me, thank you!

/*
 * pg_pool
 *
 * Pool metadata record (excerpt quoted in this report; the "..." lines
 * mark members elided by the reporter).  Instances are serialized by
 * pg_pool_t::encode()/decode() below using a feature-gated struct
 * version.
 */
struct pg_pool_t {
  ...
  utime_t create_time;          ///< time pool was created (encoded at v>=27)
  uint64_t flags = 0;           ///< FLAG_*
  __u8 type = 0;                ///< TYPE_*
  __u8 size = 0, min_size = 0;  ///< number of osds in each pg
  __u8 crush_rule = 0;          ///< crush placement rule
  __u8 object_hash = 0;         ///< hash mapping object name to ps
  pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;
  ...
};

// Serialize this pool into 'bl'.  The encoded struct version is chosen
// from the peer's feature bits: a peer lacking a feature gets the older
// (shorter) encoding it knows how to decode.  NOTE(review): for a given
// feature set the byte stream must be identical on every monitor, or
// OSDMap crc/scrub checks will report spurious mismatches — that is why
// the version is selected up front rather than always encoding the
// newest layout (see commit ddfda9274c45 quoted below in this report).
void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
    // this encoding matches the old struct ceph_pg_pool
    __u8 struct_v = 2;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);

    // v2 layout: explicit element counts first, then auid, then the
    // headless snap containers (decode()'s struct_v < 3 path mirrors this).
    __u32 n = snaps.size();
    encode(n, bl);
    n = removed_snaps.num_intervals();
    encode(n, bl);

    encode(auid, bl);

    encode_nohead(snaps, bl, features);
    encode_nohead(removed_snaps, bl);
    return;
  }

  // pre-OSDENC peers: v4 layout, still without ENCODE_START framing.
  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    __u8 struct_v = 4;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    return;
  }

  if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
    // we simply added last_force_op_resend here, which is a fully
    // backward compatible change.  however, encoding the same map
    // differently between monitors triggers scrub noise (even though
    // they are decodable without the feature), so let's be pedantic
    // about it.
    ENCODE_START(14, 5, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    encode(min_size, bl);
    encode(quota_max_bytes, bl);
    encode(quota_max_objects, bl);
    encode(tiers, bl);
    encode(tier_of, bl);
    __u8 c = cache_mode;
    encode(c, bl);
    encode(read_tier, bl);
    encode(write_tier, bl);
    encode(properties, bl);
    encode(hit_set_params, bl);
    encode(hit_set_period, bl);
    encode(hit_set_count, bl);
    encode(stripe_width, bl);
    encode(target_max_bytes, bl);
    encode(target_max_objects, bl);
    encode(cache_target_dirty_ratio_micro, bl);
    encode(cache_target_full_ratio_micro, bl);
    encode(cache_min_flush_age, bl);
    encode(cache_min_evict_age, bl);
    encode(erasure_code_profile, bl);
    ENCODE_FINISH(bl);
    return;
  }

  // Modern path: start from the newest version and walk DOWN to the
  // newest version the peer's features (or this pool's state) allow.
  uint8_t v = 30;
  // NOTE: any new encoding dependencies must be reflected by
  // SIGNIFICANT_FEATURES
  if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
    // this was the first post-hammer thing we added; if it's missing, encode
    // like hammer.
    v = 21;
  } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    v = 24;
  } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
    v = 26;
  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
    v = 27;
  } else if (!is_stretch_pool()) {
    // The v30 stretch-mode members are only encoded when the pool is
    // actually a stretch pool; otherwise stay at v29 so that older
    // servers encoding the same map produce an identical byte stream
    // (and crc).  This is the deliberate "version goes backwards"
    // behavior the reporter asks about — see commit ddfda9274c45.
    v = 29;
  }

  ENCODE_START(v, 5, bl);
  encode(type, bl);
  encode(size, bl);
  encode(crush_rule, bl);
  encode(object_hash, bl);
  encode(pg_num, bl);
  encode(pgp_num, bl);
  __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
  encode(lpg_num, bl);
  encode(lpgp_num, bl);
  encode(last_change, bl);
  encode(snap_seq, bl);
  encode(snap_epoch, bl);
  encode(snaps, bl, features);
  encode(removed_snaps, bl);
  encode(auid, bl);
  if (v >= 27) {
    encode(flags, bl);
  } else {
    // Hide the snap/creating flag bits from pre-v27 decoders.
    // NOTE(review): decode()'s upgrade path recomputes the two snap
    // flags; whether FLAG_CREATING is safe to drop here should be
    // confirmed against the decoder's callers.
    auto tmp = flags;
    tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
    encode(tmp, bl);
  }
  encode((uint32_t)0, bl); // crash_replay_interval
  encode(min_size, bl);
  encode(quota_max_bytes, bl);
  encode(quota_max_objects, bl);
  encode(tiers, bl);
  encode(tier_of, bl);
  __u8 c = cache_mode;
  encode(c, bl);
  encode(read_tier, bl);
  encode(write_tier, bl);
  encode(properties, bl);
  encode(hit_set_params, bl);
  encode(hit_set_period, bl);
  encode(hit_set_count, bl);
  encode(stripe_width, bl);
  encode(target_max_bytes, bl);
  encode(target_max_objects, bl);
  encode(cache_target_dirty_ratio_micro, bl);
  encode(cache_target_full_ratio_micro, bl);
  encode(cache_min_flush_age, bl);
  encode(cache_min_evict_age, bl);
  encode(erasure_code_profile, bl);
  encode(last_force_op_resend_preluminous, bl);
  encode(min_read_recency_for_promote, bl);
  encode(expected_num_objects, bl);
  // From here on, each field is gated on the version it was introduced
  // in — this append-only pattern is how new fields stay compatible.
  if (v >= 19) {
    encode(cache_target_dirty_high_ratio_micro, bl);
  }
  if (v >= 20) {
    encode(min_write_recency_for_promote, bl);
  }
  if (v >= 21) {
    encode(use_gmt_hitset, bl);
  }
  if (v >= 22) {
    encode(fast_read, bl);
  }
  if (v >= 23) {
    encode(hit_set_grade_decay_rate, bl);
    encode(hit_set_search_last_n, bl);
  }
  if (v >= 24) {
    encode(opts, bl, features);
  }
  if (v >= 25) {
    encode(last_force_op_resend_prenautilus, bl);
  }
  if (v >= 26) {
    encode(application_metadata, bl);
  }
  if (v >= 27) {
    encode(create_time, bl);
  }
  if (v >= 28) {
    encode(pg_num_target, bl);
    encode(pgp_num_target, bl);
    encode(pg_num_pending, bl);
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_started from 14.1.[01]
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_clean from 14.1.[01]
    encode(last_force_op_resend, bl);
    encode(pg_autoscale_mode, bl);
  }
  if (v >= 29) {
    encode(last_pg_merge_meta, bl);
  }
  if (v >= 30) {
    // stretch-mode state; only reached when is_stretch_pool() (see the
    // version selection above).
    encode(peering_crush_bucket_count, bl);
    encode(peering_crush_bucket_target, bl);
    encode(peering_crush_bucket_barrier, bl);
    encode(peering_crush_mandatory_member, bl);
  }
  ENCODE_FINISH(bl);
}

// Deserialize a pg_pool_t from 'bl'.  Accepts any struct_v the encoder
// above can produce (legacy v2 through v30); fields absent from older
// encodings are filled with explicit defaults so the in-memory struct
// is always fully initialized.
void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
  decode(type, bl);
  decode(size, bl);
  decode(crush_rule, bl);
  decode(object_hash, bl);
  decode(pg_num, bl);
  decode(pgp_num, bl);
  {
    // localized pgs were removed; read and discard the legacy counts.
    __u32 lpg_num, lpgp_num;
    decode(lpg_num, bl);
    decode(lpgp_num, bl);
  }
  decode(last_change, bl);
  decode(snap_seq, bl);
  decode(snap_epoch, bl);

  if (struct_v >= 3) {
    decode(snaps, bl);
    decode(removed_snaps, bl);
    decode(auid, bl);
  } else {
    // v2 encoded explicit counts, then auid, then headless containers
    // (mirrors the CEPH_FEATURE_PGPOOL3-less branch of encode()).
    __u32 n, m;
    decode(n, bl);
    decode(m, bl);
    decode(auid, bl);
    decode_nohead(n, snaps, bl);
    decode_nohead(m, removed_snaps, bl);
  }

  if (struct_v >= 4) {
    decode(flags, bl);
    // crash_replay_interval is obsolete; read and discard.
    uint32_t crash_replay_interval;
    decode(crash_replay_interval, bl);
  } else {
    flags = 0;
  }
  // upgrade path for selfmanaged vs pool snaps
  if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
    if (!removed_snaps.empty()) {
      flags |= FLAG_SELFMANAGED_SNAPS;
    } else {
      flags |= FLAG_POOL_SNAPS;
    }
  }
  if (struct_v >= 7) {
    decode(min_size, bl);
  } else {
    min_size = size - size/2;  // default: ceil(size / 2)
  }
  if (struct_v >= 8) {
    decode(quota_max_bytes, bl);
    decode(quota_max_objects, bl);
  }
  if (struct_v >= 9) {
    decode(tiers, bl);
    decode(tier_of, bl);
    // cache_mode is encoded as a raw __u8.
    __u8 v;
    decode(v, bl);
    cache_mode = (cache_mode_t)v;
    decode(read_tier, bl);
    decode(write_tier, bl);
  }
  if (struct_v >= 10) {
    decode(properties, bl);
  }
  if (struct_v >= 11) {
    decode(hit_set_params, bl);
    decode(hit_set_period, bl);
    decode(hit_set_count, bl);
  } else {
    // fall back to the defaults of a freshly constructed pool.
    pg_pool_t def;
    hit_set_period = def.hit_set_period;
    hit_set_count = def.hit_set_count;
  }
  if (struct_v >= 12) {
    decode(stripe_width, bl);
  } else {
    set_stripe_width(0);
  }
  if (struct_v >= 13) {
    decode(target_max_bytes, bl);
    decode(target_max_objects, bl);
    decode(cache_target_dirty_ratio_micro, bl);
    decode(cache_target_full_ratio_micro, bl);
    decode(cache_min_flush_age, bl);
    decode(cache_min_evict_age, bl);
  } else {
    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    cache_min_flush_age = 0;
    cache_min_evict_age = 0;
  }
  if (struct_v >= 14) {
    decode(erasure_code_profile, bl);
  }
  if (struct_v >= 15) {
    decode(last_force_op_resend_preluminous, bl);
  } else {
    last_force_op_resend_preluminous = 0;
  }
  if (struct_v >= 16) {
    decode(min_read_recency_for_promote, bl);
  } else {
    min_read_recency_for_promote = 1;
  }
  if (struct_v >= 17) {
    decode(expected_num_objects, bl);
  } else {
    expected_num_objects = 0;
  }
  if (struct_v >= 19) {
    decode(cache_target_dirty_high_ratio_micro, bl);
  } else {
    cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
  }
  if (struct_v >= 20) {
    decode(min_write_recency_for_promote, bl);
  } else {
    min_write_recency_for_promote = 1;
  }
  if (struct_v >= 21) {
    decode(use_gmt_hitset, bl);
  } else {
    use_gmt_hitset = false;
  }
  if (struct_v >= 22) {
    decode(fast_read, bl);
  } else {
    fast_read = false;
  }
  if (struct_v >= 23) {
    decode(hit_set_grade_decay_rate, bl);
    decode(hit_set_search_last_n, bl);
  } else {
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 1;
  }
  if (struct_v >= 24) {
    decode(opts, bl);
  }
  if (struct_v >= 25) {
    decode(last_force_op_resend_prenautilus, bl);
  } else {
    last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
  }
  if (struct_v >= 26) {
    decode(application_metadata, bl);
  }
  if (struct_v >= 27) {
    decode(create_time, bl);
  }
  if (struct_v >= 28) {
    decode(pg_num_target, bl);
    decode(pgp_num_target, bl);
    decode(pg_num_pending, bl);
    // v28 (14.1.[01]) encoded the merge epochs inline; v29 moved them
    // into last_pg_merge_meta, so read them here either way.
    epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
    decode(old_merge_last_epoch_started, bl);
    decode(old_merge_last_epoch_clean, bl);
    decode(last_force_op_resend, bl);
    decode(pg_autoscale_mode, bl);
    if (struct_v >= 29) {
      decode(last_pg_merge_meta, bl);
    } else {
      last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
      last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
    }
  } else {
    // pre-v28: no pg-count targets yet; treat the pool as not resizing.
    pg_num_target = pg_num;
    pgp_num_target = pgp_num;
    pg_num_pending = pg_num;
    last_force_op_resend = last_force_op_resend_prenautilus;
    pg_autoscale_mode = pg_autoscale_mode_t::WARN;    // default to warn on upgrade
  }
  if (struct_v >= 30) {
    // stretch-mode state; v30 is only emitted for stretch pools (see
    // encode()), non-stretch pools arrive as v29.
    decode(peering_crush_bucket_count, bl);
    decode(peering_crush_bucket_target, bl);
    decode(peering_crush_bucket_barrier, bl);
    decode(peering_crush_mandatory_member, bl);
  }
  DECODE_FINISH(bl);
  // rebuild derived state from the freshly decoded fields.
  calc_pg_masks();
  calc_grade_table();
}

commit ddfda9274c45e9dce1b2b3a58cac91fba488419b
Author: Greg Farnum <gfarnum@redhat.com>
Date:   Tue Jul 21 17:04:32 2020 +0000

    osd: conditionally encode stretch state in pg_pool_t, do not require new compat

    This struct gets sent out to clients as well as OSDs, and we need them
    to be able to decode it without crashing/failing. So we can't require it,
    and happily the OSDs which might act on it will be gated by the OSDMap.

    Additionally, we don't want older servers to fail OSDMap crc checks, so don't
    encode the stretch data members if they aren't in use.

    Signed-off-by: Greg Farnum <gfarnum@redhat.com>

diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 198fe89d78..a6cd635e9a 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1942,9 +1942,10 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
     v = 26;
   } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
     v = 27;
+  } else if (!is_stretch_pool()) {
+    v = 29;
   }

-  uint8_t new_compat = 0;
   ENCODE_START(v, 5, bl);
   encode(type, bl);
   encode(size, bl);
@@ -2033,20 +2034,13 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
   if (v >= 29) {
     encode(last_pg_merge_meta, bl);
   }
-  if (peering_crush_bucket_barrier != 0 ||
-      peering_crush_bucket_target != 0 ||
-      peering_crush_bucket_count !=0 ||
-      peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
-    ceph_assert(v >= 30);
-    new_compat = 30;
-  }
   if (v >= 30) {
     encode(peering_crush_bucket_count, bl);
     encode(peering_crush_bucket_target, bl);
     encode(peering_crush_bucket_barrier, bl);
     encode(peering_crush_mandatory_member, bl);
   }
-  ENCODE_FINISH_NEW_COMPAT(bl, new_compat);
+  ENCODE_FINISH(bl);
 }

 void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)

Actions

Also available in: Atom PDF