Project

General

Profile

Bug #59291

pg_pool_t version compatibility issue

Added by jianwei zhang 11 months ago. Updated 6 months ago.

Status:
New
Priority:
High
Category:
-
Target version:
-
% Done:

0%

Source:
Tags:
Backport:
Regression:
No
Severity:
1 - critical
Reviewed:
04/03/2023
Affected Versions:
ceph-qa-suite:
Component(RADOS):
Pull request ID:
Crash signature (v1):
Crash signature (v2):

Description

How is pg_pool_t version forward compatible? For example, if I want to add a new field, how should I modify it?

!is_stretch_pool() condition is met, it will be reduced from v=30 to v=29

Why does the version go backwards? @Greg Farnum <>

For example, if I want to add a new field XXX, how should I design it for version compatibility?

hope someone can help me, thank you!

/*
 * pg_pool
 */
struct pg_pool_t {
  ...
  utime_t create_time;
  uint64_t flags = 0;           ///< FLAG_*
  __u8 type = 0;                ///< TYPE_*
  __u8 size = 0, min_size = 0;  ///< number of osds in each pg
  __u8 crush_rule = 0;          ///< crush placement rule
  __u8 object_hash = 0;         ///< hash mapping object name to ps
  pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;
  ...
};

void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
{
  using ceph::encode;
  if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
    // this encoding matches the old struct ceph_pg_pool
    __u8 struct_v = 2;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);

    __u32 n = snaps.size();
    encode(n, bl);
    n = removed_snaps.num_intervals();
    encode(n, bl);

    encode(auid, bl);

    encode_nohead(snaps, bl, features);
    encode_nohead(removed_snaps, bl);
    return;
  }

  if ((features & CEPH_FEATURE_OSDENC) == 0) {
    __u8 struct_v = 4;
    encode(struct_v, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    return;
  }

  if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
    // we simply added last_force_op_resend here, which is a fully
    // backward compatible change.  however, encoding the same map
    // differently between monitors triggers scrub noise (even though
    // they are decodable without the feature), so let's be pendantic
    // about it.
    ENCODE_START(14, 5, bl);
    encode(type, bl);
    encode(size, bl);
    encode(crush_rule, bl);
    encode(object_hash, bl);
    encode(pg_num, bl);
    encode(pgp_num, bl);
    __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
    encode(lpg_num, bl);
    encode(lpgp_num, bl);
    encode(last_change, bl);
    encode(snap_seq, bl);
    encode(snap_epoch, bl);
    encode(snaps, bl, features);
    encode(removed_snaps, bl);
    encode(auid, bl);
    encode(flags, bl);
    encode((uint32_t)0, bl); // crash_replay_interval
    encode(min_size, bl);
    encode(quota_max_bytes, bl);
    encode(quota_max_objects, bl);
    encode(tiers, bl);
    encode(tier_of, bl);
    __u8 c = cache_mode;
    encode(c, bl);
    encode(read_tier, bl);
    encode(write_tier, bl);
    encode(properties, bl);
    encode(hit_set_params, bl);
    encode(hit_set_period, bl);
    encode(hit_set_count, bl);
    encode(stripe_width, bl);
    encode(target_max_bytes, bl);
    encode(target_max_objects, bl);
    encode(cache_target_dirty_ratio_micro, bl);
    encode(cache_target_full_ratio_micro, bl);
    encode(cache_min_flush_age, bl);
    encode(cache_min_evict_age, bl);
    encode(erasure_code_profile, bl);
    ENCODE_FINISH(bl);
    return;
  }

  uint8_t v = 30;
  // NOTE: any new encoding dependencies must be reflected by
  // SIGNIFICANT_FEATURES
  if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
    // this was the first post-hammer thing we added; if it's missing, encode
    // like hammer.
    v = 21;
  } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    v = 24;
  } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
    v = 26;
  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
    v = 27;
  } else if (!is_stretch_pool()) {
    v = 29;
  }

  ENCODE_START(v, 5, bl);
  encode(type, bl);
  encode(size, bl);
  encode(crush_rule, bl);
  encode(object_hash, bl);
  encode(pg_num, bl);
  encode(pgp_num, bl);
  __u32 lpg_num = 0, lpgp_num = 0;  // tell old code that there are no localized pgs.
  encode(lpg_num, bl);
  encode(lpgp_num, bl);
  encode(last_change, bl);
  encode(snap_seq, bl);
  encode(snap_epoch, bl);
  encode(snaps, bl, features);
  encode(removed_snaps, bl);
  encode(auid, bl);
  if (v >= 27) {
    encode(flags, bl);
  } else {
    auto tmp = flags;
    tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
    encode(tmp, bl);
  }
  encode((uint32_t)0, bl); // crash_replay_interval
  encode(min_size, bl);
  encode(quota_max_bytes, bl);
  encode(quota_max_objects, bl);
  encode(tiers, bl);
  encode(tier_of, bl);
  __u8 c = cache_mode;
  encode(c, bl);
  encode(read_tier, bl);
  encode(write_tier, bl);
  encode(properties, bl);
  encode(hit_set_params, bl);
  encode(hit_set_period, bl);
  encode(hit_set_count, bl);
  encode(stripe_width, bl);
  encode(target_max_bytes, bl);
  encode(target_max_objects, bl);
  encode(cache_target_dirty_ratio_micro, bl);
  encode(cache_target_full_ratio_micro, bl);
  encode(cache_min_flush_age, bl);
  encode(cache_min_evict_age, bl);
  encode(erasure_code_profile, bl);
  encode(last_force_op_resend_preluminous, bl);
  encode(min_read_recency_for_promote, bl);
  encode(expected_num_objects, bl);
  if (v >= 19) {
    encode(cache_target_dirty_high_ratio_micro, bl);
  }
  if (v >= 20) {
    encode(min_write_recency_for_promote, bl);
  }
  if (v >= 21) {
    encode(use_gmt_hitset, bl);
  }
  if (v >= 22) {
    encode(fast_read, bl);
  }
  if (v >= 23) {
    encode(hit_set_grade_decay_rate, bl);
    encode(hit_set_search_last_n, bl);
  }
  if (v >= 24) {
    encode(opts, bl, features);
  }
  if (v >= 25) {
    encode(last_force_op_resend_prenautilus, bl);
  }
  if (v >= 26) {
    encode(application_metadata, bl);
  }
  if (v >= 27) {
    encode(create_time, bl);
  }
  if (v >= 28) {
    encode(pg_num_target, bl);
    encode(pgp_num_target, bl);
    encode(pg_num_pending, bl);
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_started from 14.1.[01]
    encode((epoch_t)0, bl);  // pg_num_dec_last_epoch_clean from 14.1.[01]
    encode(last_force_op_resend, bl);
    encode(pg_autoscale_mode, bl);
  }
  if (v >= 29) {
    encode(last_pg_merge_meta, bl);
  }
  if (v >= 30) {
    encode(peering_crush_bucket_count, bl);
    encode(peering_crush_bucket_target, bl);
    encode(peering_crush_bucket_barrier, bl);
    encode(peering_crush_mandatory_member, bl);
  }
  ENCODE_FINISH(bl);
}

void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
  decode(type, bl);
  decode(size, bl);
  decode(crush_rule, bl);
  decode(object_hash, bl);
  decode(pg_num, bl);
  decode(pgp_num, bl);
  {
    __u32 lpg_num, lpgp_num;
    decode(lpg_num, bl);
    decode(lpgp_num, bl);
  }
  decode(last_change, bl);
  decode(snap_seq, bl);
  decode(snap_epoch, bl);

  if (struct_v >= 3) {
    decode(snaps, bl);
    decode(removed_snaps, bl);
    decode(auid, bl);
  } else {
    __u32 n, m;
    decode(n, bl);
    decode(m, bl);
    decode(auid, bl);
    decode_nohead(n, snaps, bl);
    decode_nohead(m, removed_snaps, bl);
  }

  if (struct_v >= 4) {
    decode(flags, bl);
    uint32_t crash_replay_interval;
    decode(crash_replay_interval, bl);
  } else {
    flags = 0;
  }
  // upgrade path for selfmanaged vs pool snaps
  if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
    if (!removed_snaps.empty()) {
      flags |= FLAG_SELFMANAGED_SNAPS;
    } else {
      flags |= FLAG_POOL_SNAPS;
    }
  }
  if (struct_v >= 7) {
    decode(min_size, bl);
  } else {
    min_size = size - size/2;
  }
  if (struct_v >= 8) {
    decode(quota_max_bytes, bl);
    decode(quota_max_objects, bl);
  }
  if (struct_v >= 9) {
    decode(tiers, bl);
    decode(tier_of, bl);
    __u8 v;
    decode(v, bl);
    cache_mode = (cache_mode_t)v;
    decode(read_tier, bl);
    decode(write_tier, bl);
  }
  if (struct_v >= 10) {
    decode(properties, bl);
  }
  if (struct_v >= 11) {
    decode(hit_set_params, bl);
    decode(hit_set_period, bl);
    decode(hit_set_count, bl);
  } else {
    pg_pool_t def;
    hit_set_period = def.hit_set_period;
    hit_set_count = def.hit_set_count;
  }
  if (struct_v >= 12) {
    decode(stripe_width, bl);
  } else {
    set_stripe_width(0);
  }
  if (struct_v >= 13) {
    decode(target_max_bytes, bl);
    decode(target_max_objects, bl);
    decode(cache_target_dirty_ratio_micro, bl);
    decode(cache_target_full_ratio_micro, bl);
    decode(cache_min_flush_age, bl);
    decode(cache_min_evict_age, bl);
  } else {
    target_max_bytes = 0;
    target_max_objects = 0;
    cache_target_dirty_ratio_micro = 0;
    cache_target_full_ratio_micro = 0;
    cache_min_flush_age = 0;
    cache_min_evict_age = 0;
  }
  if (struct_v >= 14) {
    decode(erasure_code_profile, bl);
  }
  if (struct_v >= 15) {
    decode(last_force_op_resend_preluminous, bl);
  } else {
    last_force_op_resend_preluminous = 0;
  }
  if (struct_v >= 16) {
    decode(min_read_recency_for_promote, bl);
  } else {
    min_read_recency_for_promote = 1;
  }
  if (struct_v >= 17) {
    decode(expected_num_objects, bl);
  } else {
    expected_num_objects = 0;
  }
  if (struct_v >= 19) {
    decode(cache_target_dirty_high_ratio_micro, bl);
  } else {
    cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
  }
  if (struct_v >= 20) {
    decode(min_write_recency_for_promote, bl);
  } else {
    min_write_recency_for_promote = 1;
  }
  if (struct_v >= 21) {
    decode(use_gmt_hitset, bl);
  } else {
    use_gmt_hitset = false;
  }
  if (struct_v >= 22) {
    decode(fast_read, bl);
  } else {
    fast_read = false;
  }
  if (struct_v >= 23) {
    decode(hit_set_grade_decay_rate, bl);
    decode(hit_set_search_last_n, bl);
  } else {
    hit_set_grade_decay_rate = 0;
    hit_set_search_last_n = 1;
  }
  if (struct_v >= 24) {
    decode(opts, bl);
  }
  if (struct_v >= 25) {
    decode(last_force_op_resend_prenautilus, bl);
  } else {
    last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
  }
  if (struct_v >= 26) {
    decode(application_metadata, bl);
  }
  if (struct_v >= 27) {
    decode(create_time, bl);
  }
  if (struct_v >= 28) {
    decode(pg_num_target, bl);
    decode(pgp_num_target, bl);
    decode(pg_num_pending, bl);
    epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
    decode(old_merge_last_epoch_started, bl);
    decode(old_merge_last_epoch_clean, bl);
    decode(last_force_op_resend, bl);
    decode(pg_autoscale_mode, bl);
    if (struct_v >= 29) {
      decode(last_pg_merge_meta, bl);
    } else {
      last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
      last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
    }
  } else {
    pg_num_target = pg_num;
    pgp_num_target = pgp_num;
    pg_num_pending = pg_num;
    last_force_op_resend = last_force_op_resend_prenautilus;
    pg_autoscale_mode = pg_autoscale_mode_t::WARN;    // default to warn on upgrade
  }
  if (struct_v >= 30) {
    decode(peering_crush_bucket_count, bl);
    decode(peering_crush_bucket_target, bl);
    decode(peering_crush_bucket_barrier, bl);
    decode(peering_crush_mandatory_member, bl);
  }
  DECODE_FINISH(bl);
  calc_pg_masks();
  calc_grade_table();
}

commit ddfda9274c45e9dce1b2b3a58cac91fba488419b
Author: Greg Farnum <gfarnum@redhat.com>
Date:   Tue Jul 21 17:04:32 2020 +0000
Date:   Tue Jul 21 17:04:32 2020 +0000

    osd: conditionally encode stretch state in pg_pool_t, do not require new compat

    This struct gets sent out to clients as well as OSDs, and we need them
    to be able to decode it without crashing/failing. So we can't require it,
    and happily the OSDs which might act on it will be gated by the OSDMap.

    Additionally, we don't want older servers to fail OSDMap crc checks, so don't
    encode the stretch data members if they aren't in use.

    Signed-off-by: Greg Farnum <gfarnum@redhat.com>

diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 198fe89d78..a6cd635e9a 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1942,9 +1942,10 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
     v = 26;
   } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
     v = 27;
+  } else if (!is_stretch_pool()) {
+    v = 29;
   }

-  uint8_t new_compat = 0;
   ENCODE_START(v, 5, bl);
   encode(type, bl);
   encode(size, bl);
@@ -2033,20 +2034,13 @@ void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
   if (v >= 29) {
     encode(last_pg_merge_meta, bl);
   }
-  if (peering_crush_bucket_barrier != 0 ||
-      peering_crush_bucket_target != 0 ||
-      peering_crush_bucket_count !=0 ||
-      peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
-    ceph_assert(v >= 30);
-    new_compat = 30;
-  }
   if (v >= 30) {
     encode(peering_crush_bucket_count, bl);
     encode(peering_crush_bucket_target, bl);
     encode(peering_crush_bucket_barrier, bl);
     encode(peering_crush_mandatory_member, bl);
   }
-  ENCODE_FINISH_NEW_COMPAT(bl, new_compat);
+  ENCODE_FINISH(bl);
 }

 void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)

History

#1 Updated by Igor Fedotov 11 months ago

  • Project changed from bluestore to RADOS

#2 Updated by jianwei zhang 11 months ago

Thank you for helping me modify the status

#3 Updated by jianwei zhang 11 months ago

ceph version 16.2.16

#4 Updated by jianwei zhang 11 months ago

Ceph Version 17.2.5 also has the same problem

#5 Updated by jianwei zhang 11 months ago

mail :

As the commit message says, we need to encode a version clients can understand, and older OSDs (in case of in-progress upgrades). These data members are only used when the cluster is in stretch mode; if it isn’t we can encode the data structure as if on an older version of the code/struct because the values are unused.

I believe this particular patch was prompted because clients (which didn’t support the new features yet) connected to the cluster and then crashed on the “ ceph_assert(v >= 30);” (which I’d included to catch bugs between the OSDs) bit, which was obviously not a desired behavior.
-Greg

thanks for your reply!

I now want to add a new member to pg_pool_t,
I put v=31, but it still uses v=29, because the !is_stretch_pool()condition is true,
This directly caused my newly added members to be unable to be encoded and decoded,
I don't know how to solve this problem, do you have a way?
- jianwei zhang
I haven’t though it through. You may just be able to remove that conditional in the encode function at this point?

The RADOS team has a lot of experience handling encoding versions so if you pick an approach to fix it and point it out in a pr they can check it.
-Greg
I don't currently have a good way to handle this either,I am somewhat pessimistic about the solution to this problem,I feel like this breaks the rules of version compatibility.very tricky.
- jianwei zhang

#6 Updated by Radoslaw Zarzynski 11 months ago

  • Priority changed from Normal to High

Oops, it looks like is_stretch_pool() really doesn't depend on features.

  uint8_t v = 30;
  // NOTE: any new encoding dependencies must be reflected by
  // SIGNIFICANT_FEATURES
  if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
    // this was the first post-hammer thing we added; if it's missing, encode
    // like hammer.
    v = 21;
  } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    v = 24;
  } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
    v = 26;
  } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
    v = 27;
  } else if (!is_stretch_pool()) {
    v = 29;
  }

#7 Updated by Neha Ojha 10 months ago

  • Assignee set to Kamoltat (Junior) Sirivadhna

#8 Updated by Kamoltat (Junior) Sirivadhna 10 months ago

So Neha and I have discussed this and we were looking into a solution where anything 31 and above would have to encode all the `stretch_cluster` variables (peering_crush_bucket_count, peering_crush_bucket_target, ...). So if someone wants to bump the version to >= 31 and add more stuff to pg_pool_t, then going forward ceph will also encode stretch_cluster variables as well.

#9 Updated by Radoslaw Zarzynski 9 months ago

Hi Junior! What's the current status?

#10 Updated by Radoslaw Zarzynski 9 months ago

I have an impression since 30 we've changed the responsibility of the version bits: now they're indicating whether the byte stream contains the stretch-mode related fields or not; this should have been built on top of a struct wrapped in std::optional, I think.

As we must live with all decoders that are already in the field (including the revision where 30-and-above means stretch mode), the problem become painful. Bumping version without introducing a feature bit would enforce encoders to encode the stretch-related fields just for the sake of being understandable by the problematic decoder's revision. Perhaps we could find values for these fields which – after decoding – would give the same meaning like the default-constructed ones at version 29. If so, the issue can be turned into footprint / potential performance thing.

Since Squid we could use SERVER_SQUID to let encoder distinguish whether decoder needs the NOP-filling-junks or not.

#11 Updated by Radoslaw Zarzynski 9 months ago

How big are the stretch-related fields after encoding?

#12 Updated by Nitzan Mordechai 9 months ago

Junior, i know its a bit ugly fix, but how about using the compact version as well?
we can bump compact to 6 (currently 5) and check for version 29.6 and 31.6 that way we can easily find the new versions

#13 Updated by Radoslaw Zarzynski 9 months ago

we can bump compact to 6 (currently 5) and check for version 29.6 and 31.6 that way we can easily find the new versions

Apart from assigning a quirky responsibility to the compat param, there is a (theoretical) problem: we would lose support for an (archaic) decoder revision:

#define DECODE_START(v, bl)                                             \
  __u8 struct_v, struct_compat;                                         \
  using ::ceph::decode;                                                 \
  decode(struct_v, bl);                                         \
  decode(struct_compat, bl);                                            \
  if (v < struct_compat)                                                \
    throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
  __u32 struct_len;                        \

Please imagine the decoder revision that does something like:

void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START(5, bl);
  decode(...);
  decode(...);

#14 Updated by Kamoltat (Junior) Sirivadhna 9 months ago

As discussed in the call with Radek and Nitzan,

Nitzan -> work on the actual fix
Junior -> Unit test -> Integration test

In summary, Radek proposed rewriting the ENCODE_START part where it is compatible with older version of DECODE_START.

#15 Updated by Radoslaw Zarzynski 9 months ago

The proposal we discussed:

pg_pool_t::encode(bl, features)
{
  // ...
  if (is_new_decoder(features) /* by e.g. SERVER_SQUID */) {
    ENCODE_START(v /* 31? */, bl);
    // everything except the stretch-related bits
    std::optional<stretch_related_fields_t> maybe_stretch_things = get_from(this);
    encode(maybe_stretch_things, bl);
  } else {
    // as like now -- must be compatible
  }
}

#16 Updated by jianwei zhang 7 months ago

Radoslaw Zarzynski wrote:

The proposal we discussed:

[...]

look good to me, thanks share

#17 Updated by Honggang Yang 6 months ago

Radoslaw Zarzynski wrote:

The proposal we discussed:

[...]

Hello rzarzyns, any update? please paste the related PR here if ready

#18 Updated by Radoslaw Zarzynski 6 months ago

Well, this case grown so much that, I'm afraid, there is single PR. Pasting my notes:

Dencoder unitestasbility improvements
=====================================
* https://github.com/ceph/ceph-object-corpus/pull/16
* `generate_test_instances()` and `dump()`:
  - RBD: https://github.com/ceph/ceph/pull/52481
  - osd ~~and mds~~:
    + ~~https://github.com/ceph/ceph/pull/52482/~~
    + https://github.com/ceph/ceph/pull/52871
  - common: https://github.com/ceph/ceph/pull/52210/
  - RGW: https://github.com/ceph/ceph/pull/52198/
  - CephFS: https://github.com/ceph/ceph/pull/52623
  - https://github.com/ceph/ceph/pull/52871

Also available in: Atom PDF