diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc index b52a55a..5ddb5e9 100644 --- a/src/crush/CrushCompiler.cc +++ b/src/crush/CrushCompiler.cc @@ -78,6 +78,7 @@ int CrushCompiler::decompile_bucket_impl(int i, ostream &out) bool dopos = false; switch (alg) { case CRUSH_BUCKET_UNIFORM: + case CRUSH_BUCKET_LINEAR: out << "\t# do not change bucket size (" << n << ") unnecessarily"; dopos = true; break; @@ -435,6 +436,8 @@ int CrushCompiler::parse_bucket(iter_t const& i) alg = CRUSH_BUCKET_TREE; else if (a == "straw") alg = CRUSH_BUCKET_STRAW; + else if (a == "linear") + alg = CRUSH_BUCKET_LINEAR; else { err << "unknown bucket alg '" << a << "'" << std::endl << std::endl; return -EINVAL; @@ -512,7 +515,7 @@ int CrushCompiler::parse_bucket(iter_t const& i) assert(0); } - if (alg == CRUSH_BUCKET_UNIFORM) { + if (alg == CRUSH_BUCKET_UNIFORM || alg == CRUSH_BUCKET_LINEAR) { if (!have_uniform_weight) { have_uniform_weight = true; uniform_weight = weight; diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 9618d98..1366193 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -964,6 +964,10 @@ void CrushWrapper::encode(bufferlist& bl, bool lean) const } break; + case CRUSH_BUCKET_LINEAR: + ::encode(((crush_bucket_linear*)crush->buckets[i])->item_weight, bl); + break; + default: assert(0); break; @@ -1108,6 +1112,9 @@ void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator case CRUSH_BUCKET_STRAW: size = sizeof(crush_bucket_straw); break; + case CRUSH_BUCKET_LINEAR: + size = sizeof(crush_bucket_linear); + break; default: { char str[128]; @@ -1171,6 +1178,10 @@ void CrushWrapper::decode_crush_bucket(crush_bucket** bptr, bufferlist::iterator break; } + case CRUSH_BUCKET_LINEAR: + ::decode(((crush_bucket_linear*)bucket)->item_weight, blp); + break; + default: // We should have handled this case in the first switch statement assert(0); diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 56e3d16..f20fa1a 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -925,6 +925,20 @@ public: out[i] = rawout[i]; } + void do_rule(int rule, int x, vector& out, int maxout, + const vector<__u32>& weight, float balance_param) const { + Mutex::Locker l(mapper_lock); + int rawout[maxout]; + int scratch[maxout * 3]; + int numrep = crush_do_rule_wrapper(crush, rule, x, rawout, maxout, + &weight[0], weight.size(), scratch, balance_param); + if (numrep < 0) + numrep = 0; + out.resize(numrep); + for (int i=0; ih.alg = CRUSH_BUCKET_LINEAR; + bucket->h.hash = hash; + bucket->h.type = type; + bucket->h.size = size; + + bucket->h.weight = size * item_weight; + bucket->item_weight = item_weight; + bucket->h.items = malloc(sizeof(__s32) * size); + + if (!bucket->h.items) + goto err; + + bucket->h.perm = malloc(sizeof(__u32) * size); + + if (!bucket->h.perm) + goto err; + for (i=0; ih.items[i] = items[i]; + + return bucket; +err: + free(bucket->h.perm); + free(bucket->h.items); + free(bucket); + return NULL; +} + struct crush_bucket* @@ -542,6 +589,13 @@ crush_make_bucket(int alg, int hash, int type, int size, case CRUSH_BUCKET_STRAW: return (struct crush_bucket *)crush_make_straw_bucket(hash, type, size, items, weights); + + case CRUSH_BUCKET_LINEAR: + if (size && weights) + item_weight = weights[0]; + else + item_weight = 0; + return (struct crush_bucket *)crush_make_linear_bucket(hash, type, size, items, item_weight); } return 0; } @@ -709,6 +763,33 @@ int crush_add_straw_bucket_item(struct crush_bucket_straw *bucket, int item, int return crush_calc_straw(bucket); } +int crush_add_linear_bucket_item(struct crush_bucket_linear *bucket, int item, int weight) +{ + int newsize = bucket->h.size + 1; + void *_realloc = NULL; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.perm = _realloc; + } + + bucket->h.items[newsize-1] = item; + + if (crush_addition_is_unsafe(bucket->h.weight, weight)) + return -ERANGE; + + bucket->h.weight += weight; + bucket->h.size++; + + return 0; +} + int crush_bucket_add_item(struct crush_bucket *b, int item, int weight) { /* invalidate perm cache */ @@ -723,6 +804,8 @@ int crush_bucket_add_item(struct crush_bucket *b, int item, int weight) return crush_add_tree_bucket_item((struct crush_bucket_tree *)b, item, weight); case CRUSH_BUCKET_STRAW: return crush_add_straw_bucket_item((struct crush_bucket_straw *)b, item, weight); + case CRUSH_BUCKET_LINEAR: + return crush_add_linear_bucket_item((struct crush_bucket_linear *)b, item, weight); default: return -1; } @@ -921,6 +1004,36 @@ int crush_remove_straw_bucket_item(struct crush_bucket_straw *bucket, int item) return crush_calc_straw(bucket); } +int crush_remove_linear_bucket_item(struct crush_bucket_linear *bucket, int item) +{ + unsigned i, j; + int newsize; + void *_realloc = NULL; + + for (i = 0; i < bucket->h.size; i++) + if (bucket->h.items[i] == item) + break; + if (i == bucket->h.size) + return -ENOENT; + + for (j = i; j < bucket->h.size; j++) + bucket->h.items[j] = bucket->h.items[j+1]; + newsize = --bucket->h.size; + bucket->h.weight -= bucket->item_weight; + + if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.items = _realloc; + } + if ((_realloc = realloc(bucket->h.perm, sizeof(__u32)*newsize)) == NULL) { + return -ENOMEM; + } else { + bucket->h.perm = _realloc; + } + return 0; +} + int crush_bucket_remove_item(struct crush_bucket *b, int item) { /* invalidate perm cache */ @@ -935,6 +1048,8 @@ int crush_bucket_remove_item(struct crush_bucket *b, int item) return crush_remove_tree_bucket_item((struct crush_bucket_tree *)b, item); case CRUSH_BUCKET_STRAW: return crush_remove_straw_bucket_item((struct crush_bucket_straw *)b, item); + case CRUSH_BUCKET_LINEAR: + return crush_remove_linear_bucket_item((struct crush_bucket_linear *)b, item); default: return -1; } @@ -1025,6 +1140,16 @@ int crush_adjust_straw_bucket_item_weight(struct crush_bucket_straw *bucket, int return diff; } +int crush_adjust_linear_bucket_item_weight(struct crush_bucket_linear *bucket, int item, int weight) +{ + int diff = (weight - bucket->item_weight) * bucket->h.size; + + bucket->item_weight = weight; + bucket->h.weight = bucket->item_weight * bucket->h.size; + + return diff; +} + int crush_bucket_adjust_item_weight(struct crush_bucket *b, int item, int weight) { switch (b->alg) { @@ -1040,6 +1165,9 @@ int crush_bucket_adjust_item_weight(struct crush_bucket *b, int item, int weight case CRUSH_BUCKET_STRAW: return crush_adjust_straw_bucket_item_weight((struct crush_bucket_straw *)b, item, weight); + case CRUSH_BUCKET_LINEAR: + return crush_adjust_linear_bucket_item_weight((struct crush_bucket_linear *)b, + item, weight); default: return -1; } @@ -1144,6 +1272,34 @@ static int crush_reweight_straw_bucket(struct crush_map *crush, struct crush_buc return 0; } +static int crush_reweight_linear_bucket(struct crush_map *crush, struct crush_bucket_linear *bucket) +{ + unsigned i; + unsigned sum = 0, n = 0, leaves = 0; + + for (i = 0; i < bucket->h.size; i++) { + int id = bucket->h.items[i]; + if (id < 0) { + struct crush_bucket *c = crush->buckets[-1-id]; + crush_reweight_bucket(crush, c); + + if (crush_addition_is_unsafe(sum, c->weight)) + return -ERANGE; + + sum += c->weight; + n++; + } else { + leaves++; + } + } + + if (n > leaves) + bucket->item_weight = sum / n; // more bucket children than leaves, average! + bucket->h.weight = bucket->item_weight * bucket->h.size; + + return 0; +} + int crush_reweight_bucket(struct crush_map *crush, struct crush_bucket *b) { switch (b->alg) { @@ -1155,6 +1311,8 @@ int crush_reweight_bucket(struct crush_map *crush, struct crush_bucket *b) return crush_reweight_tree_bucket(crush, (struct crush_bucket_tree *)b); case CRUSH_BUCKET_STRAW: return crush_reweight_straw_bucket(crush, (struct crush_bucket_straw *)b); + case CRUSH_BUCKET_LINEAR: + return crush_reweight_linear_bucket(crush, (struct crush_bucket_linear *)b); default: return -1; } diff --git a/src/crush/builder.h b/src/crush/builder.h index 1003c35..13c7660 100644 --- a/src/crush/builder.h +++ b/src/crush/builder.h @@ -39,5 +39,9 @@ struct crush_bucket_straw * crush_make_straw_bucket(int hash, int type, int size, int *items, int *weights); +struct crush_bucket_linear * +crush_make_linear_bucket(int hash, int type, int size, + int *items, + int item_weight); #endif diff --git a/src/crush/crush.c b/src/crush/crush.c index 519793a..8e7841a 100644 --- a/src/crush/crush.c +++ b/src/crush/crush.c @@ -18,6 +18,7 @@ const char *crush_bucket_alg_name(int alg) case CRUSH_BUCKET_LIST: return "list"; case CRUSH_BUCKET_TREE: return "tree"; case CRUSH_BUCKET_STRAW: return "straw"; + case CRUSH_BUCKET_LINEAR: return "linear"; default: return "unknown"; } } @@ -41,6 +42,8 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; case CRUSH_BUCKET_STRAW: return ((struct crush_bucket_straw *)b)->item_weights[p]; + case CRUSH_BUCKET_LINEAR: + return ((struct crush_bucket_linear *)b)->item_weight; } return 0; } @@ -78,6 +81,13 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b); } +void crush_destroy_bucket_linear(struct crush_bucket_linear *b) +{ + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + void crush_destroy_bucket(struct crush_bucket *b) { switch (b->alg) { @@ -93,6 +103,9 @@ void crush_destroy_bucket(struct crush_bucket *b) case CRUSH_BUCKET_STRAW: crush_destroy_bucket_straw((struct crush_bucket_straw *)b); break; + case CRUSH_BUCKET_LINEAR: + crush_destroy_bucket_linear((struct crush_bucket_linear *)b); + break; } } diff --git a/src/crush/crush.h b/src/crush/crush.h index 322d16c..e881fb2 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -112,7 +112,8 @@ enum { CRUSH_BUCKET_UNIFORM = 1, CRUSH_BUCKET_LIST = 2, CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 + CRUSH_BUCKET_STRAW = 4, + CRUSH_BUCKET_LINEAR = 5 }; extern const char *crush_bucket_alg_name(int alg); @@ -159,7 +160,10 @@ struct crush_bucket_straw { __u32 *straws; /* 16-bit fixed point */ }; - +struct crush_bucket_linear { + struct crush_bucket h; + __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ +}; /* * CRUSH map includes all buckets, rules, etc. @@ -203,6 +207,7 @@ extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); extern void crush_destroy_bucket_list(struct crush_bucket_list *b); extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); +extern void crush_destroy_bucket_linear(struct crush_bucket_linear *b); extern void crush_destroy_bucket(struct crush_bucket *b); extern void crush_destroy_rule(struct crush_rule *r); extern void crush_destroy(struct crush_map *map); diff --git a/src/crush/grammar.h b/src/crush/grammar.h index 42b0b8e..3dd75b0 100644 --- a/src/crush/grammar.h +++ b/src/crush/grammar.h @@ -116,7 +116,8 @@ struct crush_grammar : public grammar bucket_alg = str_p("alg") >> ( str_p("uniform") | str_p("list") | str_p("tree") | - str_p("straw") ); + str_p("straw")| + str_p("linear")); bucket_hash = str_p("hash") >> ( integer | str_p("rjenkins1") ); bucket_item = str_p("item") >> name diff --git a/src/crush/mapper.c b/src/crush/mapper.c index e610f31..6ce477e 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -239,6 +239,13 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, return bucket->h.items[high]; } +static int bucket_linear_choose(struct crush_bucket_linear *bucket, + int x, int r) +{ + unsigned int item = x%bucket->h.size; + return bucket->h.items[item]; +} + static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); @@ -262,6 +269,32 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) } } +static int crush_bucket_choose_wrapper(struct crush_bucket *in, int x, int r, float balance_param) +{ + dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); + BUG_ON(in->size == 0); + switch (in->alg) { + case CRUSH_BUCKET_UNIFORM: + return bucket_uniform_choose((struct crush_bucket_uniform *)in, + x, r); + case CRUSH_BUCKET_LIST: + return bucket_list_choose((struct crush_bucket_list *)in, + x, r); + case CRUSH_BUCKET_TREE: + return bucket_tree_choose((struct crush_bucket_tree *)in, + x, r); + case CRUSH_BUCKET_STRAW: + return bucket_straw_choose((struct crush_bucket_straw *)in, + x, r); + case CRUSH_BUCKET_LINEAR: + return bucket_linear_choose((struct crush_bucket_linear *)in, + x/balance_param, r); + default: + dprintk("unknown bucket %d alg %d\n", in->id, in->alg); + return in->items[0]; + } +} + /* * true if device is marked "out" (failed, fully offloaded) * of the cluster @@ -473,6 +506,166 @@ reject: return outpos; } +static int crush_choose_firstn_wrapper(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + unsigned int tries, + unsigned int recurse_tries, + unsigned int local_retries, + unsigned int local_fallback_retries, + int recurse_to_leaf, + int *out2, float balance_param) +{ + int rep; + unsigned int ftotal, flocal; + int retry_descent, retry_bucket, skip_rep; + struct crush_bucket *in = bucket; + int r; + int i; + int item = 0; + int itemtype; + int collide, reject; + + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); + + for (rep = outpos; rep < numrep; rep++) { + /* keep trying until we get a non-out, non-colliding item */ + ftotal = 0; + skip_rep = 0; + do { + retry_descent = 0; + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + flocal = 0; + do { + collide = 0; + retry_bucket = 0; + r = rep; + /* r' = r + f_total */ + r += ftotal; + + /* bucket choose */ + if (in->size == 0) { + reject = 1; + goto reject; + } + if (local_fallback_retries > 0 && + flocal >= (in->size>>1) && + flocal > local_fallback_retries) + item = bucket_perm_choose(in, x, r); + else + item = crush_bucket_choose_wrapper(in, x, r, balance_param); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + skip_rep = 1; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + skip_rep = 1; + break; + } + in = map->buckets[-1-item]; + retry_bucket = 1; + continue; + } + + /* collision? */ + for (i = 0; i < outpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + + reject = 0; + if (!collide && recurse_to_leaf) { + if (item < 0) { + if (crush_choose_firstn_wrapper(map, + map->buckets[-1-item], + weight, weight_max, + x, outpos+1, 0, + out2, outpos, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + NULL, balance_param) <= outpos) + /* didn't get leaf */ + reject = 1; + } else { + /* we already have a leaf! */ + out2[outpos] = item; + } + } + + if (!reject) { + /* out? */ + if (itemtype == 0) + reject = is_out(map, weight, + weight_max, + item, x); + else + reject = 0; + } + +reject: + if (reject || collide) { + ftotal++; + flocal++; + + if (collide && flocal <= local_retries) + /* retry locally a few times */ + retry_bucket = 1; + else if (local_fallback_retries > 0 && + flocal <= in->size + local_fallback_retries) + /* exhaustive bucket search */ + retry_bucket = 1; + else if (ftotal < tries) + /* then retry descent */ + retry_descent = 1; + else + /* else give up */ + skip_rep = 1; + dprintk(" reject %d collide %d " + "ftotal %u flocal %u\n", + reject, collide, ftotal, + flocal); + } + } while (retry_bucket); + } while (retry_descent); + + if (skip_rep) { + dprintk("skip rep\n"); + continue; + } + + dprintk("CHOOSE got %d\n", item); + out[outpos] = item; + outpos++; + + if (map->choose_tries && ftotal <= map->choose_total_tries) + map->choose_tries[ftotal]++; + } + + dprintk("CHOOSE returns %d\n", outpos); + return outpos; +} /** * crush_choose_indep: alternative breadth-first positionally stable mapping @@ -848,4 +1041,177 @@ int crush_do_rule(const struct crush_map *map, return result_len; } +int crush_do_rule_wrapper(const struct crush_map *map, + int ruleno, int x, int *result, int result_max, + const __u32 *weight, int weight_max, + int *scratch, + float balance_param) +{ + int result_len; + int *a = scratch; + int *b = scratch + result_max; + int *c = scratch + result_max*2; + int recurse_to_leaf; + int *w; + int wsize = 0; + int *o; + int osize; + int *tmp; + struct crush_rule *rule; + __u32 step; + int i, j; + int numrep; + /* + * the original choose_total_tries value was off by one (it + * counted "retries" and not "tries"). add one. + */ + int choose_tries = map->choose_total_tries + 1; + int choose_leaf_tries = 0; + /* + * the local tries values were counted as "retries", though, + * and need no adjustment + */ + int choose_local_retries = map->choose_local_tries; + int choose_local_fallback_retries = map->choose_local_fallback_tries; + + if ((__u32)ruleno >= map->max_rules) { + dprintk(" bad ruleno %d\n", ruleno); + return 0; + } + + rule = map->rules[ruleno]; + result_len = 0; + w = a; + o = b; + + for (step = 0; step < rule->len; step++) { + int firstn = 0; + struct crush_rule_step *curstep = &rule->steps[step]; + + switch (curstep->op) { + case CRUSH_RULE_TAKE: + w[0] = curstep->arg1; + wsize = 1; + break; + + case CRUSH_RULE_SET_CHOOSE_TRIES: + if (curstep->arg1 > 0) + choose_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: + if (curstep->arg1 > 0) + choose_leaf_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + if (curstep->arg1 >= 0) + choose_local_retries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + if (curstep->arg1 >= 0) + choose_local_fallback_retries = curstep->arg1; + break; + + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + case CRUSH_RULE_CHOOSE_FIRSTN: + firstn = 1; + /* fall through */ + case CRUSH_RULE_CHOOSELEAF_INDEP: + case CRUSH_RULE_CHOOSE_INDEP: + if (wsize == 0) + break; + + recurse_to_leaf = + curstep->op == + CRUSH_RULE_CHOOSELEAF_FIRSTN || + curstep->op == + CRUSH_RULE_CHOOSELEAF_INDEP; + + /* reset output */ + osize = 0; + + for (i = 0; i < wsize; i++) { + /* + * see CRUSH_N, CRUSH_N_MINUS macros. + * basically, numrep <= 0 means relative to + * the provided result_max + */ + numrep = curstep->arg1; + if (numrep <= 0) { + numrep += result_max; + if (numrep <= 0) + continue; + } + j = 0; + if (firstn) { + int recurse_tries; + if (choose_leaf_tries) + recurse_tries = + choose_leaf_tries; + else if (map->chooseleaf_descend_once) + recurse_tries = 1; + else + recurse_tries = choose_tries; + osize += crush_choose_firstn_wrapper( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + recurse_tries, + choose_local_retries, + choose_local_fallback_retries, + recurse_to_leaf, + c+osize, + balance_param); + } else { + crush_choose_indep( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + choose_leaf_tries ? + choose_leaf_tries : 1, + recurse_to_leaf, + c+osize, + 0); + osize += numrep; + } + } + + if (recurse_to_leaf) + /* copy final _leaf_ values to output set */ + memcpy(o, c, osize*sizeof(*o)); + + /* swap o and w arrays */ + tmp = o; + o = w; + w = tmp; + wsize = osize; + break; + + + case CRUSH_RULE_EMIT: + for (i = 0; i < wsize && result_len < result_max; i++) { + result[result_len] = w[i]; + result_len++; + } + wsize = 0; + break; + + default: + dprintk(" unknown op %d at step %d\n", + curstep->op, step); + break; + } + } + return result_len; +} diff --git a/src/crush/mapper.h b/src/crush/mapper.h index 5dfd5b1..0fa1b88 100644 --- a/src/crush/mapper.h +++ b/src/crush/mapper.h @@ -16,5 +16,10 @@ extern int crush_do_rule(const struct crush_map *map, int x, int *result, int result_max, const __u32 *weights, int weight_max, int *scratch); - +extern int crush_do_rule_wrapper(const struct crush_map *map, + int ruleno, + int x, int *result, int result_max, + const __u32 *weights, int weight_max, + int *scratch, + float balance_param); #endif diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index dd729f0..eb56bab 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -66,6 +66,27 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) { << ").osd e" << osdmap.get_epoch() << " "; } +float cal_average(vector dataset, int length) { + float avg = 0; + float sum = 0; + for(int i = 0; i < length; i++){ + sum += dataset[i]; + } + avg = sum/length; + return avg; +} + +float cal_stdev(vector dataset, int length) { + float avg = cal_average(dataset, length); + float sum = 0; + float stdev; + for(int i = 0; i < length; i++){ + sum += pow(dataset[i] - avg, 2); + } + stdev = pow(sum/length, 0.5); + return stdev; +} + bool OSDMonitor::_have_pending_crush() { return pending_inc.crush.length(); @@ -3150,6 +3171,43 @@ int OSDMonitor::prepare_new_pool(MPoolOp *m) pg_pool_t::TYPE_REPLICATED, 0, ss); } +void OSDMonitor::prepare_adaptive_balance_param(pg_pool_t *pi, int64_t pool) +{ + float min_stdev = 999999; + float balance_param = 1; + + for (int k = 1; k < 5; k++) { + map osd_total_pgs; + pi->set_balance_param((float)k); + for (int i = 0; i < (int)pi->get_pg_num(); i++) { + pg_t pg(i, pool); + pg.set_pool(pool); + vector acting; + int nrep = osdmap.pg_to_up_osds_adaptive(*pi, pg, acting); + if (nrep) { + for (int j = 0; j < nrep; j++) { + osd_total_pgs[acting[j]]++; + } + } + } + + vector pg_num; + for (map::iterator ptr = osd_total_pgs.begin(); + ptr != osd_total_pgs.end(); + ++ptr) { + pg_num.push_back(ptr->second); + } + + float stdev = cal_stdev(pg_num, pg_num.size()); + if (stdev < min_stdev) { + min_stdev = stdev; + balance_param = k; + } + } + + pi->set_balance_param(balance_param); +} + int OSDMonitor::crush_ruleset_create_erasure(const string &name, const string &profile, int *ruleset, @@ -3527,6 +3585,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, g_conf->osd_pool_default_cache_target_full_ratio * 1000000; pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age; pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age; + prepare_adaptive_balance_param(pi, pool); pending_inc.new_pool_names[pool] = name; return 0; } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 650c55e..eabe06d 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -253,6 +253,7 @@ private: bool prepare_pool_op (MPoolOp *m); bool prepare_pool_op_create (MPoolOp *m); bool prepare_pool_op_delete(MPoolOp *m); + void prepare_adaptive_balance_param(pg_pool_t *pi, int64_t pool); int crush_ruleset_create_erasure(const string &name, const string &profile, int *ruleset, diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index ada0909..118f1ce 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1419,13 +1419,13 @@ int OSDMap::_pg_to_osds(const pg_pool_t& pool, pg_t pg, ps_t *ppps) const { // map to osds[] - ps_t pps = pool.raw_pg_to_pps(pg); // placement ps + ps_t pps = pool.raw_pg_to_congruential_pps(pg); // placement ps unsigned size = pool.get_size(); // what crush rule? int ruleno = crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), size); if (ruleno >= 0) - crush->do_rule(ruleno, pps, *osds, size, osd_weight); + crush->do_rule(ruleno, pps, *osds, size, osd_weight, pool.get_balance_param()); _remove_nonexistent_osds(pool, *osds); @@ -1583,6 +1583,15 @@ void OSDMap::pg_to_raw_up(pg_t pg, vector *up, int *primary) const _raw_to_up_osds(*pool, raw, up, primary); _apply_primary_affinity(pps, *pool, up, primary); } + +int OSDMap::pg_to_up_osds_adaptive(const pg_pool_t& pool, pg_t pg, vector& up) const +{ + int primary; + vector raw; + _pg_to_osds(pool, pg, &raw, &primary, NULL); + _raw_to_up_osds(pool, raw, &up, &primary); + return up.size(); +} void OSDMap::_pg_to_up_acting_osds(const pg_t& pg, vector *up, int *up_primary, vector *acting, int *acting_primary) const diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 983a44d..5f1addd 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -642,6 +642,7 @@ public: int r = pg_to_acting_osds(pg, &acting, &primary); return r; } + int pg_to_up_osds_adaptive(const pg_pool_t& pool, pg_t pg, vector& up) const; /** * This does not apply temp overrides and should not be used * by anybody for data mapping purposes. Specify both pointers. diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 1dc2af1..5017005 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -770,6 +770,7 @@ void pg_pool_t::dump(Formatter *f) const f->dump_int("min_size", get_min_size()); f->dump_int("crush_ruleset", get_crush_ruleset()); f->dump_int("object_hash", get_object_hash()); + f->dump_float("balance_param", get_balance_param()); f->dump_int("pg_num", get_pg_num()); f->dump_int("pg_placement_num", get_pgp_num()); f->dump_unsigned("crash_replay_interval", get_crash_replay_interval()); @@ -993,6 +994,23 @@ ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const } } +ps_t pg_pool_t::raw_pg_to_congruential_pps(pg_t pg) const +{ + if (flags & FLAG_HASHPSPOOL) { + // Shuffles the original pgid sequence, with poolid being the seed + ps_t stable = ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask); + ps_t pps = (1033*stable + 2*pg.pool() + 1) % pgp_num_mask + pg.pool(); + return pps; + } else { + // Legacy behavior; add ps and pool together. This is not a great + // idea because the PGs from each pool will essentially overlap on + // top of each other: 0.5 == 1.4 == 2.3 == ... + return + ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) + + pg.pool(); + } +} + uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const { uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123); @@ -1020,6 +1038,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(size, bl); ::encode(crush_ruleset, bl); ::encode(object_hash, bl); + ::encode(balance_param, bl); ::encode(pg_num, bl); ::encode(pgp_num, bl); __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. @@ -1048,6 +1067,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(size, bl); ::encode(crush_ruleset, bl); ::encode(object_hash, bl); + ::encode(balance_param, bl); ::encode(pg_num, bl); ::encode(pgp_num, bl); __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. @@ -1075,6 +1095,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(size, bl); ::encode(crush_ruleset, bl); ::encode(object_hash, bl); + ::encode(balance_param, bl); ::encode(pg_num, bl); ::encode(pgp_num, bl); __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. @@ -1118,6 +1139,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const ::encode(size, bl); ::encode(crush_ruleset, bl); ::encode(object_hash, bl); + ::encode(balance_param, bl); ::encode(pg_num, bl); ::encode(pgp_num, bl); __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs. @@ -1165,6 +1187,7 @@ void pg_pool_t::decode(bufferlist::iterator& bl) ::decode(size, bl); ::decode(crush_ruleset, bl); ::decode(object_hash, bl); + ::decode(balance_param, bl); ::decode(pg_num, bl); ::decode(pgp_num, bl); { @@ -1286,6 +1309,7 @@ void pg_pool_t::generate_test_instances(list& o) a.size = 2; a.crush_ruleset = 3; a.object_hash = 4; + a.balance_param = 1.0; a.pg_num = 6; a.pgp_num = 5; a.last_change = 9; @@ -1338,6 +1362,7 @@ ostream& operator<<(ostream& out, const pg_pool_t& p) << " min_size " << p.get_min_size() << " crush_ruleset " << p.get_crush_ruleset() << " object_hash " << p.get_object_hash_name() + << " balance_param " << p.get_balance_param() << " pg_num " << p.get_pg_num() << " pgp_num " << p.get_pgp_num() << " last_change " << p.get_last_change(); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 4dab643..b9c255c 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -897,6 +897,7 @@ struct pg_pool_t { __u8 size, min_size; ///< number of osds in each pg __u8 crush_ruleset; ///< crush placement rule set __u8 object_hash; ///< hash mapping object name to ps + float balance_param; private: __u32 pg_num, pgp_num; ///< number of pgs @@ -984,7 +985,7 @@ public: pg_pool_t() : flags(0), type(0), size(0), min_size(0), - crush_ruleset(0), object_hash(0), + crush_ruleset(0), object_hash(0), balance_param(1), pg_num(0), pgp_num(0), last_change(0), last_force_op_resend(0), @@ -1031,6 +1032,7 @@ public: unsigned get_min_size() const { return min_size; } int get_crush_ruleset() const { return crush_ruleset; } int get_object_hash() const { return object_hash; } + float get_balance_param() const { return balance_param; } const char *get_object_hash_name() const { return ceph_str_hash_name(get_object_hash()); } @@ -1070,6 +1072,8 @@ public: unsigned get_pg_num_mask() const { return pg_num_mask; } unsigned get_pgp_num_mask() const { return pgp_num_mask; } + void set_balance_param(float b) { balance_param = b; } + // if pg_num is not a multiple of two, pgs are not equally sized. // return, for a given pg, the fraction (denominator) of the total // pool size that it represents. @@ -1145,6 +1149,7 @@ public: * seeds. */ ps_t raw_pg_to_pps(pg_t pg) const; + ps_t raw_pg_to_congruential_pps(pg_t pg) const; /// choose a random hash position within a pg uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const; diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc index 08818c4..cd93151 100644 --- a/src/tools/crushtool.cc +++ b/src/tools/crushtool.cc @@ -91,7 +91,7 @@ void usage() cout << " specify output for for (de)compilation\n"; cout << " --build --num_osds N layer1 ...\n"; cout << " build a new map, where each 'layer' is\n"; - cout << " 'name (uniform|straw|list|tree) size'\n"; + cout << " 'name (uniform|straw|list|tree|linear) size'\n"; cout << " -i mapfn --test test a range of inputs on the map\n"; cout << " [--min-x x] [--max-x x] [--x x]\n"; cout << " [--min-rule r] [--max-rule r] [--rule r]\n"; @@ -148,6 +148,7 @@ struct bucket_types_t { { "list", CRUSH_BUCKET_LIST }, { "straw", CRUSH_BUCKET_STRAW }, { "tree", CRUSH_BUCKET_TREE }, + { "linear", CRUSH_BUCKET_LINEAR }, { 0, 0 }, };