1
|
|
2
|
|
3
|
|
4
|
|
5
|
|
6
|
|
7
|
|
8
|
|
9
|
|
10
|
|
11
|
|
12
|
|
13
|
|
14
|
|
15
|
|
16
|
|
17
|
|
18
|
|
19
|
#include <sstream>
|
20
|
|
21
|
#include "OSDMonitor.h"
|
22
|
#include "Monitor.h"
|
23
|
#include "MDSMonitor.h"
|
24
|
#include "PGMonitor.h"
|
25
|
|
26
|
#include "MonitorDBStore.h"
|
27
|
|
28
|
#include "crush/CrushWrapper.h"
|
29
|
#include "crush/CrushTester.h"
|
30
|
#include "crush/CrushTreeDumper.h"
|
31
|
|
32
|
#include "messages/MOSDFailure.h"
|
33
|
#include "messages/MOSDMarkMeDown.h"
|
34
|
#include "messages/MOSDMap.h"
|
35
|
#include "messages/MMonGetOSDMap.h"
|
36
|
#include "messages/MOSDBoot.h"
|
37
|
#include "messages/MOSDAlive.h"
|
38
|
#include "messages/MPoolOp.h"
|
39
|
#include "messages/MPoolOpReply.h"
|
40
|
#include "messages/MOSDPGTemp.h"
|
41
|
#include "messages/MMonCommand.h"
|
42
|
#include "messages/MRemoveSnaps.h"
|
43
|
#include "messages/MOSDScrub.h"
|
44
|
|
45
|
#include "common/TextTable.h"
|
46
|
#include "common/Timer.h"
|
47
|
#include "common/ceph_argparse.h"
|
48
|
#include "common/perf_counters.h"
|
49
|
#include "common/strtol.h"
|
50
|
|
51
|
#include "common/config.h"
|
52
|
#include "common/errno.h"
|
53
|
|
54
|
#include "erasure-code/ErasureCodePlugin.h"
|
55
|
|
56
|
#include "include/compat.h"
|
57
|
#include "include/assert.h"
|
58
|
#include "include/stringify.h"
|
59
|
#include "include/util.h"
|
60
|
#include "common/cmdparse.h"
|
61
|
#include "include/str_list.h"
|
62
|
#include "include/str_map.h"
|
63
|
|
64
|
#define dout_subsys ceph_subsys_mon
|
65
|
#undef dout_prefix
|
66
|
#define dout_prefix _prefix(_dout, mon, osdmap)
|
67
|
// Prefix every dout/derr line from this file with
// "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) {
  std::ostream& out = *_dout;
  out << "mon." << mon->name << "@" << mon->rank
      << "(" << mon->get_state_name()
      << ").osd e" << osdmap.get_epoch() << " ";
  return out;
}
|
72
|
|
73
|
bool OSDMonitor::_have_pending_crush()
|
74
|
{
|
75
|
return pending_inc.crush.length();
|
76
|
}
|
77
|
|
78
|
// Return the crush map from the last committed osdmap (ignoring any
// crush change staged in pending_inc).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
|
82
|
|
83
|
// Decode into @newcrush the crush map a new pending change should start
// from: the one already staged in pending_inc if any, otherwise the
// committed map's crush.
void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
{
  bufferlist encoded;
  if (pending_inc.crush.length() == 0) {
    // nothing staged yet; start from the committed crush map
    osdmap.crush->encode(encoded);
  } else {
    encoded = pending_inc.crush;
  }

  bufferlist::iterator it = encoded.begin();
  newcrush.decode(it);
}
|
94
|
|
95
|
void OSDMonitor::create_initial()
|
96
|
{
|
97
|
dout(10) << "create_initial for " << mon->monmap->fsid << dendl;
|
98
|
|
99
|
OSDMap newmap;
|
100
|
|
101
|
bufferlist bl;
|
102
|
mon->store->get("mkfs", "osdmap", bl);
|
103
|
|
104
|
if (bl.length()) {
|
105
|
newmap.decode(bl);
|
106
|
newmap.set_fsid(mon->monmap->fsid);
|
107
|
} else {
|
108
|
newmap.build_simple(g_ceph_context, 0, mon->monmap->fsid, 0,
|
109
|
g_conf->osd_pg_bits, g_conf->osd_pgp_bits);
|
110
|
}
|
111
|
newmap.set_epoch(1);
|
112
|
newmap.created = newmap.modified = ceph_clock_now(g_ceph_context);
|
113
|
|
114
|
|
115
|
newmap.encode(pending_inc.fullmap, mon->quorum_features | CEPH_FEATURE_RESERVED);
|
116
|
pending_inc.full_crc = newmap.get_crc();
|
117
|
dout(20) << " full crc " << pending_inc.full_crc << dendl;
|
118
|
}
|
119
|
|
120
|
// Catch our in-memory osdmap up to the latest paxos-committed version.
// Loads the most recent stored full map, then applies each committed
// incremental in order, persisting the re-encoded full map (or verifying
// its crc) as it goes. NOTE(review): need_bootstrap is not used in this
// path — presumably reserved for the PaxosService interface; confirm.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;  // already up to date
  assert(version >= osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // Find the latest full map we have on disk; if the recorded pointer is
  // missing/stale, fall back to the first committed version.
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (latest_full > 0) {
    // sanity-check that the blob the pointer references actually exists
    bufferlist test;
    get_version_full(latest_full, test);
    if (test.length() == 0) {
      dout(10) << __func__ << " ignoring recorded latest_full as it is missing; fallback to search" << dendl;
      latest_full = 0;
    }
  }
  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // The store has been trimmed past our recorded full map. Scan backwards
    // through the committed interval for the newest "full_<v>" key and
    // persist that as the new latest-full pointer.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    // a trimmed store must always retain at least one full map
    assert(latest_full > 0);
    MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  // Jump straight to the latest full map if it is ahead of us.
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap.decode(latest_bl);
  }

  // Walk forward through the remaining incrementals, re-encoding the full
  // map at each epoch. Writes are batched in a transaction and flushed
  // whenever they exceed roughly twice the sync payload size.
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    assert(err == 0);
    assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Re-encode the full map with the features the incremental was encoded
    // with (falling back to quorum features, then "everything").
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->quorum_features;
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // A canonical full map for this epoch already exists (written by the
      // leader); verify our locally-built map against its crc and, on
      // mismatch, discard ours and adopt the canonical encoding.
      assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;
	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);
      }
    } else {
      // no stored full map for this epoch yet; ours becomes canonical
      assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share with the dout world
    dout(1) << osdmap << dendl;

    // the mkfs seed map is no longer needed once epoch 1 is committed
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    if (tx_size > g_conf->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // Bookkeeping for osds that are now down: forget their last-known epoch
  // and start the auto-mark-out countdown for those still "in".
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_down(o)) {
      osd_epoch.erase(o);

      if (osdmap.is_in(o) &&
	  down_pending_out.count(o) == 0) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now(g_ceph_context);
      }
    }
  }

  // drop epoch records for osd ids beyond the (possibly shrunk) max_osd
  map<int,epoch_t>::iterator p = osd_epoch.upper_bound(osdmap.get_max_osd());
  while (p != osd_epoch.end()) {
    osd_epoch.erase(p++);
  }

  // primary_temp entries may only exist when the option allowing them is set
  assert(g_conf->mon_osd_allow_primary_temp || osdmap.primary_temp->empty());

  if (mon->is_leader()) {
    // kick pgmon, make sure it's seen the latest map
    mon->pgmon()->check_osd_map(osdmap.epoch);
  }

  check_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  update_msgr_features();
}
|
307
|
|
308
|
void OSDMonitor::update_msgr_features()
|
309
|
{
|
310
|
set<int> types;
|
311
|
types.insert((int)entity_name_t::TYPE_OSD);
|
312
|
types.insert((int)entity_name_t::TYPE_CLIENT);
|
313
|
types.insert((int)entity_name_t::TYPE_MDS);
|
314
|
types.insert((int)entity_name_t::TYPE_MON);
|
315
|
for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
|
316
|
uint64_t mask;
|
317
|
uint64_t features = osdmap.get_features(*q, &mask);
|
318
|
if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
|
319
|
dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
|
320
|
Messenger::Policy p = mon->messenger->get_policy(*q);
|
321
|
p.features_required = (p.features_required & ~mask) | features;
|
322
|
mon->messenger->set_policy(*q, p);
|
323
|
}
|
324
|
}
|
325
|
}
|
326
|
|
327
|
bool OSDMonitor::thrash()
|
328
|
{
|
329
|
if (!thrash_map)
|
330
|
return false;
|
331
|
|
332
|
thrash_map--;
|
333
|
int o;
|
334
|
|
335
|
|
336
|
if (rand() % 4 == 0 || thrash_last_up_osd < 0)
|
337
|
o = rand() % osdmap.get_num_osds();
|
338
|
else
|
339
|
o = thrash_last_up_osd;
|
340
|
if (osdmap.is_up(o)) {
|
341
|
dout(5) << "thrash_map osd." << o << " up_thru" << dendl;
|
342
|
pending_inc.new_up_thru[o] = osdmap.get_epoch();
|
343
|
}
|
344
|
|
345
|
|
346
|
o = rand() % osdmap.get_num_osds();
|
347
|
if (osdmap.is_up(o)) {
|
348
|
dout(5) << "thrash_map osd." << o << " down" << dendl;
|
349
|
pending_inc.new_state[o] = CEPH_OSD_UP;
|
350
|
} else if (osdmap.exists(o)) {
|
351
|
dout(5) << "thrash_map osd." << o << " up" << dendl;
|
352
|
pending_inc.new_state[o] = CEPH_OSD_UP;
|
353
|
pending_inc.new_up_client[o] = entity_addr_t();
|
354
|
pending_inc.new_up_cluster[o] = entity_addr_t();
|
355
|
pending_inc.new_hb_back_up[o] = entity_addr_t();
|
356
|
pending_inc.new_weight[o] = CEPH_OSD_IN;
|
357
|
thrash_last_up_osd = o;
|
358
|
}
|
359
|
|
360
|
|
361
|
o = rand() % osdmap.get_num_osds();
|
362
|
if (osdmap.exists(o)) {
|
363
|
dout(5) << "thrash_map osd." << o << " in" << dendl;
|
364
|
pending_inc.new_weight[o] = CEPH_OSD_IN;
|
365
|
}
|
366
|
|
367
|
|
368
|
o = rand() % osdmap.get_num_osds();
|
369
|
if (osdmap.exists(o)) {
|
370
|
dout(5) << "thrash_map osd." << o << " out" << dendl;
|
371
|
pending_inc.new_weight[o] = CEPH_OSD_OUT;
|
372
|
}
|
373
|
|
374
|
|
375
|
|
376
|
int n = rand() % mon->pgmon()->pg_map.pg_stat.size();
|
377
|
ceph::unordered_map<pg_t,pg_stat_t>::iterator p = mon->pgmon()->pg_map.pg_stat.begin();
|
378
|
ceph::unordered_map<pg_t,pg_stat_t>::iterator e = mon->pgmon()->pg_map.pg_stat.end();
|
379
|
while (n--)
|
380
|
++p;
|
381
|
for (int i=0; i<50; i++) {
|
382
|
unsigned size = osdmap.get_pg_size(p->first);
|
383
|
vector<int> v;
|
384
|
bool have_real_osd = false;
|
385
|
for (int j=0; j < (int)size; j++) {
|
386
|
o = rand() % osdmap.get_num_osds();
|
387
|
if (osdmap.exists(o) && std::find(v.begin(), v.end(), o) == v.end()) {
|
388
|
have_real_osd = true;
|
389
|
v.push_back(o);
|
390
|
}
|
391
|
}
|
392
|
for (vector<int>::iterator q = p->second.acting.begin();
|
393
|
q != p->second.acting.end() && v.size() < size;
|
394
|
++q) {
|
395
|
if (std::find(v.begin(), v.end(), *q) == v.end()) {
|
396
|
if (*q != CRUSH_ITEM_NONE)
|
397
|
have_real_osd = true;
|
398
|
v.push_back(*q);
|
399
|
}
|
400
|
}
|
401
|
if (osdmap.pg_is_ec(p->first)) {
|
402
|
while (v.size() < size)
|
403
|
v.push_back(CRUSH_ITEM_NONE);
|
404
|
}
|
405
|
if (!v.empty() && have_real_osd)
|
406
|
pending_inc.new_pg_temp[p->first] = v;
|
407
|
dout(5) << "thrash_map pg " << p->first << " pg_temp remapped to " << v << dendl;
|
408
|
|
409
|
++p;
|
410
|
if (p == e)
|
411
|
p = mon->pgmon()->pg_map.pg_stat.begin();
|
412
|
}
|
413
|
return true;
|
414
|
}
|
415
|
|
416
|
void OSDMonitor::on_active()
|
417
|
{
|
418
|
update_logger();
|
419
|
|
420
|
if (thrash_map) {
|
421
|
if (mon->is_leader()) {
|
422
|
if (thrash())
|
423
|
propose_pending();
|
424
|
} else {
|
425
|
thrash_map = 0;
|
426
|
}
|
427
|
}
|
428
|
|
429
|
if (mon->is_leader())
|
430
|
mon->clog->info() << "osdmap " << osdmap << "\n";
|
431
|
|
432
|
if (!mon->is_leader()) {
|
433
|
list<MOSDFailure*> ls;
|
434
|
take_all_failures(ls);
|
435
|
while (!ls.empty()) {
|
436
|
dispatch(ls.front());
|
437
|
ls.pop_front();
|
438
|
}
|
439
|
}
|
440
|
}
|
441
|
|
442
|
void OSDMonitor::on_shutdown()
|
443
|
{
|
444
|
dout(10) << __func__ << dendl;
|
445
|
|
446
|
|
447
|
list<MOSDFailure*> ls;
|
448
|
take_all_failures(ls);
|
449
|
while (!ls.empty()) {
|
450
|
ls.front()->put();
|
451
|
ls.pop_front();
|
452
|
}
|
453
|
}
|
454
|
|
455
|
void OSDMonitor::update_logger()
|
456
|
{
|
457
|
dout(10) << "update_logger" << dendl;
|
458
|
|
459
|
mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
|
460
|
mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
|
461
|
mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
|
462
|
mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
|
463
|
}
|
464
|
|
465
|
|
466
|
|
467
|
|
468
|
|
469
|
|
470
|
int OSDMonitor::reweight_by_utilization(int oload, std::string& out_str,
|
471
|
bool by_pg, const set<int64_t> *pools)
|
472
|
{
|
473
|
if (oload <= 100) {
|
474
|
ostringstream oss;
|
475
|
oss << "You must give a percentage higher than 100. "
|
476
|
"The reweighting threshold will be calculated as <average-utilization> "
|
477
|
"times <input-percentage>. For example, an argument of 200 would "
|
478
|
"reweight OSDs which are twice as utilized as the average OSD.\n";
|
479
|
out_str = oss.str();
|
480
|
return -EINVAL;
|
481
|
}
|
482
|
|
483
|
const PGMap &pgm = mon->pgmon()->pg_map;
|
484
|
vector<int> pgs_by_osd(osdmap.get_max_osd());
|
485
|
|
486
|
|
487
|
|
488
|
double average_util;
|
489
|
if (by_pg) {
|
490
|
|
491
|
double weight_sum = 0.0;
|
492
|
unsigned num_pg_copies = 0;
|
493
|
int num_osds = 0;
|
494
|
for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p =
|
495
|
pgm.pg_stat.begin();
|
496
|
p != pgm.pg_stat.end();
|
497
|
++p) {
|
498
|
if (pools && pools->count(p->first.pool()) == 0)
|
499
|
continue;
|
500
|
for (vector<int>::const_iterator q = p->second.acting.begin();
|
501
|
q != p->second.acting.end();
|
502
|
++q) {
|
503
|
if (*q >= (int)pgs_by_osd.size())
|
504
|
pgs_by_osd.resize(*q);
|
505
|
if (pgs_by_osd[*q] == 0) {
|
506
|
weight_sum += osdmap.crush->get_item_weightf(*q);
|
507
|
++num_osds;
|
508
|
}
|
509
|
++pgs_by_osd[*q];
|
510
|
++num_pg_copies;
|
511
|
}
|
512
|
}
|
513
|
|
514
|
if (!num_osds || (num_pg_copies / num_osds < g_conf->mon_reweight_min_pgs_per_osd)) {
|
515
|
ostringstream oss;
|
516
|
oss << "Refusing to reweight: we only have " << num_pg_copies
|
517
|
<< " PGs across " << num_osds << " osds!\n";
|
518
|
out_str = oss.str();
|
519
|
return -EDOM;
|
520
|
}
|
521
|
|
522
|
average_util = (double)num_pg_copies / weight_sum;
|
523
|
} else {
|
524
|
|
525
|
int num_osd = MIN(1, pgm.osd_stat.size());
|
526
|
if ((uint64_t)pgm.osd_sum.kb * 1024 / num_osd
|
527
|
< g_conf->mon_reweight_min_bytes_per_osd) {
|
528
|
ostringstream oss;
|
529
|
oss << "Refusing to reweight: we only have " << pgm.osd_sum.kb
|
530
|
<< " kb across all osds!\n";
|
531
|
out_str = oss.str();
|
532
|
return -EDOM;
|
533
|
}
|
534
|
if ((uint64_t)pgm.osd_sum.kb_used * 1024 / num_osd
|
535
|
< g_conf->mon_reweight_min_bytes_per_osd) {
|
536
|
ostringstream oss;
|
537
|
oss << "Refusing to reweight: we only have " << pgm.osd_sum.kb_used
|
538
|
<< " kb used across all osds!\n";
|
539
|
out_str = oss.str();
|
540
|
return -EDOM;
|
541
|
}
|
542
|
|
543
|
average_util = (double)pgm.osd_sum.kb_used / (double)pgm.osd_sum.kb;
|
544
|
}
|
545
|
|
546
|
|
547
|
double overload_util = average_util * (double)oload / 100.0;
|
548
|
|
549
|
|
550
|
double underload_util = average_util;
|
551
|
|
552
|
ostringstream oss;
|
553
|
char buf[128];
|
554
|
snprintf(buf, sizeof(buf), "average %04f, overload %04f. ",
|
555
|
average_util, overload_util);
|
556
|
oss << buf;
|
557
|
std::string sep;
|
558
|
oss << "reweighted: ";
|
559
|
bool changed = false;
|
560
|
for (ceph::unordered_map<int,osd_stat_t>::const_iterator p =
|
561
|
pgm.osd_stat.begin();
|
562
|
p != pgm.osd_stat.end();
|
563
|
++p) {
|
564
|
float util;
|
565
|
if (by_pg) {
|
566
|
util = pgs_by_osd[p->first] / osdmap.crush->get_item_weightf(p->first);
|
567
|
} else {
|
568
|
util = (double)p->second.kb_used / (double)p->second.kb;
|
569
|
}
|
570
|
if (util >= overload_util) {
|
571
|
sep = ", ";
|
572
|
|
573
|
|
574
|
|
575
|
unsigned weight = osdmap.get_weight(p->first);
|
576
|
unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
|
577
|
pending_inc.new_weight[p->first] = new_weight;
|
578
|
char buf[128];
|
579
|
snprintf(buf, sizeof(buf), "osd.%d [%04f -> %04f]", p->first,
|
580
|
(float)weight / (float)0x10000,
|
581
|
(float)new_weight / (float)0x10000);
|
582
|
oss << buf << sep;
|
583
|
changed = true;
|
584
|
}
|
585
|
if (util <= underload_util) {
|
586
|
|
587
|
unsigned weight = osdmap.get_weight(p->first);
|
588
|
unsigned new_weight = (unsigned)((average_util / util) * (float)weight);
|
589
|
if (new_weight > 0x10000)
|
590
|
new_weight = 0x10000;
|
591
|
if (new_weight > weight) {
|
592
|
sep = ", ";
|
593
|
pending_inc.new_weight[p->first] = new_weight;
|
594
|
char buf[128];
|
595
|
snprintf(buf, sizeof(buf), "osd.%d [%04f -> %04f]", p->first,
|
596
|
(float)weight / (float)0x10000,
|
597
|
(float)new_weight / (float)0x10000);
|
598
|
oss << buf << sep;
|
599
|
changed = true;
|
600
|
}
|
601
|
}
|
602
|
}
|
603
|
if (sep.empty()) {
|
604
|
oss << "(none)";
|
605
|
}
|
606
|
out_str = oss.str();
|
607
|
dout(10) << "reweight_by_utilization: finished with " << out_str << dendl;
|
608
|
return changed;
|
609
|
}
|
610
|
|
611
|
template <typename F>
|
612
|
class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
|
613
|
public:
|
614
|
typedef CrushTreeDumper::Dumper<F> Parent;
|
615
|
|
616
|
OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
|
617
|
const PGMap *pgm_, bool tree_) :
|
618
|
Parent(crush),
|
619
|
osdmap(osdmap_),
|
620
|
pgm(pgm_),
|
621
|
tree(tree_),
|
622
|
average_util(0),
|
623
|
min_var(-1),
|
624
|
max_var(-1),
|
625
|
stddev(0),
|
626
|
sum(0) {
|
627
|
if (pgm->osd_sum.kb)
|
628
|
average_util = 100.0 * (double)pgm->osd_sum.kb_used / (double)pgm->osd_sum.kb;
|
629
|
}
|
630
|
|
631
|
protected:
|
632
|
void dump_stray(F *f) {
|
633
|
for (int i = 0; i <= osdmap->get_max_osd(); i++) {
|
634
|
if (osdmap->exists(i) && !this->is_touched(i))
|
635
|
dump_item(CrushTreeDumper::Item(i, 0, 0), f);
|
636
|
}
|
637
|
}
|
638
|
|
639
|
virtual void dump_item(const CrushTreeDumper::Item &qi, F *f) {
|
640
|
if (!tree && qi.is_bucket())
|
641
|
return;
|
642
|
|
643
|
float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
|
644
|
int64_t kb = 0, kb_used = 0, kb_avail = 0;
|
645
|
double util = 0;
|
646
|
if (get_bucket_utilization(qi.id, kb, kb_used, kb_avail) && kb > 0)
|
647
|
util = 100.0 * (double)kb_used / (double)kb;
|
648
|
double var = 1.0;
|
649
|
if (average_util)
|
650
|
var = util / average_util;
|
651
|
|
652
|
dump_item(qi, reweight, kb, kb_used, kb_avail, util, var, f);
|
653
|
|
654
|
if (!qi.is_bucket()) {
|
655
|
if (min_var < 0 || var < min_var) min_var = var;
|
656
|
if (max_var < 0 || var > max_var) max_var = var;
|
657
|
|
658
|
double dev = util - average_util;
|
659
|
dev *= dev;
|
660
|
stddev += reweight * dev;
|
661
|
sum += reweight;
|
662
|
}
|
663
|
}
|
664
|
|
665
|
virtual void dump_item(const CrushTreeDumper::Item &qi, float &reweight,
|
666
|
int64_t kb, int64_t kb_used, int64_t kb_avail,
|
667
|
double& util, double& var, F *f) = 0;
|
668
|
|
669
|
double dev() {
|
670
|
return sum > 0 ? sqrt(stddev / sum) : 0;
|
671
|
}
|
672
|
|
673
|
bool get_bucket_utilization(int id, int64_t& kb, int64_t& kb_used,
|
674
|
int64_t& kb_avail) const {
|
675
|
if (id >= 0) {
|
676
|
typedef ceph::unordered_map<int32_t,osd_stat_t> OsdStat;
|
677
|
|
678
|
OsdStat::const_iterator p = pgm->osd_stat.find(id);
|
679
|
|
680
|
if (p == pgm->osd_stat.end())
|
681
|
return false;
|
682
|
|
683
|
kb = p->second.kb;
|
684
|
kb_used = p->second.kb_used;
|
685
|
kb_avail = p->second.kb_avail;
|
686
|
return kb > 0;
|
687
|
}
|
688
|
|
689
|
kb = 0;
|
690
|
kb_used = 0;
|
691
|
kb_avail = 0;
|
692
|
|
693
|
for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
|
694
|
int item = osdmap->crush->get_bucket_item(id, k);
|
695
|
int64_t kb_i = 0, kb_used_i = 0, kb_avail_i;
|
696
|
if (!get_bucket_utilization(item, kb_i, kb_used_i, kb_avail_i))
|
697
|
return false;
|
698
|
kb += kb_i;
|
699
|
kb_used += kb_used_i;
|
700
|
kb_avail += kb_avail_i;
|
701
|
}
|
702
|
return kb > 0;
|
703
|
}
|
704
|
|
705
|
protected:
|
706
|
const OSDMap *osdmap;
|
707
|
const PGMap *pgm;
|
708
|
bool tree;
|
709
|
double average_util;
|
710
|
double min_var;
|
711
|
double max_var;
|
712
|
double stddev;
|
713
|
double sum;
|
714
|
};
|
715
|
|
716
|
// Renders "ceph osd df" as a plain-text table.
class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
public:
  typedef OSDUtilizationDumper<TextTable> Parent;

  OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
			    const PGMap *pgm, bool tree) :
    Parent(crush, osdmap, pgm, tree) {}

  void dump(TextTable *tbl) {
    tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("USE", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
    tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
    if (tree)
      tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);

    Parent::dump(tbl);

    dump_stray(tbl);

    // totals row; stats are in kb, si_t expects bytes, hence "<< 10".
    // BUGFIX: the SIZE total was missing the "<< 10" shift the other size
    // columns use, under-reporting it by a factor of 1024.
    *tbl << "" << "" << "TOTAL"
	 << si_t(pgm->osd_sum.kb << 10)
	 << si_t(pgm->osd_sum.kb_used << 10)
	 << si_t(pgm->osd_sum.kb_avail << 10)
	 << lowprecision_t(average_util)
	 << ""
	 << TextTable::endrow;
  }

protected:
  // wrapper that prints floats with two decimals (see operator<< below)
  struct lowprecision_t {
    float v;
    lowprecision_t(float _v) : v(_v) {}
  };
  friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);

  virtual void dump_item(const CrushTreeDumper::Item &qi, float &reweight,
			 int64_t kb, int64_t kb_used, int64_t kb_avail,
			 double& util, double& var, TextTable *tbl) {
    *tbl << qi.id
	 << weightf_t(qi.weight)
	 << weightf_t(reweight)
	 << si_t(kb << 10)
	 << si_t(kb_used << 10)
	 << si_t(kb_avail << 10)
	 << lowprecision_t(util)
	 << lowprecision_t(var);

    if (tree) {
      // indent by depth, then "<type> <name>" for buckets or "osd.N"
      ostringstream name;
      for (int k = 0; k < qi.depth; k++)
	name << "    ";
      if (qi.is_bucket()) {
	int type = crush->get_bucket_type(qi.id);
	name << crush->get_type_name(type) << " "
	     << crush->get_item_name(qi.id);
      } else {
	name << "osd." << qi.id;
      }
      *tbl << name.str();
    }

    *tbl << TextTable::endrow;
  }

public:
  // one-line footer with variance spread and stddev
  string summary() {
    ostringstream out;
    out << "MIN/MAX VAR: " << lowprecision_t(min_var)
	<< "/" << lowprecision_t(max_var) << " "
	<< "STDDEV: " << lowprecision_t(dev());
    return out.str();
  }
};
|
794
|
|
795
|
// Print a lowprecision_t: "-" for clearly negative (unset) values, "0" for
// effectively-zero values, otherwise two decimal places.
ostream& operator<<(ostream& out,
		    const OSDUtilizationPlainDumper::lowprecision_t& v)
{
  if (v.v < -0.01)
    return out << "-";
  if (v.v < 0.001)
    return out << "0";

  std::streamsize old_precision = out.precision();
  out << std::fixed << std::setprecision(2) << v.v
      << std::setprecision(old_precision);
  return out;
}
|
807
|
|
808
|
// Renders "ceph osd df" through a Formatter (json/xml); field order below
// is part of the emitted output, so it must not be rearranged.
class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
public:
  typedef OSDUtilizationDumper<Formatter> Parent;

  OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
			     const PGMap *pgm, bool tree) :
    Parent(crush, osdmap, pgm, tree) {}

  // Emit the crush-tree nodes followed by stray osds (present in the map
  // but not reachable from the tree walk).
  void dump(Formatter *f) {
    f->open_array_section("nodes");
    Parent::dump(f);
    f->close_section();

    f->open_array_section("stray");
    dump_stray(f);
    f->close_section();
  }

protected:
  // One record per crush item with its space/utilization figures.
  virtual void dump_item(const CrushTreeDumper::Item &qi, float &reweight,
			 int64_t kb, int64_t kb_used, int64_t kb_avail,
			 double& util, double& var, Formatter *f) {
    f->open_object_section("item");
    CrushTreeDumper::dump_item_fields(crush, qi, f);
    f->dump_float("reweight", reweight);
    f->dump_int("kb", kb);
    f->dump_int("kb_used", kb_used);
    f->dump_int("kb_avail", kb_avail);
    f->dump_float("utilization", util);
    f->dump_float("var", var);
    CrushTreeDumper::dump_bucket_children(crush, qi, f);
    f->close_section();
  }

public:
  // Cluster-wide totals plus the variance stats accumulated by the parent.
  void summary(Formatter *f) {
    f->open_object_section("summary");
    f->dump_int("total_kb", pgm->osd_sum.kb);
    f->dump_int("total_kb_used", pgm->osd_sum.kb_used);
    f->dump_int("total_kb_avail", pgm->osd_sum.kb_avail);
    f->dump_float("average_utilization", average_util);
    f->dump_float("min_var", min_var);
    f->dump_float("max_var", max_var);
    f->dump_float("dev", dev());
    f->close_section();
  }
};
|
855
|
|
856
|
// Entry point for "ceph osd df": structured output when a Formatter is
// supplied, otherwise a plain-text table.
void OSDMonitor::print_utilization(ostream &out, Formatter *f, bool tree) const
{
  const PGMap *pgm = &mon->pgmon()->pg_map;
  const CrushWrapper *crush = osdmap.crush.get();

  if (!f) {
    // plain-text table + summary line
    OSDUtilizationPlainDumper d(crush, &osdmap, pgm, tree);
    TextTable tbl;
    d.dump(&tbl);
    out << tbl
	<< d.summary() << "\n";
    return;
  }

  f->open_object_section("df");
  OSDUtilizationFormatDumper d(crush, &osdmap, pgm, tree);
  d.dump(f);
  d.summary(f);
  f->close_section();
  f->flush(out);
}
|
876
|
|
877
|
// Start a fresh pending incremental for the next epoch.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // drop any pg/primary temp mappings the current map makes redundant
  OSDMap::remove_redundant_temporaries(g_ceph_context, osdmap, &pending_inc);

  // drop temp mappings that reference down/out osds
  OSDMap::remove_down_temps(g_ceph_context, osdmap, &pending_inc);
}
|
890
|
|
891
|
|
892
|
|
893
|
|
894
|
|
895
|
// Serialize pending_inc into the transaction that paxos will commit:
// logs the staged state changes, computes the full-map crc for the new
// epoch, and writes the full map, the incremental, and any osd metadata
// updates.
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  pending_inc.modified = ceph_clock_now(g_ceph_context);

  int r = pending_inc.propagate_snaps_to_tiers(g_ceph_context, osdmap);
  assert(r == 0);

  bufferlist bl;

  // Log what this incremental will do. new_state is an XOR mask: a zero
  // entry means "toggle UP" (i.e. mark down).
  for (map<int32_t,uint8_t>::iterator i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (map<int32_t,entity_addr_t>::iterator i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // Apply the incremental to a deep copy of the current map so we can
  // encode the resulting full map and record its crc in the incremental
  // (peons use it to verify their locally re-encoded full maps).
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);
    bufferlist fullbl;
    ::encode(tmp, fullbl, mon->quorum_features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // Encode the incremental itself (must happen after full_crc is set).
  assert(get_last_committed() + 1 == pending_inc.epoch);
  ::encode(pending_inc, bl, mon->quorum_features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // Flush staged osd metadata updates/removals.
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();
}
|
974
|
|
975
|
int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
|
976
|
{
|
977
|
bufferlist bl;
|
978
|
int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
|
979
|
if (r < 0)
|
980
|
return r;
|
981
|
map<string,string> m;
|
982
|
try {
|
983
|
bufferlist::iterator p = bl.begin();
|
984
|
::decode(m, p);
|
985
|
}
|
986
|
catch (buffer::error& e) {
|
987
|
if (err)
|
988
|
*err << "osd." << osd << " metadata is corrupt";
|
989
|
return -EIO;
|
990
|
}
|
991
|
for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
|
992
|
f->dump_string(p->first.c_str(), p->second);
|
993
|
return 0;
|
994
|
}
|
995
|
|
996
|
void OSDMonitor::share_map_with_random_osd()
|
997
|
{
|
998
|
if (osdmap.get_num_up_osds() == 0) {
|
999
|
dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
|
1000
|
return;
|
1001
|
}
|
1002
|
|
1003
|
MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
|
1004
|
if (!s) {
|
1005
|
dout(10) << __func__ << " no up osd on our session map" << dendl;
|
1006
|
return;
|
1007
|
}
|
1008
|
|
1009
|
dout(10) << "committed, telling random " << s->inst << " all about it" << dendl;
|
1010
|
|
1011
|
MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch());
|
1012
|
s->con->send_message(m);
|
1013
|
|
1014
|
|
1015
|
}
|
1016
|
|
1017
|
// Compute how far back old osdmap epochs may be trimmed. Returns 0 when
// trimming is not currently allowed (pgmon unreadable, pgs still being
// created, or nothing to gain).
version_t OSDMonitor::get_trim_to()
{
  if (!mon->pgmon()->is_readable() ||
      !mon->pgmon()->pg_map.creating_pgs.empty())
    return 0;

  // never trim epochs some pg might still need to catch up
  epoch_t floor = mon->pgmon()->pg_map.get_min_last_epoch_clean();
  dout(10) << " min_last_epoch_clean " << floor << dendl;

  // an explicit override, if configured and sane
  if (g_conf->mon_osd_force_trim_to > 0 &&
      g_conf->mon_osd_force_trim_to < (int)get_last_committed()) {
    floor = g_conf->mon_osd_force_trim_to;
    dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
  }

  // always keep at least mon_min_osdmap_epochs around
  unsigned min = g_conf->mon_min_osdmap_epochs;
  if (floor + min > get_last_committed()) {
    floor = (min < get_last_committed()) ? get_last_committed() - min : 0;
  }

  if (floor > get_first_committed())
    return floor;
  return 0;
}
|
1040
|
|
1041
|
// When trimming, stash a full copy of the map at the new first epoch so the
// store always holds at least one full map to rebuild later epochs from.
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
                                   version_t first)
{
  dout(10) << __func__ << " including full map for e " << first << dendl;

  bufferlist full_map;
  get_version_full(first, full_map);
  put_version_full(tx, first, full_map);
}
|
1049
|
|
1050
|
|
1051
|
|
1052
|
// Read-only (non-proposing) dispatch for all message types this service
// handles.  Returns true if the message was fully handled here; false if it
// needs an update (and will be routed to prepare_update()).
bool OSDMonitor::preprocess_query(PaxosServiceMessage *m)
{
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {

  case MSG_MON_COMMAND:
    return preprocess_command(static_cast<MMonCommand*>(m));
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(static_cast<MMonGetOSDMap*>(m));

  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(static_cast<MOSDMarkMeDown*>(m));
  case MSG_OSD_FAILURE:
    return preprocess_failure(static_cast<MOSDFailure*>(m));
  case MSG_OSD_BOOT:
    return preprocess_boot(static_cast<MOSDBoot*>(m));
  case MSG_OSD_ALIVE:
    return preprocess_alive(static_cast<MOSDAlive*>(m));
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(static_cast<MOSDPGTemp*>(m));

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(static_cast<MPoolOp*>(m));

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(static_cast<MRemoveSnaps*>(m));

  default:
    // dispatch should never hand us an unknown type; the put()/return are
    // unreachable cleanup for builds where assert is compiled out
    assert(0);
    m->put();
    return true;
  }
}
|
1087
|
|
1088
|
// State-changing dispatch: route each message type to its prepare_* handler,
// which stages changes in pending_inc.  Returns true if a proposal should be
// triggered (per-handler decision).
bool OSDMonitor::prepare_update(PaxosServiceMessage *m)
{
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {

  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(static_cast<MOSDMarkMeDown*>(m));
  case MSG_OSD_FAILURE:
    return prepare_failure(static_cast<MOSDFailure*>(m));
  case MSG_OSD_BOOT:
    return prepare_boot(static_cast<MOSDBoot*>(m));
  case MSG_OSD_ALIVE:
    return prepare_alive(static_cast<MOSDAlive*>(m));
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(static_cast<MOSDPGTemp*>(m));

  case MSG_MON_COMMAND:
    return prepare_command(static_cast<MMonCommand*>(m));

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(static_cast<MPoolOp*>(m));

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(static_cast<MRemoveSnaps*>(m));

  default:
    // unknown types should have been filtered in preprocess_query()
    assert(0);
    m->put();
  }

  return false;
}
|
1121
|
|
1122
|
// Decide whether pending changes warrant an immediate paxos proposal.
// A staged full map always proposes; a complete set of deferred osd weight
// adjustments (one per osd) is folded into pending_inc and proposed with no
// delay; otherwise defer to the generic PaxosService policy.
bool OSDMonitor::should_propose(double& delay)
{
  dout(10) << "should_propose" << dendl;

  // a staged full map replacement is always urgent
  if (pending_inc.fullmap.length())
    return true;

  // only apply accumulated boot-time weights once we have one per osd
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  return PaxosService::should_propose(delay);
}
|
1142
|
|
1143
|
|
1144
|
|
1145
|
|
1146
|
|
1147
|
|
1148
|
// Reply to an explicit map-range request with the requested full and
// incremental maps, bounded by what we have committed and by
// osd_map_message_max entries total.
//
// Bug fix: the requested ranges are inclusive, but both loops previously
// iterated with `e < MIN(last, ...)`, which silently dropped the newest
// requested epoch from the reply.  Use <= so the upper bound is included.
bool OSDMonitor::preprocess_get_osdmap(MMonGetOSDMap *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf->osd_map_message_max;
  // full maps for the requested (inclusive) range, clamped to what we hold
  for (epoch_t e = MAX(first, m->get_full_first());
       e <= MIN(last, m->get_full_last()) && max > 0;
       ++e, --max) {
    int r = get_version_full(e, reply->maps[e]);
    assert(r >= 0);
  }
  // incrementals for the requested (inclusive) range, sharing the same budget
  for (epoch_t e = MAX(first, m->get_inc_first());
       e <= MIN(last, m->get_inc_last()) && max > 0;
       ++e, --max) {
    int r = get_version(e, reply->incremental_maps[e]);
    assert(r >= 0);
  }
  reply->oldest_map = get_first_committed();
  reply->newest_map = osdmap.get_epoch();
  mon->send_reply(m, reply);
  m->put();
  return true;
}
|
1173
|
|
1174
|
|
1175
|
|
1176
|
|
1177
|
|
1178
|
|
1179
|
|
1180
|
// Common sender validation for osd-originated messages: verify the session
// exists, carries the "osd" X capability, and that the message fsid matches
// this cluster.  Returns true if the message should be REJECTED.
// NOTE(review): the log text mentions MOSDFailure but this helper is also
// used for other message types (e.g. MOSDMarkMeDown) — message is misleading.
bool OSDMonitor::check_source(PaxosServiceMessage *m, uuid_d fsid) {

  MonSession *session = m->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
            << session->caps << dendl;
    return true;
  }
  if (fsid != mon->monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
            << " != " << mon->monmap->fsid << dendl;
    return true;
  }
  return false;
}
|
1197
|
|
1198
|
|
1199
|
// Screen a failure report before it can stage any state change.  Returns
// true (and consumes m) if the report is invalid, stale, or a duplicate;
// false if it is new and should proceed to prepare_failure().
bool OSDMonitor::preprocess_failure(MOSDFailure *m)
{
  // the osd being reported as failed
  int badboy = m->get_target().name.num();

  // verify caps and cluster fsid of the sender
  if (check_source(m, m->fsid))
    goto didit;

  // reporter must itself be a live, correctly-addressed osd in the map
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        osdmap.get_addr(from) != m->get_orig_source_inst().addr ||
        osdmap.is_down(from)) {
      dout(5) << "preprocess_failure from dead osd." << from << ", ignoring" << dendl;
      send_incremental(m, m->get_epoch()+1);
      goto didit;
    }
  }

  // reported target must exist in the map...
  if (!osdmap.have_inst(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(m, m->get_epoch()+1);
    goto didit;
  }
  // ...at the reported address (otherwise the reporter has a stale map)
  if (osdmap.get_inst(badboy) != m->get_target()) {
    dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
            << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(m, m->get_epoch()+1);
    goto didit;
  }

  // already down, or came back up after the reported epoch: stale/dup report
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(m, m->get_epoch()+1);
    goto didit;
  }

  // policy gate (NODOWN flag, minimum up ratio)
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of " << m->get_target() << " from " << m->get_orig_source_inst() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
  return false;

 didit:
  m->put();
  return true;
}
|
1257
|
|
1258
|
// Completion that acknowledges a MOSDMarkMeDown request by echoing a
// MOSDMarkMeDown back to the requester (with the ack flag cleared).
// Takes ownership of the request message: the destructor drops its ref,
// so the ack is sent whether finish() runs via proposal completion or the
// context is completed immediately.
class C_AckMarkedDown : public Context {
  OSDMonitor *osdmon;   // monitor service used to route the reply
  MOSDMarkMeDown *m;    // owned request message; put() on destruction
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MOSDMarkMeDown *m)
    : osdmon(osdmon), m(m) {}

  void finish(int) {
    // reply mirrors the request; final 'false' presumably clears the
    // request_ack flag so the echo is not itself acked — TODO confirm
    // against MOSDMarkMeDown's constructor signature
    osdmon->mon->send_reply(
      m,
      new MOSDMarkMeDown(
        m->fsid,
        m->get_target(),
        m->get_epoch(),
        false));
  }
  ~C_AckMarkedDown() {
    m->put();
  }
};
|
1280
|
|
1281
|
// Screen a "mark me down" request from an osd.  Returns true if handled
// here (invalid/stale request, optionally acked); false if it should
// proceed to prepare_mark_me_down().
// NOTE(review): on the reply path, when request_ack is false the message
// does not appear to be put() anywhere here — possible ref leak; confirm
// against message ownership rules for preprocess handlers.
bool OSDMonitor::preprocess_mark_me_down(MOSDMarkMeDown *m)
{
  int requesting_down = m->get_target().name.num();
  int from = m->get_orig_source().num();

  // check caps and cluster fsid
  if (check_source(m, m->fsid))
    goto reply;

  // only osds may ask to be marked down
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must be a live osd at the address it claims
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addr(from) != m->get_target().addr) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(m, m->get_epoch()+1);
    goto reply;
  }

  // policy gate (NODOWN flag, minimum up ratio)
  if (!can_mark_down(requesting_down))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_target() << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // ack immediately; the context's destructor releases our ref on m
    Context *c(new C_AckMarkedDown(this, m));
    c->complete(0);
  }
  return true;
}
|
1317
|
|
1318
|
// Stage the osd's self-requested down state and (optionally) ack once the
// proposal commits.  Always proposes (returns true).
bool OSDMonitor::prepare_mark_me_down(MOSDMarkMeDown *m)
{
  int target_osd = m->get_target().name.num();

  // preprocess guaranteed the osd is up at the claimed address
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  mon->clog->info() << "osd." << target_osd << " marked itself down\n";
  // new_state is an XOR mask against the current state, so setting the
  // CEPH_OSD_UP bit here flips the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(new C_AckMarkedDown(this, m));
  return true;
}
|
1331
|
|
1332
|
// Policy gate for marking osd i down: refused when the NODOWN flag is set,
// when there are no osds, or when doing so would drop the up ratio (counting
// already-pending down marks) below mon_osd_min_up_ratio.
bool OSDMonitor::can_mark_down(int i)
{
  if (osdmap.test_flag(CEPH_OSDMAP_NODOWN)) {
    dout(5) << "can_mark_down NODOWN flag set, will not mark osd." << i << " down" << dendl;
    return false;
  }
  int num_osds = osdmap.get_num_osds();
  if (num_osds == 0) {
    dout(5) << "can_mark_down no osds" << dendl;
    return false;
  }
  // account for down-marks already staged in pending_inc
  int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
  float up_ratio = (float)up / (float)num_osds;
  if (up_ratio < g_conf->mon_osd_min_up_ratio) {
    dout(5) << "can_mark_down current up_ratio " << up_ratio << " < min "
            << g_conf->mon_osd_min_up_ratio
            << ", will not mark osd." << i << " down" << dendl;
    return false;
  }
  return true;
}
|
1353
|
|
1354
|
bool OSDMonitor::can_mark_up(int i)
|
1355
|
{
|
1356
|
if (osdmap.test_flag(CEPH_OSDMAP_NOUP)) {
|
1357
|
dout(5) << "can_mark_up NOUP flag set, will not mark osd." << i << " up" << dendl;
|
1358
|
return false;
|
1359
|
}
|
1360
|
return true;
|
1361
|
}
|
1362
|
|
1363
|
|
1364
|
|
1365
|
|
1366
|
|
1367
|
bool OSDMonitor::can_mark_out(int i)
|
1368
|
{
|
1369
|
if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
|
1370
|
dout(5) << "can_mark_out NOOUT flag set, will not mark osds out" << dendl;
|
1371
|
return false;
|
1372
|
}
|
1373
|
int num_osds = osdmap.get_num_osds();
|
1374
|
if (num_osds == 0) {
|
1375
|
dout(5) << "can_mark_out no osds" << dendl;
|
1376
|
return false;
|
1377
|
}
|
1378
|
int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
|
1379
|
float in_ratio = (float)in / (float)num_osds;
|
1380
|
if (in_ratio < g_conf->mon_osd_min_in_ratio) {
|
1381
|
if (i >= 0)
|
1382
|
dout(5) << "can_mark_down current in_ratio " << in_ratio << " < min "
|
1383
|
<< g_conf->mon_osd_min_in_ratio
|
1384
|
<< ", will not mark osd." << i << " out" << dendl;
|
1385
|
else
|
1386
|
dout(5) << "can_mark_down current in_ratio " << in_ratio << " < min "
|
1387
|
<< g_conf->mon_osd_min_in_ratio
|
1388
|
<< ", will not mark osds out" << dendl;
|
1389
|
return false;
|
1390
|
}
|
1391
|
|
1392
|
return true;
|
1393
|
}
|
1394
|
|
1395
|
bool OSDMonitor::can_mark_in(int i)
|
1396
|
{
|
1397
|
if (osdmap.test_flag(CEPH_OSDMAP_NOIN)) {
|
1398
|
dout(5) << "can_mark_in NOIN flag set, will not mark osd." << i << " in" << dendl;
|
1399
|
return false;
|
1400
|
}
|
1401
|
return true;
|
1402
|
}
|
1403
|
|
1404
|
// Re-evaluate every accumulated failure report against the current time,
// staging down-marks for any osd whose grace period has expired.
void OSDMonitor::check_failures(utime_t now)
{
  map<int,failure_info_t>::iterator it = failure_info.begin();
  for (; it != failure_info.end(); ++it)
    check_failure(now, it->first, it->second);
}
|
1412
|
|
1413
|
// Decide whether accumulated reports justify marking target_osd down.
// The base heartbeat grace is optionally extended by laggy-history terms
// for the target and (averaged) for its reporters.  Returns true if a
// down-mark is already pending or was staged now; false otherwise.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  if (g_conf->mon_osd_adjust_heartbeat_grace) {
    // exponential decay of the laggy history with configured halflife
    double halflife = (double)g_conf->mon_osd_laggy_halflife;
    double decay_k = ::log(.5) / halflife;

    // extend grace by the target's own (decayed) laggy history
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
             << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;

    // and by the average (decayed) laggy history of the reporters
    assert(fi.reporters.size());
    for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
         p != fi.reporters.end();
         ++p) {
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
           << fi.reporters.size() << " reporters and "
           << fi.num_reports << " reports, "
           << grace << " grace (" << orig_grace << " + " << my_grace << " + " << peer_grace << "), max_failed_since " << max_failed_since
           << dendl;

  // new_state is an XOR mask: an UP bit pending for an up osd means a
  // down-mark is already staged
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  // require the grace to have elapsed plus minimum reporter/report counts
  if (failed_for >= grace &&
      ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters) &&
      (fi.num_reports >= g_conf->mon_osd_min_down_reports)) {
    dout(1) << " we have enough reports/reporters to mark osd." << target_osd << " down" << dendl;
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << osdmap.get_inst(target_osd) << " failed ("
                      << fi.num_reports << " reports from " << (int)fi.reporters.size() << " peers after "
                      << failed_for << " >= grace " << grace << ")\n";
    return true;
  }
  return false;
}
|
1478
|
|
1479
|
// Record a failure report (or its cancellation) for the target osd.
// Returns true only when the accumulated reports justify staging a
// down-mark now (via check_failure); cancellations always return false.
bool OSDMonitor::prepare_failure(MOSDFailure *m)
{
  dout(1) << "prepare_failure " << m->get_target() << " from " << m->get_orig_source_inst()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target().name.num();
  int reporter = m->get_orig_source().num();
  // preprocess_failure guaranteed the target is up at the claimed address
  assert(osdmap.is_up(target_osd));
  assert(osdmap.get_addr(target_osd) == m->get_target().addr);

  utime_t now = ceph_clock_now(g_ceph_context);
  // estimate when the failure began from the sender-supplied duration,
  // defaulting to one heartbeat grace period before receipt
  utime_t failed_since = m->get_recv_stamp() - utime_t(m->failed_for ? m->failed_for : g_conf->osd_heartbeat_grace, 0);

  if (m->if_osd_failed()) {
    // positive report: remember it; replace any older message from the
    // same reporter and drop the superseded one without replying
    mon->clog->debug() << m->get_target() << " reported failed by "
                       << m->get_orig_source_inst() << "\n";
    failure_info_t& fi = failure_info[target_osd];
    MOSDFailure *old = fi.add_report(reporter, failed_since, m);
    if (old) {
      mon->no_reply(old);
      old->put();
    }

    return check_failure(now, target_osd, fi);
  } else {
    // cancellation: retract this reporter and release all held messages
    mon->clog->debug() << m->get_target() << " failure report canceled by "
                       << m->get_orig_source_inst() << "\n";
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      list<MOSDFailure*> ls;
      fi.take_report_messages(ls);
      fi.cancel_report(reporter);
      while (!ls.empty()) {
        mon->no_reply(ls.front());
        ls.front()->put();
        ls.pop_front();
      }
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters and "
                 << fi.num_reports << " reports" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(m);
    m->put();
  }

  return false;
}
|
1536
|
|
1537
|
// After a map change, drop failure state for osds that are now down and
// answer each held report with the latest map.  Uses erase(p++) to remove
// entries safely while iterating.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MOSDFailure*> ls;
      p->second.take_report_messages(ls);
      failure_info.erase(p++);

      // send_latest consumes each message ref
      while (!ls.empty()) {
        send_latest(ls.front(), ls.front()->get_epoch());
        ls.pop_front();
      }
    }
  }
}
|
1556
|
|
1557
|
// Hand every held failure-report message back to the caller (which takes
// over the refs) and forget all failure tracking state.
void OSDMonitor::take_all_failures(list<MOSDFailure*>& ls)
{
  dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;

  map<int,failure_info_t>::iterator it = failure_info.begin();
  for (; it != failure_info.end(); ++it)
    it->second.take_report_messages(ls);

  failure_info.clear();
}
|
1568
|
|
1569
|
|
1570
|
|
1571
|
|
1572
|
// Screen an osd boot message.  Returns true (consuming m) when the boot is
// rejected or can be answered without a map change (duplicate boot, stale
// message, NOUP); false when prepare_boot() should stage the osd up.
bool OSDMonitor::preprocess_boot(MOSDBoot *m)
{
  int from = m->get_orig_source_inst().name.num();

  // verify caps on the sending session
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // the osd's superblock must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  // refuse a boot with no usable address
  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  assert(m->get_orig_source_inst().name.is_osd());

  // if the map already requires erasure-code support, the booting osd must
  // announce it too
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_OSD_ERASURE_CODES)) {
    dout(0) << __func__ << " osdmap requires erasure code but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // likewise for erasure-code plugins v2
  if ((osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2) &&
      !(m->get_connection()->get_features() & CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2)) {
    dout(0) << __func__ << " osdmap requires erasure code plugins v2 but osd at "
            << m->get_orig_source_inst()
            << " doesn't announce support -- ignore" << dendl;
    goto ignore;
  }

  // already up at the same address: duplicate boot, just reply
  if (osdmap.is_up(from) &&
      osdmap.get_inst(from) == m->get_orig_source_inst()) {
    dout(7) << "preprocess_boot dup from " << m->get_orig_source_inst()
            << " == " << osdmap.get_inst(from) << dendl;
    _booted(m, false);
    return true;
  }

  // a different osd fsid under the same id is a misconfiguration
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // message predates the osd's recorded up_from: stale, send current maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(m, m->sb.current_epoch+1);
    return true;
  }

  // NOUP set: acknowledge with maps but do not mark up
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(m, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  m->put();
  return true;
}
|
1659
|
|
1660
|
// Stage a booting osd into pending_inc: mark it down first if it appears up
// under an old instance, otherwise record its addresses, weight, uuid,
// clean interval, and laggy statistics, then retry or ack the boot after
// the proposal commits.
bool OSDMonitor::prepare_boot(MOSDBoot *m)
{
  dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb
          << " cluster_addr " << m->cluster_addr
          << " hb_back_addr " << m->hb_back_addr
          << " hb_front_addr " << m->hb_front_addr
          << dendl;

  assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // id beyond the current max_osd cannot be booted
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd " << osdmap.get_max_osd() << dendl;
    m->put();
    return false;
  }

  // effective state = committed state XOR any pending state change
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  if (osdmap.is_up(from)) {
    // old instance still up: stage a down-mark first, then retry this boot
    // after the proposal lands
    dout(7) << "prepare_boot was up, first marking down " << osdmap.get_inst(from) << dendl;
    // preprocess only lets a boot through here if the instance differs
    assert(osdmap.get_inst(from) != m->get_orig_source_inst());
    assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // XOR mask: setting the UP bit flips the osd down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    wait_for_finished_proposal(new C_RetryMessage(this, m));
  } else if (pending_inc.new_up_client.count(from)) {
    // a boot for this osd is already staged; wait and retry
    dout(7) << "prepare_boot already prepared, waiting on " << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(new C_RetryMessage(this, m));
  } else {
    // new boot: record all announced addresses
    pending_inc.new_up_client[from] = m->get_orig_source_addr();
    if (!m->cluster_addr.is_blank_ip())
      pending_inc.new_up_cluster[from] = m->cluster_addr;
    pending_inc.new_hb_back_up[from] = m->hb_back_addr;
    if (!m->hb_front_addr.is_blank_ip())
      pending_inc.new_hb_front_up[from] = m->hb_front_addr;

    // optionally auto-mark the osd in, restoring a remembered weight
    if ((g_conf->mon_osd_auto_mark_auto_out_in && (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        if (osdmap.osd_xinfo[from].old_weight > 0)
          pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
        else
          pending_inc.new_weight[from] = CEPH_OSD_IN;
      } else {
        dout(7) << "prepare_boot NOIN set, will not mark in " << m->get_orig_source_addr() << dendl;
      }
    }

    // it is up again; stop counting it toward an automatic out-mark
    down_pending_out.erase(from);

    // remember the superblock weight for should_propose()'s bulk adjustment
    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // adopt the osd's fsid if the map has none (or a different) recorded
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess rejected genuine uuid clashes, so any existing uuid is zero
      assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // a freshly created osd (no maps yet) gets lost_at bumped so peering
    // does not wait on its pre-existence history
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // stash the osd's reported metadata for the next encode_pending
    bufferlist osd_metadata;
    ::encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;

    // advance the recorded last-clean interval if the osd reports a newer one
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << "prepare_boot osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end << ")"
               << " -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] = pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: boot_epoch == 0 means a clean (re)start,
    // which decays the laggy estimate; otherwise fold in the observed
    // down interval
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now(g_ceph_context).sec() - xi.down_stamp.sec();
        xi.laggy_interval =
          interval * g_conf->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // record announced features (explicit field wins over connection)
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    pending_inc.new_xinfo[from] = xi;

    // ack the boot once the proposal commits
    wait_for_finished_proposal(new C_Booted(this, m));
  }
  return true;
}
|
1796
|
|
1797
|
// Acknowledge a (possibly duplicate) boot by sending the osd every map it
// is missing; optionally record the boot in the cluster log.
void OSDMonitor::_booted(MOSDBoot *m, bool logit)
{
  dout(7) << "_booted " << m->get_orig_source_inst()
          << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit)
    mon->clog->info() << m->get_orig_source_inst() << " boot\n";

  send_latest(m, m->sb.current_epoch+1);
}
|
1808
|
|
1809
|
|
1810
|
|
1811
|
|
1812
|
|
1813
|
// Screen an osd's up_thru (alive) request.  Returns true if handled here
// (bad caps, down sender, or up_thru already satisfied); false when
// prepare_alive() should stage the new up_thru.
bool OSDMonitor::preprocess_alive(MOSDAlive *m)
{
  int from = m->get_orig_source().num();

  // verify caps on the sending session
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be the up instance recorded in the map
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "preprocess_alive ignoring alive message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // already satisfied: just reply with the map the osd asked about
  if (osdmap.get_up_thru(from) >= m->want) {
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(m, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
           << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  m->put();
  return true;
}
|
1848
|
|
1849
|
// Stage the osd's new up_thru epoch and reply with the map once the
// proposal commits.  Always proposes (returns true).
bool OSDMonitor::prepare_alive(MOSDAlive *m)
{
  int from = m->get_orig_source().num();

  if (0) {
    // disabled: per-osd alive messages are too chatty for the cluster log
    mon->clog->debug() << m->get_orig_source_inst() << " alive\n";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
          << " from " << m->get_orig_source_inst() << dendl;
  pending_inc.new_up_thru[from] = m->version;
  wait_for_finished_proposal(new C_ReplyMap(this, m, m->version));
  return true;
}
|
1863
|
|
1864
|
// Reply to m with all maps from epoch e onward (consumes m via send_latest).
void OSDMonitor::_reply_map(PaxosServiceMessage *m, epoch_t e)
{
  dout(7) << "_reply_map " << e
          << " from " << m->get_orig_source_inst()
          << dendl;
  send_latest(m, e);
}
|
1871
|
|
1872
|
|
1873
|
|
1874
|
|
1875
|
// Screen a pg_temp message.  Returns false (proceed to prepare_pgtemp) as
// soon as any entry would actually change pg_temp/primary_temp state;
// returns true when the whole message is a no-op or must be ignored.
bool OSDMonitor::preprocess_pgtemp(MOSDPGTemp *m)
{
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // verify caps on the sending session
  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be the up instance recorded in the map
  if (!osdmap.is_up(from) ||
      osdmap.get_inst(from) != m->get_orig_source_inst()) {
    dout(7) << "ignoring pgtemp message from down " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? (*osdmap.pg_temp)[p->first] : empty)
             << " -> " << p->second << dendl;

    // entries for deleted pools are counted so an all-ignored message can
    // be dropped without a reply below
    if (!osdmap.have_pg_pool(p->first.pool())) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    // an empty mapping is an update if there is any existing pg_temp or
    // primary_temp entry to clear
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;

    // a non-empty mapping is an update if it differs from the current
    // pg_temp, or if a primary_temp entry would need clearing
    if (p->second.size() && (osdmap.pg_temp->count(p->first) == 0 ||
                             (*osdmap.pg_temp)[p->first] != p->second ||
                             osdmap.primary_temp->count(p->first)))
      return false;
  }

  // every entry referenced a removed pool: drop silently
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(m, m->map_epoch);
  return true;

 ignore:
  m->put();
  return true;
}
|
1945
|
|
1946
|
// Stage the osd's pg_temp mappings (skipping pools that are gone or
// pending removal), clear any conflicting primary_temp entries, bump the
// sender's up_thru, and reply once the proposal commits.
bool OSDMonitor::prepare_pgtemp(MOSDPGTemp *m)
{
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // pool deletion staged in this same pending increment
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool pending removal" << dendl;
      continue;
    }
    // pool already removed from the committed map
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] = p->second;

    // a new pg_temp invalidates any primary_temp for the pg; -1 clears it
    if (osdmap.primary_temp->count(p->first) ||
        pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }
  pending_inc.new_up_thru[from] = m->map_epoch;
  wait_for_finished_proposal(new C_ReplyMap(this, m, m->map_epoch));
  return true;
}
|
1975
|
|
1976
|
|
1977
|
|
1978
|
|
1979
|
// Screen an MRemoveSnaps message before the prepare phase.
// Returns true (message consumed) when no map update is needed: the sender
// has no session or insufficient caps, or every listed snap is already
// recorded as removed in its pool.  Returns false to proceed to
// prepare_remove_snaps.
bool OSDMonitor::preprocess_remove_snaps(MRemoveSnaps *m)
{
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  MonSession *session = m->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_R | MON_CAP_W)) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      // skip (but do not reject) snaps for pools that no longer exist
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      // any snap newer than the pool's snap_seq, or not yet present in
      // the pool's removed_snaps set, requires a map update
      if (*p > pi->get_snap_seq() ||
          !pi->removed_snaps.contains(*p))
        return false;
    }
  }

 ignore:
  m->put();
  return true;
}
|
2014
|
|
2015
|
// Record newly removed snapshots in the pending incremental map.
// For each snap not already present in the pool's removed_snaps (in either
// the committed map or an already-pending pool update), add it, bump the
// pool's snap_seq if the snap id is newer, and stamp the pool's snap_epoch
// with the pending epoch.  The message is always consumed.
bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m)
{
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {
    if (!osdmap.have_pg_pool(p->first)) {
      // preprocess_remove_snaps skips unknown pools but does not strip
      // them from the message; without this check, osdmap.pools[...]
      // below would default-construct a bogus pool entry in the
      // in-memory committed map.
      dout(10) << " ignoring removed_snaps on non-existent pool " << p->first << dendl;
      continue;
    }
    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
         q != p->second.end();
         ++q) {
      // only touch the pending pool if this snap isn't already recorded
      // as removed, either in the committed map or the pending update
      if (!pi.removed_snaps.contains(*q) &&
          (!pending_inc.new_pools.count(p->first) ||
           !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
        newpi->removed_snaps.insert(*q);
        dout(10) << " pool " << p->first << " removed_snaps added " << *q
                 << " (now " << newpi->removed_snaps << ")" << dendl;
        if (*q > newpi->get_snap_seq()) {
          // keep snap_seq monotonic with the highest removed snap id
          dout(10) << " pool " << p->first << " snap_seq " << newpi->get_snap_seq() << " -> " << *q << dendl;
          newpi->set_snap_seq(*q);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
      }
    }
  }

  m->put();
  return true;
}
|
2045
|
|
2046
|
|
2047
|
|
2048
|
|
2049
|
|
2050
|
// Send the requester everything from epoch 'start' to the current map.
// A start of 0 means "send the latest full map"; anything else means
// "send the incrementals from start onward".  Consumes the message.
void OSDMonitor::send_latest(PaxosServiceMessage *m, epoch_t start)
{
  dout(5) << "send_latest to " << m->get_orig_source_inst()
          << " start " << start << dendl;
  if (start != 0) {
    send_incremental(m, start);
  } else {
    send_full(m);
  }
  m->put();  // done with the request
}
|
2060
|
|
2061
|
|
2062
|
// Package the current full osdmap into a freshly allocated MOSDMap
// message, with oldest/newest epoch bounds filled in.  Caller owns the
// returned message.
MOSDMap *OSDMonitor::build_latest_full()
{
  const epoch_t cur = osdmap.get_epoch();
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid);
  reply->oldest_map = get_first_committed();
  reply->newest_map = cur;
  // fill in the encoded full map for the current epoch
  get_version_full(cur, reply->maps[cur]);
  return reply;
}
|
2070
|
|
2071
|
// Build an MOSDMap carrying the incremental maps for epochs [from, to].
// If the incremental for some epoch is gone, fall back to attaching the
// stored full map for that epoch; one of the two must exist or we abort.
// Caller owns the returned message.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to)
{
  dout(10) << "build_incremental [" << from << ".." << to << "]" << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; the e > 0 guard also protects the unsigned decrement
  // from wrapping when from == 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, bl);
    if (err == 0) {
      // incremental for this epoch is available
      assert(bl.length());

      dout(20) << "build_incremental inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      assert(err == -ENOENT);
      assert(!bl.length());
      // no incremental stored; try the full map for this epoch instead
      get_version_full(e, bl);
      if (bl.length() > 0) {

        dout(20) << "build_incremental full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        assert(0);  // every committed epoch must have an inc or a full map
      }
    }
  }
  return m;
}
|
2103
|
|
2104
|
// Reply to the requester with the latest full osdmap.
void OSDMonitor::send_full(PaxosServiceMessage *m)
{
  dout(5) << "send_full to " << m->get_orig_source_inst() << dendl;
  MOSDMap *reply = build_latest_full();
  mon->send_reply(m, reply);
}
|
2109
|
|
2110
|
|
2111
|
|
2112
|
|
2113
|
|
2114
|
|
2115
|
|
2116
|
|
2117
|
// Send the requester every map from 'first' through the current epoch.
// For OSD requesters we first consult osd_epoch to avoid resending maps
// the OSD is already known to have.  If 'first' predates our oldest
// committed map (trimmed), we reply with a full map at the oldest epoch
// instead and stop there for this round.
void OSDMonitor::send_incremental(PaxosServiceMessage *req, epoch_t first)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << req->get_orig_source_inst()
          << dendl;

  int osd = -1;
  if (req->get_source().is_osd()) {
    osd = req->get_source().num();
    map<int,epoch_t>::iterator p = osd_epoch.find(osd);
    if (p != osd_epoch.end()) {
      if (first <= p->second) {
        // skip ahead past what the osd is already known to have
        dout(10) << __func__ << " osd." << osd << " should already have epoch "
                 << p->second << dendl;
        first = p->second + 1;
        if (first > osdmap.get_epoch())
          return;  // nothing new to send
      }
    }
  }

  if (first < get_first_committed()) {
    // the requested start has been trimmed; send a base full map at the
    // oldest epoch we still have
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = first;
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;
    mon->send_reply(req, m);

    // NOTE(review): this records the *latest* epoch for the osd even
    // though only the base full map was attached above — verify the osd
    // really catches up to osdmap.get_epoch() from that reply.
    if (osd >= 0)
      note_osd_has_epoch(osd, osdmap.get_epoch());
    return;
  }

  // send a bounded batch of incrementals (osd_map_message_max per
  // message); the requester asks again if it still lags behind
  epoch_t last = MIN(first + g_conf->osd_map_message_max, osdmap.get_epoch());
  MOSDMap *m = build_incremental(first, last);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();
  mon->send_reply(req, m);

  if (osd >= 0)
    note_osd_has_epoch(osd, last);
}
|
2170
|
|
2171
|
|
2172
|
|
2173
|
|
2174
|
|
2175
|
|
2176
|
|
2177
|
void OSDMonitor::note_osd_has_epoch(int osd, epoch_t epoch)
|
2178
|
{
|
2179
|
dout(20) << __func__ << " osd." << osd << " epoch " << epoch << dendl;
|
2180
|
map<int,epoch_t>::iterator p = osd_epoch.find(osd);
|
2181
|
if (p != osd_epoch.end()) {
|
2182
|
dout(20) << __func__ << " osd." << osd << " epoch " << epoch
|
2183
|
<< " (was " << p->second << ")" << dendl;
|
2184
|
p->second = epoch;
|
2185
|
} else {
|
2186
|
dout(20) << __func__ << " osd." << osd << " epoch " << epoch << dendl;
|
2187
|
osd_epoch[osd] = epoch;
|
2188
|
}
|
2189
|
}
|
2190
|
|
2191
|
// Stream maps [first .. current epoch] directly to a session (the
// subscription path).  If 'first' predates our oldest stored map, begin
// with a full map at the oldest committed epoch.  With onetime set, at
// most one batch of incrementals is sent.
void OSDMonitor::send_incremental(epoch_t first, MonSession *session,
                                  bool onetime)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->inst << dendl;

  if (first < get_first_committed()) {
    // requested start has been trimmed; send a base full map first
    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, bl);
    assert(err == 0);
    assert(bl.length());

    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;

    MOSDMap *m = new MOSDMap(osdmap.get_fsid());
    m->oldest_map = first;
    m->newest_map = osdmap.get_epoch();
    m->maps[first] = bl;
    session->con->send_message(m);
    first++;
  }

  // batch the remaining incrementals, osd_map_message_max per message
  while (first <= osdmap.get_epoch()) {
    epoch_t last = MIN(first + g_conf->osd_map_message_max, osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last);
    session->con->send_message(m);
    first = last + 1;

    // track what this osd now has so we don't resend later
    if (session->inst.name.is_osd())
      note_osd_has_epoch(session->inst.name.num(), last);

    if (onetime)
      break;
  }
}
|
2228
|
|
2229
|
|
2230
|
|
2231
|
|
2232
|
// Queue a blacklist entry for address 'a' (expiring at 'until') in the
// pending increment.  Returns the epoch at which it will take effect.
epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until)
{
  dout(10) << "blacklist " << a << " until " << until << dendl;
  epoch_t effective = pending_inc.epoch;
  pending_inc.new_blacklist[a] = until;
  return effective;
}
|
2238
|
|
2239
|
|
2240
|
void OSDMonitor::check_subs()
|
2241
|
{
|
2242
|
dout(10) << __func__ << dendl;
|
2243
|
string type = "osdmap";
|
2244
|
if (mon->session_map.subs.count(type) == 0)
|
2245
|
return;
|
2246
|
xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
|
2247
|
while (!p.end()) {
|
2248
|
Subscription *sub = *p;
|
2249
|
++p;
|
2250
|
check_sub(sub);
|
2251
|
}
|
2252
|
}
|
2253
|
|
2254
|
// Satisfy a single osdmap subscription if we have anything newer than
// what the subscriber has already seen.  Onetime subscriptions are
// removed once satisfied; ongoing ones have their cursor advanced.
void OSDMonitor::check_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
           << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next > osdmap.get_epoch())
    return;  // already up to date

  if (sub->next < 1) {
    // a cursor of 0 means "send the latest full map"
    sub->session->con->send_message(build_latest_full());
  } else {
    send_incremental(sub->next, sub->session, sub->incremental_onetime);
  }

  if (sub->onetime) {
    mon->session_map.remove_sub(sub);
  } else {
    sub->next = osdmap.get_epoch() + 1;
  }
}
|
2269
|
|
2270
|
|
2271
|
|
2272
|
|
2273
|
// Periodic housekeeping (leader only): check failure reports, auto-mark
// long-down osds out, expire blacklist entries, sync the FULL flag with
// the pg monitor's view, update pool status, and propose the pending
// increment when anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // only the leader mutates pending state and proposes
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now(g_ceph_context);

  check_failures(now);

  // mark osds down -> out after the configured interval (can_mark_out(-1)
  // is the global check; per the else branch below, it is false when the
  // NOOUT flag is set)
  if (can_mark_out(-1)) {
    set<int> down_cache;  // memoizes containing_subtree_is_down() lookups

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been pending out
      ++i;

      if (osdmap.is_down(o) &&
          osdmap.is_in(o) &&
          can_mark_out(o)) {
        utime_t orig_grace(g_conf->mon_osd_down_out_interval, 0);
        utime_t grace = orig_grace;
        double my_grace = 0.0;

        if (g_conf->mon_osd_adjust_down_out_interval) {
          // extend the grace period based on the osd's laggy history
          // (laggy_interval/probability from its xinfo), decayed
          // exponentially with how long it has already been down
          const osd_xinfo_t& xi = osdmap.get_xinfo(o);
          double halflife = (double)g_conf->mon_osd_laggy_halflife;
          double decay_k = ::log(.5) / halflife;
          double decay = exp((double)down * decay_k);
          dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
                   << " down for " << down << " decay " << decay << dendl;
          my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
          grace += my_grace;
        }

        // if the osd's entire containing subtree (at the configured crush
        // type) is down, don't auto-out; just restart its timer
        if (g_conf->mon_osd_down_out_subtree_limit.length()) {
          int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
          if (type > 0) {
            if (osdmap.containing_subtree_is_down(g_ceph_context, o, type, &down_cache)) {
              dout(10) << "tick entire containing " << g_conf->mon_osd_down_out_subtree_limit
                       << " subtree for osd." << o << " is down; resetting timer" << dendl;

              down_pending_out[o] = now;
              continue;
            }
          }
        }

        if (g_conf->mon_osd_down_out_interval > 0 &&
            down.sec() >= grace) {
          dout(10) << "tick marking osd." << o << " OUT after " << down
                   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
          pending_inc.new_weight[o] = CEPH_OSD_OUT;

          // also set the AUTOOUT state bit (new_state bits are OR'd in)
          if (pending_inc.new_state.count(o) == 0)
            pending_inc.new_state[o] = 0;
          pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

          // remember the osd's current weight in xinfo.old_weight —
          // presumably so it can be restored if the osd returns; confirm
          // against the mark-in path
          if (pending_inc.new_xinfo.count(o) == 0)
            pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
          pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

          do_propose = true;

          mon->clog->info() << "osd." << o << " out (down for " << down << ")\n";
        } else
          continue;
      }

      // osd is no longer down+in (or was just marked out): stop tracking
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklist entries whose deadline has passed
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  // keep the cluster-wide FULL flag in sync with the pg monitor's view
  if (mon->pgmon()->is_readable()) {
    if (!mon->pgmon()->pg_map.full_osds.empty()) {
      dout(5) << "There are full osds, setting full flag" << dendl;
      add_flag(CEPH_OSDMAP_FULL);
    } else if (osdmap.test_flag(CEPH_OSDMAP_FULL)){
      dout(10) << "No full osds, removing full flag" << dendl;
      remove_flag(CEPH_OSDMAP_FULL);
    }
    // propose if the FULL bit actually changed relative to the committed map
    if (pending_inc.new_flags != -1 &&
        (pending_inc.new_flags ^ osdmap.flags) & CEPH_OSDMAP_FULL) {
      dout(1) << "New setting for CEPH_OSDMAP_FULL -- doing propose" << dendl;
      do_propose = true;
    }
  }

// NOTE(review): these macros appear unused in the visible code — confirm
// before removing.
#define SWAP_PRIMARIES_AT_START 0
#define SWAP_TIME 1

  if (update_pools_status())
    do_propose = true;

  // pending pg_temp changes also force a proposal even if nothing above did
  if (do_propose ||
      !pending_inc.new_pg_temp.empty())
    propose_pending();
}
|
2423
|
|
2424
|
void OSDMonitor::handle_osd_timeouts(const utime_t &now,
|
2425
|
std::map<int,utime_t> &last_osd_report)
|
2426
|
{
|
2427
|
utime_t timeo(g_conf->mon_osd_report_timeout, 0);
|
2428
|
int max_osd = osdmap.get_max_osd();
|
2429
|
bool new_down = false;
|
2430
|
|
2431
|
for (int i=0; i < max_osd; ++i) {
|
2432
|
dout(30) << "handle_osd_timeouts: checking up on osd " << i << dendl;
|
2433
|
if (!osdmap.exists(i))
|
2434
|
continue;
|
2435
|
if (!osdmap.is_up(i))
|
2436
|
continue;
|
2437
|
const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
|
2438
|
if (t == last_osd_report.end()) {
|
2439
|
|
2440
|
last_osd_report[i] = now;
|
2441
|
} else if (can_mark_down(i)) {
|
2442
|
utime_t diff = now - t->second;
|
2443
|
if (diff > timeo) {
|
2444
|
mon->clog->info() << "osd." << i << " marked down after no pg stats for " << diff << "seconds\n";
|
2445
|
derr << "no osd or pg stats from osd." << i << " since " << t->second << ", " << diff
|
2446
|
<< " seconds ago. marking down" << dendl;
|
2447
|
pending_inc.new_state[i] = CEPH_OSD_UP;
|
2448
|
new_down = true;
|
2449
|
}
|
2450
|
}
|
2451
|
}
|
2452
|
if (new_down) {
|
2453
|
propose_pending();
|
2454
|
}
|
2455
|
}
|
2456
|
|
2457
|
void OSDMonitor::mark_all_down()
|
2458
|
{
|
2459
|
assert(mon->is_leader());
|
2460
|
|
2461
|
dout(7) << "mark_all_down" << dendl;
|
2462
|
|
2463
|
set<int32_t> ls;
|
2464
|
osdmap.get_all_osds(ls);
|
2465
|
for (set<int32_t>::iterator it = ls.begin();
|
2466
|
it != ls.end();
|
2467
|
++it) {
|
2468
|
if (osdmap.is_down(*it)) continue;
|
2469
|
pending_inc.new_state[*it] = CEPH_OSD_UP;
|
2470
|
}
|
2471
|
|
2472
|
propose_pending();
|
2473
|
}
|
2474
|
|
2475
|
// Contribute OSD-related items to 'ceph health': down-in osds, warning
// osdmap flags, legacy crush tunables, cache pools missing hit_sets, a
// zero mon_osd_down_out_interval, and per-pool health (delegated to
// get_pools_health).  'summary' gets one-line items; 'detail' (optional)
// gets per-object elaborations.
void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
                            list<pair<health_status_t,string> > *detail) const
{
  int num_osds = osdmap.get_num_osds();

  if (num_osds == 0) {
    summary.push_back(make_pair(HEALTH_ERR, "no osds"));
  } else {
    int num_in_osds = 0;
    int num_down_in_osds = 0;
    for (int i = 0; i < osdmap.get_max_osd(); i++) {
      // only osds that exist and are "in" count toward the ratio
      if (!osdmap.exists(i) || osdmap.is_out(i))
        continue;
      ++num_in_osds;
      if (!osdmap.is_up(i)) {
        ++num_down_in_osds;
        if (detail) {
          const osd_info_t& info = osdmap.get_info(i);
          ostringstream ss;
          ss << "osd." << i << " is down since epoch " << info.down_at
             << ", last address " << osdmap.get_addr(i);
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }
    }
    assert(num_down_in_osds <= num_in_osds);
    if (num_down_in_osds > 0) {
      ostringstream ss;
      ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // warn when any of these behavior-altering flags is set
    if (osdmap.test_flag(CEPH_OSDMAP_PAUSERD |
                         CEPH_OSDMAP_PAUSEWR |
                         CEPH_OSDMAP_NOUP |
                         CEPH_OSDMAP_NODOWN |
                         CEPH_OSDMAP_NOIN |
                         CEPH_OSDMAP_NOOUT |
                         CEPH_OSDMAP_NOBACKFILL |
                         CEPH_OSDMAP_NOREBALANCE |
                         CEPH_OSDMAP_NORECOVER |
                         CEPH_OSDMAP_NOSCRUB |
                         CEPH_OSDMAP_NODEEP_SCRUB |
                         CEPH_OSDMAP_NOTIERAGENT)) {
      ostringstream ss;
      ss << osdmap.get_flag_string() << " flag(s) set";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    // old crush tunables?
    if (g_conf->mon_warn_on_legacy_crush_tunables) {
      if (osdmap.crush->has_legacy_tunables()) {
        ostringstream ss;
        ss << "crush map has legacy tunables";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
        if (detail) {
          // the detail item reuses ss, so it includes the summary text too
          ss << "; see http://ceph.com/docs/master/rados/operations/crush-map/#tunables";
          detail->push_back(make_pair(HEALTH_WARN, ss.str()));
        }
      }
    }

    // cache pools whose cache mode needs hit sets but have none configured
    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
      int problem_cache_pools = 0;
      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
           p != osdmap.pools.end();
           ++p) {
        const pg_pool_t& info = p->second;
        if (info.cache_mode_requires_hit_set() &&
            info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
          ++problem_cache_pools;
          if (detail) {
            ostringstream ss;
            ss << "pool '" << osdmap.get_pool_name(p->first)
               << "' with cache_mode " << info.get_cache_mode_name()
               << " needs hit_set_type to be set but it is not";
            detail->push_back(make_pair(HEALTH_WARN, ss.str()));
          }
        }
      }
      if (problem_cache_pools) {
        ostringstream ss;
        ss << problem_cache_pools << " cache pools are missing hit_sets";
        summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    // a zero down-out interval behaves like the 'noout' flag; warn
    if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
        g_conf->mon_osd_down_out_interval == 0) {
      ostringstream ss;
      ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail) {
        ss << "; this has the same effect as the 'noout' flag";
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
      }
    }

    get_pools_health(summary, detail);
  }
}
|
2592
|
|
2593
|
// Dump monitor-side osd state into a Formatter: the osdmap itself,
// per-osd metadata, the committed epoch bounds, and the crush map.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  f->open_array_section("osd_metadata");
  for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
    if (!osdmap.exists(osd))
      continue;  // skip unallocated ids
    f->open_object_section("osd");
    f->dump_unsigned("id", osd);
    dump_osd_metadata(osd, f, NULL);
    f->close_section();
  }
  f->close_section();

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();
}
|
2617
|
|
2618
|
bool OSDMonitor::preprocess_command(MMonCommand *m)
|
2619
|
{
|
2620
|
int r = 0;
|
2621
|
bufferlist rdata;
|
2622
|
stringstream ss, ds;
|
2623
|
|
2624
|
map<string, cmd_vartype> cmdmap;
|
2625
|
if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
|
2626
|
string rs = ss.str();
|
2627
|
mon->reply_command(m, -EINVAL, rs, get_last_committed());
|
2628
|
return true;
|
2629
|
}
|
2630
|
|
2631
|
MonSession *session = m->get_session();
|
2632
|
if (!session) {
|
2633
|
mon->reply_command(m, -EACCES, "access denied", rdata, get_last_committed());
|
2634
|
return true;
|
2635
|
}
|
2636
|
|
2637
|
string prefix;
|
2638
|
cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
|
2639
|
|
2640
|
string format;
|
2641
|
cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
|
2642
|
boost::scoped_ptr<Formatter> f(Formatter::create(format));
|
2643
|
|
2644
|
if (prefix == "osd stat") {
|
2645
|
osdmap.print_summary(f.get(), ds);
|
2646
|
if (f)
|
2647
|
f->flush(rdata);
|
2648
|
else
|
2649
|
rdata.append(ds);
|
2650
|
}
|
2651
|
else if (prefix == "osd perf") {
|
2652
|
const PGMap &pgm = mon->pgmon()->pg_map;
|
2653
|
if (f) {
|
2654
|
f->open_object_section("osdstats");
|
2655
|
pgm.dump_osd_perf_stats(f.get());
|
2656
|
f->close_section();
|
2657
|
f->flush(ds);
|
2658
|
} else {
|
2659
|
pgm.print_osd_perf_stats(&ds);
|
2660
|
}
|
2661
|
rdata.append(ds);
|
2662
|
}
|
2663
|
else if (prefix == "osd blocked-by") {
|
2664
|
const PGMap &pgm = mon->pgmon()->pg_map;
|
2665
|
if (f) {
|
2666
|
f->open_object_section("osd_blocked_by");
|
2667
|
pgm.dump_osd_blocked_by_stats(f.get());
|
2668
|
f->close_section();
|
2669
|
f->flush(ds);
|
2670
|
} else {
|
2671
|
pgm.print_osd_blocked_by_stats(&ds);
|
2672
|
}
|
2673
|
rdata.append(ds);
|
2674
|
}
|
2675
|
else if (prefix == "osd dump" ||
|
2676
|
prefix == "osd tree" ||
|
2677
|
prefix == "osd ls" ||
|
2678
|
prefix == "osd getmap" ||
|
2679
|
prefix == "osd getcrushmap") {
|
2680
|
string val;
|
2681
|
|
2682
|
epoch_t epoch = 0;
|
2683
|
int64_t epochnum;
|
2684
|
cmd_getval(g_ceph_context, cmdmap, "epoch", epochnum, (int64_t)0);
|
2685
|
epoch = epochnum;
|
2686
|
if (!epoch)
|
2687
|
epoch = osdmap.get_epoch();
|
2688
|
|
2689
|
bufferlist osdmap_bl;
|
2690
|
int err = get_version_full(epoch, osdmap_bl);
|
2691
|
if (err == -ENOENT) {
|
2692
|
r = -ENOENT;
|
2693
|
ss << "there is no map for epoch " << epoch;
|
2694
|
goto reply;
|
2695
|
}
|
2696
|
assert(err == 0);
|
2697
|
assert(osdmap_bl.length());
|
2698
|
|
2699
|
OSDMap *p;
|
2700
|
if (epoch == osdmap.get_epoch()) {
|
2701
|
p = &osdmap;
|
2702
|
} else {
|
2703
|
p = new OSDMap;
|
2704
|
p->decode(osdmap_bl);
|
2705
|
}
|
2706
|
|
2707
|
if (prefix == "osd dump") {
|
2708
|
stringstream ds;
|
2709
|
if (f) {
|
2710
|
f->open_object_section("osdmap");
|
2711
|
p->dump(f.get());
|
2712
|
f->close_section();
|
2713
|
f->flush(ds);
|
2714
|
} else {
|
2715
|
p->print(ds);
|
2716
|
}
|
2717
|
rdata.append(ds);
|
2718
|
if (!f)
|
2719
|
ds << " ";
|
2720
|
} else if (prefix == "osd ls") {
|
2721
|
if (f) {
|
2722
|
f->open_array_section("osds");
|
2723
|
for (int i = 0; i < osdmap.get_max_osd(); i++) {
|
2724
|
if (osdmap.exists(i)) {
|
2725
|
f->dump_int("osd", i);
|
2726
|
}
|
2727
|
}
|
2728
|
f->close_section();
|
2729
|
f->flush(ds);
|
2730
|
} else {
|
2731
|
bool first = true;
|
2732
|
for (int i = 0; i < osdmap.get_max_osd(); i++) {
|
2733
|
if (osdmap.exists(i)) {
|
2734
|
if (!first)
|
2735
|
ds << "\n";
|
2736
|
first = false;
|
2737
|
ds << i;
|
2738
|
}
|
2739
|
}
|
2740
|
}
|
2741
|
rdata.append(ds);
|
2742
|
} else if (prefix == "osd tree") {
|
2743
|
if (f) {
|
2744
|
f->open_object_section("tree");
|
2745
|
p->print_tree(NULL, f.get());
|
2746
|
f->close_section();
|
2747
|
f->flush(ds);
|
2748
|
} else {
|
2749
|
p->print_tree(&ds, NULL);
|
2750
|
}
|
2751
|
rdata.append(ds);
|
2752
|
} else if (prefix == "osd getmap") {
|
2753
|
rdata.append(osdmap_bl);
|
2754
|
ss << "got osdmap epoch " << p->get_epoch();
|
2755
|
} else if (prefix == "osd getcrushmap") {
|
2756
|
p->crush->encode(rdata);
|
2757
|
ss << "got crush map from osdmap epoch " << p->get_epoch();
|
2758
|
}
|
2759
|
if (p != &osdmap)
|
2760
|
delete p;
|
2761
|
} else if (prefix == "osd df") {
|
2762
|
string method;
|
2763
|
cmd_getval(g_ceph_context, cmdmap, "output_method", method);
|
2764
|
print_utilization(ds, f ? f.get() : NULL, method == "tree");
|
2765
|
rdata.append(ds);
|
2766
|
} else if (prefix == "osd getmaxosd") {
|
2767
|
if (f) {
|
2768
|
f->open_object_section("getmaxosd");
|
2769
|
f->dump_unsigned("epoch", osdmap.get_epoch());
|
2770
|
f->dump_int("max_osd", osdmap.get_max_osd());
|
2771
|
f->close_section();
|
2772
|
f->flush(rdata);
|
2773
|
} else {
|
2774
|
ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
|
2775
|
rdata.append(ds);
|
2776
|
}
|
2777
|
} else if (prefix == "osd find") {
|
2778
|
int64_t osd;
|
2779
|
if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
|
2780
|
ss << "unable to parse osd id value '"
|
2781
|
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
|
2782
|
r = -EINVAL;
|
2783
|
goto reply;
|
2784
|
}
|
2785
|
if (!osdmap.exists(osd)) {
|
2786
|
ss << "osd." << osd << " does not exist";
|
2787
|
r = -ENOENT;
|
2788
|
goto reply;
|
2789
|
}
|
2790
|
string format;
|
2791
|
cmd_getval(g_ceph_context, cmdmap, "format", format);
|
2792
|
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
|
2793
|
f->open_object_section("osd_location");
|
2794
|
f->dump_int("osd", osd);
|
2795
|
f->dump_stream("ip") << osdmap.get_addr(osd);
|
2796
|
f->open_object_section("crush_location");
|
2797
|
map<string,string> loc = osdmap.crush->get_full_location(osd);
|
2798
|
for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
|
2799
|
f->dump_string(p->first.c_str(), p->second);
|
2800
|
f->close_section();
|
2801
|
f->close_section();
|
2802
|
f->flush(rdata);
|
2803
|
} else if (prefix == "osd metadata") {
|
2804
|
int64_t osd;
|
2805
|
if (!cmd_getval(g_ceph_context, cmdmap, "id", osd)) {
|
2806
|
ss << "unable to parse osd id value '"
|
2807
|
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
|
2808
|
r = -EINVAL;
|
2809
|
goto reply;
|
2810
|
}
|
2811
|
if (!osdmap.exists(osd)) {
|
2812
|
ss << "osd." << osd << " does not exist";
|
2813
|
r = -ENOENT;
|
2814
|
goto reply;
|
2815
|
}
|
2816
|
string format;
|
2817
|
cmd_getval(g_ceph_context, cmdmap, "format", format);
|
2818
|
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
|
2819
|
f->open_object_section("osd_metadata");
|
2820
|
r = dump_osd_metadata(osd, f.get(), &ss);
|
2821
|
if (r < 0)
|
2822
|
goto reply;
|
2823
|
f->close_section();
|
2824
|
f->flush(rdata);
|
2825
|
} else if (prefix == "osd map") {
|
2826
|
string poolstr, objstr, namespacestr;
|
2827
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
2828
|
cmd_getval(g_ceph_context, cmdmap, "object", objstr);
|
2829
|
cmd_getval(g_ceph_context, cmdmap, "nspace", namespacestr);
|
2830
|
|
2831
|
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
|
2832
|
if (pool < 0) {
|
2833
|
ss << "pool " << poolstr << " does not exist";
|
2834
|
r = -ENOENT;
|
2835
|
goto reply;
|
2836
|
}
|
2837
|
object_locator_t oloc(pool, namespacestr);
|
2838
|
object_t oid(objstr);
|
2839
|
pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
|
2840
|
pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
|
2841
|
vector<int> up, acting;
|
2842
|
int up_p, acting_p;
|
2843
|
osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
|
2844
|
|
2845
|
string fullobjname;
|
2846
|
if (!namespacestr.empty())
|
2847
|
fullobjname = namespacestr + string("/") + oid.name;
|
2848
|
else
|
2849
|
fullobjname = oid.name;
|
2850
|
if (f) {
|
2851
|
f->open_object_section("osd_map");
|
2852
|
f->dump_unsigned("epoch", osdmap.get_epoch());
|
2853
|
f->dump_string("pool", poolstr);
|
2854
|
f->dump_int("pool_id", pool);
|
2855
|
f->dump_stream("objname") << fullobjname;
|
2856
|
f->dump_stream("raw_pgid") << pgid;
|
2857
|
f->dump_stream("pgid") << mpgid;
|
2858
|
f->open_array_section("up");
|
2859
|
for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
|
2860
|
f->dump_int("osd", *p);
|
2861
|
f->close_section();
|
2862
|
f->dump_int("up_primary", up_p);
|
2863
|
f->open_array_section("acting");
|
2864
|
for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
|
2865
|
f->dump_int("osd", *p);
|
2866
|
f->close_section();
|
2867
|
f->dump_int("acting_primary", acting_p);
|
2868
|
f->close_section();
|
2869
|
f->flush(rdata);
|
2870
|
} else {
|
2871
|
ds << "osdmap e" << osdmap.get_epoch()
|
2872
|
<< " pool '" << poolstr << "' (" << pool << ")"
|
2873
|
<< " object '" << fullobjname << "' ->"
|
2874
|
<< " pg " << pgid << " (" << mpgid << ")"
|
2875
|
<< " -> up (" << up << ", p" << up_p << ") acting ("
|
2876
|
<< acting << ", p" << acting_p << ")";
|
2877
|
rdata.append(ds);
|
2878
|
}
|
2879
|
} else if ((prefix == "osd scrub" ||
|
2880
|
prefix == "osd deep-scrub" ||
|
2881
|
prefix == "osd repair")) {
|
2882
|
string whostr;
|
2883
|
cmd_getval(g_ceph_context, cmdmap, "who", whostr);
|
2884
|
vector<string> pvec;
|
2885
|
get_str_vec(prefix, pvec);
|
2886
|
|
2887
|
if (whostr == "*") {
|
2888
|
ss << "osds ";
|
2889
|
int c = 0;
|
2890
|
for (int i = 0; i < osdmap.get_max_osd(); i++)
|
2891
|
if (osdmap.is_up(i)) {
|
2892
|
ss << (c++ ? "," : "") << i;
|
2893
|
mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
|
2894
|
pvec.back() == "repair",
|
2895
|
pvec.back() == "deep-scrub"),
|
2896
|
osdmap.get_inst(i));
|
2897
|
}
|
2898
|
r = 0;
|
2899
|
ss << " instructed to " << pvec.back();
|
2900
|
} else {
|
2901
|
long osd = parse_osd_id(whostr.c_str(), &ss);
|
2902
|
if (osd < 0) {
|
2903
|
r = -EINVAL;
|
2904
|
} else if (osdmap.is_up(osd)) {
|
2905
|
mon->try_send_message(new MOSDScrub(osdmap.get_fsid(),
|
2906
|
pvec.back() == "repair",
|
2907
|
pvec.back() == "deep-scrub"),
|
2908
|
osdmap.get_inst(osd));
|
2909
|
ss << "osd." << osd << " instructed to " << pvec.back();
|
2910
|
} else {
|
2911
|
ss << "osd." << osd << " is not up";
|
2912
|
r = -EAGAIN;
|
2913
|
}
|
2914
|
}
|
2915
|
} else if (prefix == "osd lspools") {
|
2916
|
int64_t auid;
|
2917
|
cmd_getval(g_ceph_context, cmdmap, "auid", auid, int64_t(0));
|
2918
|
if (f)
|
2919
|
f->open_array_section("pools");
|
2920
|
for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
|
2921
|
p != osdmap.pools.end();
|
2922
|
++p) {
|
2923
|
if (!auid || p->second.auid == (uint64_t)auid) {
|
2924
|
if (f) {
|
2925
|
f->open_object_section("pool");
|
2926
|
f->dump_int("poolnum", p->first);
|
2927
|
f->dump_string("poolname", osdmap.pool_name[p->first]);
|
2928
|
f->close_section();
|
2929
|
} else {
|
2930
|
ds << p->first << ' ' << osdmap.pool_name[p->first] << ',';
|
2931
|
}
|
2932
|
}
|
2933
|
}
|
2934
|
if (f) {
|
2935
|
f->close_section();
|
2936
|
f->flush(ds);
|
2937
|
}
|
2938
|
rdata.append(ds);
|
2939
|
} else if (prefix == "osd blacklist ls") {
|
2940
|
if (f)
|
2941
|
f->open_array_section("blacklist");
|
2942
|
|
2943
|
for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
|
2944
|
p != osdmap.blacklist.end();
|
2945
|
++p) {
|
2946
|
if (f) {
|
2947
|
f->open_object_section("entry");
|
2948
|
f->dump_stream("addr") << p->first;
|
2949
|
f->dump_stream("until") << p->second;
|
2950
|
f->close_section();
|
2951
|
} else {
|
2952
|
stringstream ss;
|
2953
|
string s;
|
2954
|
ss << p->first << " " << p->second;
|
2955
|
getline(ss, s);
|
2956
|
s += "\n";
|
2957
|
rdata.append(s);
|
2958
|
}
|
2959
|
}
|
2960
|
if (f) {
|
2961
|
f->close_section();
|
2962
|
f->flush(rdata);
|
2963
|
}
|
2964
|
ss << "listed " << osdmap.blacklist.size() << " entries";
|
2965
|
|
2966
|
} else if (prefix == "osd pool ls") {
|
2967
|
string detail;
|
2968
|
cmd_getval(g_ceph_context, cmdmap, "detail", detail);
|
2969
|
if (!f && detail == "detail") {
|
2970
|
ostringstream ss;
|
2971
|
osdmap.print_pools(ss);
|
2972
|
rdata.append(ss.str());
|
2973
|
} else {
|
2974
|
if (f)
|
2975
|
f->open_array_section("pools");
|
2976
|
for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
|
2977
|
it != osdmap.get_pools().end();
|
2978
|
++it) {
|
2979
|
if (f) {
|
2980
|
if (detail == "detail") {
|
2981
|
f->open_object_section("pool");
|
2982
|
f->dump_string("pool_name", osdmap.get_pool_name(it->first));
|
2983
|
it->second.dump(f.get());
|
2984
|
f->close_section();
|
2985
|
} else {
|
2986
|
f->dump_string("pool_name", osdmap.get_pool_name(it->first));
|
2987
|
}
|
2988
|
} else {
|
2989
|
rdata.append(osdmap.get_pool_name(it->first) + "\n");
|
2990
|
}
|
2991
|
}
|
2992
|
if (f) {
|
2993
|
f->close_section();
|
2994
|
f->flush(rdata);
|
2995
|
}
|
2996
|
}
|
2997
|
|
2998
|
} else if (prefix == "osd crush get-tunable") {
|
2999
|
string tunable;
|
3000
|
cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
|
3001
|
int value;
|
3002
|
cmd_getval(g_ceph_context, cmdmap, "value", value);
|
3003
|
ostringstream rss;
|
3004
|
if (f)
|
3005
|
f->open_object_section("tunable");
|
3006
|
if (tunable == "straw_calc_version") {
|
3007
|
if (f)
|
3008
|
f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
|
3009
|
else
|
3010
|
rss << osdmap.crush->get_straw_calc_version() << "\n";
|
3011
|
} else {
|
3012
|
r = -EINVAL;
|
3013
|
goto reply;
|
3014
|
}
|
3015
|
if (f) {
|
3016
|
f->close_section();
|
3017
|
f->flush(rdata);
|
3018
|
} else {
|
3019
|
rdata.append(rss.str());
|
3020
|
}
|
3021
|
r = 0;
|
3022
|
|
3023
|
} else if (prefix == "osd pool get") {
|
3024
|
string poolstr;
|
3025
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
3026
|
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
|
3027
|
if (pool < 0) {
|
3028
|
ss << "unrecognized pool '" << poolstr << "'";
|
3029
|
r = -ENOENT;
|
3030
|
goto reply;
|
3031
|
}
|
3032
|
|
3033
|
const pg_pool_t *p = osdmap.get_pg_pool(pool);
|
3034
|
string var;
|
3035
|
cmd_getval(g_ceph_context, cmdmap, "var", var);
|
3036
|
|
3037
|
if (!p->is_tier() &&
|
3038
|
(var == "hit_set_type" || var == "hit_set_period" ||
|
3039
|
var == "hit_set_count" || var == "hit_set_fpp" ||
|
3040
|
var == "target_max_objects" || var == "target_max_bytes" ||
|
3041
|
var == "cache_target_full_ratio" ||
|
3042
|
var == "cache_target_dirty_ratio" ||
|
3043
|
var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
|
3044
|
ss << "pool '" << poolstr
|
3045
|
<< "' is not a tier pool: variable not applicable";
|
3046
|
r = -EACCES;
|
3047
|
goto reply;
|
3048
|
}
|
3049
|
|
3050
|
if (!p->is_erasure() && var == "erasure_code_profile") {
|
3051
|
ss << "pool '" << poolstr
|
3052
|
<< "' is not a erasure pool: variable not applicable";
|
3053
|
r = -EACCES;
|
3054
|
goto reply;
|
3055
|
}
|
3056
|
|
3057
|
if (f) {
|
3058
|
f->open_object_section("pool");
|
3059
|
f->dump_string("pool", poolstr);
|
3060
|
f->dump_int("pool_id", pool);
|
3061
|
|
3062
|
if (var == "pg_num") {
|
3063
|
f->dump_int("pg_num", p->get_pg_num());
|
3064
|
} else if (var == "pgp_num") {
|
3065
|
f->dump_int("pgp_num", p->get_pgp_num());
|
3066
|
} else if (var == "auid") {
|
3067
|
f->dump_int("auid", p->get_auid());
|
3068
|
} else if (var == "size") {
|
3069
|
f->dump_int("size", p->get_size());
|
3070
|
} else if (var == "min_size") {
|
3071
|
f->dump_int("min_size", p->get_min_size());
|
3072
|
} else if (var == "crash_replay_interval") {
|
3073
|
f->dump_int("crash_replay_interval", p->get_crash_replay_interval());
|
3074
|
} else if (var == "crush_ruleset") {
|
3075
|
f->dump_int("crush_ruleset", p->get_crush_ruleset());
|
3076
|
} else if (var == "hit_set_period") {
|
3077
|
f->dump_int("hit_set_period", p->hit_set_period);
|
3078
|
} else if (var == "hit_set_count") {
|
3079
|
f->dump_int("hit_set_count", p->hit_set_count);
|
3080
|
} else if (var == "hit_set_type") {
|
3081
|
f->dump_string("hit_set_type", HitSet::get_type_name(p->hit_set_params.get_type()));
|
3082
|
} else if (var == "hit_set_fpp") {
|
3083
|
if (p->hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
|
3084
|
f->close_section();
|
3085
|
ss << "hit set is no of type Bloom; invalid to get a false positive rate!";
|
3086
|
r = -EINVAL;
|
3087
|
goto reply;
|
3088
|
} else {
|
3089
|
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
|
3090
|
f->dump_float("hit_set_fpp", bloomp->get_fpp());
|
3091
|
}
|
3092
|
} else if (var == "target_max_objects") {
|
3093
|
f->dump_unsigned("target_max_objects", p->target_max_objects);
|
3094
|
} else if (var == "target_max_bytes") {
|
3095
|
f->dump_unsigned("target_max_bytes", p->target_max_bytes);
|
3096
|
} else if (var == "cache_target_dirty_ratio") {
|
3097
|
f->dump_unsigned("cache_target_dirty_ratio_micro",
|
3098
|
p->cache_target_dirty_ratio_micro);
|
3099
|
f->dump_float("cache_target_dirty_ratio",
|
3100
|
((float)p->cache_target_dirty_ratio_micro/1000000));
|
3101
|
} else if (var == "cache_target_full_ratio") {
|
3102
|
f->dump_unsigned("cache_target_full_ratio_micro",
|
3103
|
p->cache_target_full_ratio_micro);
|
3104
|
f->dump_float("cache_target_full_ratio",
|
3105
|
((float)p->cache_target_full_ratio_micro/1000000));
|
3106
|
} else if (var == "cache_min_flush_age") {
|
3107
|
f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
|
3108
|
} else if (var == "cache_min_evict_age") {
|
3109
|
f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
|
3110
|
} else if (var == "erasure_code_profile") {
|
3111
|
f->dump_string("erasure_code_profile", p->erasure_code_profile);
|
3112
|
} else if (var == "min_read_recency_for_promote") {
|
3113
|
f->dump_int("min_read_recency_for_promote", p->min_read_recency_for_promote);
|
3114
|
} else if (var == "write_fadvise_dontneed") {
|
3115
|
f->dump_string("write_fadvise_dontneed", p->has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED) ? "true" : "false");
|
3116
|
}
|
3117
|
|
3118
|
f->close_section();
|
3119
|
f->flush(rdata);
|
3120
|
} else {
|
3121
|
if (var == "pg_num") {
|
3122
|
ss << "pg_num: " << p->get_pg_num();
|
3123
|
} else if (var == "pgp_num") {
|
3124
|
ss << "pgp_num: " << p->get_pgp_num();
|
3125
|
} else if (var == "auid") {
|
3126
|
ss << "auid: " << p->get_auid();
|
3127
|
} else if (var == "size") {
|
3128
|
ss << "size: " << p->get_size();
|
3129
|
} else if (var == "min_size") {
|
3130
|
ss << "min_size: " << p->get_min_size();
|
3131
|
} else if (var == "crash_replay_interval") {
|
3132
|
ss << "crash_replay_interval: " << p->get_crash_replay_interval();
|
3133
|
} else if (var == "crush_ruleset") {
|
3134
|
ss << "crush_ruleset: " << p->get_crush_ruleset();
|
3135
|
} else if (var == "hit_set_period") {
|
3136
|
ss << "hit_set_period: " << p->hit_set_period;
|
3137
|
} else if (var == "hit_set_count") {
|
3138
|
ss << "hit_set_count: " << p->hit_set_count;
|
3139
|
} else if (var == "hit_set_type") {
|
3140
|
ss << "hit_set_type: " << HitSet::get_type_name(p->hit_set_params.get_type());
|
3141
|
} else if (var == "hit_set_fpp") {
|
3142
|
if (p->hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
|
3143
|
ss << "hit set is no of type Bloom; invalid to get a false positive rate!";
|
3144
|
r = -EINVAL;
|
3145
|
goto reply;
|
3146
|
}
|
3147
|
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
|
3148
|
ss << "hit_set_fpp: " << bloomp->get_fpp();
|
3149
|
} else if (var == "target_max_objects") {
|
3150
|
ss << "target_max_objects: " << p->target_max_objects;
|
3151
|
} else if (var == "target_max_bytes") {
|
3152
|
ss << "target_max_bytes: " << p->target_max_bytes;
|
3153
|
} else if (var == "cache_target_dirty_ratio") {
|
3154
|
ss << "cache_target_dirty_ratio: "
|
3155
|
<< ((float)p->cache_target_dirty_ratio_micro/1000000);
|
3156
|
} else if (var == "cache_target_full_ratio") {
|
3157
|
ss << "cache_target_full_ratio: "
|
3158
|
<< ((float)p->cache_target_full_ratio_micro/1000000);
|
3159
|
} else if (var == "cache_min_flush_age") {
|
3160
|
ss << "cache_min_flush_age: " << p->cache_min_flush_age;
|
3161
|
} else if (var == "cache_min_evict_age") {
|
3162
|
ss << "cache_min_evict_age: " << p->cache_min_evict_age;
|
3163
|
} else if (var == "erasure_code_profile") {
|
3164
|
ss << "erasure_code_profile: " << p->erasure_code_profile;
|
3165
|
} else if (var == "min_read_recency_for_promote") {
|
3166
|
ss << "min_read_recency_for_promote: " << p->min_read_recency_for_promote;
|
3167
|
} else if (var == "write_fadvise_dontneed") {
|
3168
|
ss << "write_fadvise_dontneed: " << (p->has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED) ? "true" : "false");
|
3169
|
}
|
3170
|
|
3171
|
rdata.append(ss);
|
3172
|
ss.str("");
|
3173
|
}
|
3174
|
r = 0;
|
3175
|
|
3176
|
} else if (prefix == "osd pool stats") {
|
3177
|
string pool_name;
|
3178
|
cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
|
3179
|
|
3180
|
PGMap& pg_map = mon->pgmon()->pg_map;
|
3181
|
|
3182
|
int64_t poolid = -ENOENT;
|
3183
|
bool one_pool = false;
|
3184
|
if (!pool_name.empty()) {
|
3185
|
poolid = osdmap.lookup_pg_pool_name(pool_name);
|
3186
|
if (poolid < 0) {
|
3187
|
assert(poolid == -ENOENT);
|
3188
|
ss << "unrecognized pool '" << pool_name << "'";
|
3189
|
r = -ENOENT;
|
3190
|
goto reply;
|
3191
|
}
|
3192
|
one_pool = true;
|
3193
|
}
|
3194
|
|
3195
|
stringstream rs;
|
3196
|
|
3197
|
if (f)
|
3198
|
f->open_array_section("pool_stats");
|
3199
|
if (osdmap.get_pools().size() == 0) {
|
3200
|
if (!f)
|
3201
|
ss << "there are no pools!";
|
3202
|
goto stats_out;
|
3203
|
}
|
3204
|
|
3205
|
for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
|
3206
|
it != osdmap.get_pools().end();
|
3207
|
++it) {
|
3208
|
|
3209
|
if (!one_pool)
|
3210
|
poolid = it->first;
|
3211
|
|
3212
|
pool_name = osdmap.get_pool_name(poolid);
|
3213
|
|
3214
|
if (f) {
|
3215
|
f->open_object_section("pool");
|
3216
|
f->dump_string("pool_name", pool_name.c_str());
|
3217
|
f->dump_int("pool_id", poolid);
|
3218
|
f->open_object_section("recovery");
|
3219
|
}
|
3220
|
|
3221
|
list<string> sl;
|
3222
|
stringstream tss;
|
3223
|
pg_map.pool_recovery_summary(f.get(), &sl, poolid);
|
3224
|
if (!f && !sl.empty()) {
|
3225
|
for (list<string>::iterator p = sl.begin(); p != sl.end(); ++p)
|
3226
|
tss << " " << *p << "\n";
|
3227
|
}
|
3228
|
|
3229
|
if (f) {
|
3230
|
f->close_section();
|
3231
|
f->open_object_section("recovery_rate");
|
3232
|
}
|
3233
|
|
3234
|
ostringstream rss;
|
3235
|
pg_map.pool_recovery_rate_summary(f.get(), &rss, poolid);
|
3236
|
if (!f && !rss.str().empty())
|
3237
|
tss << " recovery io " << rss.str() << "\n";
|
3238
|
|
3239
|
if (f) {
|
3240
|
f->close_section();
|
3241
|
f->open_object_section("client_io_rate");
|
3242
|
}
|
3243
|
|
3244
|
rss.clear();
|
3245
|
rss.str("");
|
3246
|
|
3247
|
pg_map.pool_client_io_rate_summary(f.get(), &rss, poolid);
|
3248
|
if (!f && !rss.str().empty())
|
3249
|
tss << " client io " << rss.str() << "\n";
|
3250
|
|
3251
|
if (f) {
|
3252
|
f->close_section();
|
3253
|
f->close_section();
|
3254
|
} else {
|
3255
|
rs << "pool " << pool_name << " id " << poolid << "\n";
|
3256
|
if (!tss.str().empty())
|
3257
|
rs << tss.str() << "\n";
|
3258
|
else
|
3259
|
rs << " nothing is going on\n\n";
|
3260
|
}
|
3261
|
|
3262
|
if (one_pool)
|
3263
|
break;
|
3264
|
}
|
3265
|
|
3266
|
stats_out:
|
3267
|
if (f) {
|
3268
|
f->close_section();
|
3269
|
f->flush(rdata);
|
3270
|
} else {
|
3271
|
rdata.append(rs.str());
|
3272
|
}
|
3273
|
rdata.append("\n");
|
3274
|
r = 0;
|
3275
|
|
3276
|
} else if (prefix == "osd pool get-quota") {
|
3277
|
string pool_name;
|
3278
|
cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
|
3279
|
|
3280
|
int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
|
3281
|
if (poolid < 0) {
|
3282
|
assert(poolid == -ENOENT);
|
3283
|
ss << "unrecognized pool '" << pool_name << "'";
|
3284
|
r = -ENOENT;
|
3285
|
goto reply;
|
3286
|
}
|
3287
|
const pg_pool_t *p = osdmap.get_pg_pool(poolid);
|
3288
|
|
3289
|
if (f) {
|
3290
|
f->open_object_section("pool_quotas");
|
3291
|
f->dump_string("pool_name", pool_name);
|
3292
|
f->dump_unsigned("pool_id", poolid);
|
3293
|
f->dump_unsigned("quota_max_objects", p->quota_max_objects);
|
3294
|
f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
|
3295
|
f->close_section();
|
3296
|
f->flush(rdata);
|
3297
|
} else {
|
3298
|
stringstream rs;
|
3299
|
rs << "quotas for pool '" << pool_name << "':\n"
|
3300
|
<< " max objects: ";
|
3301
|
if (p->quota_max_objects == 0)
|
3302
|
rs << "N/A";
|
3303
|
else
|
3304
|
rs << si_t(p->quota_max_objects) << " objects";
|
3305
|
rs << "\n"
|
3306
|
<< " max bytes : ";
|
3307
|
if (p->quota_max_bytes == 0)
|
3308
|
rs << "N/A";
|
3309
|
else
|
3310
|
rs << si_t(p->quota_max_bytes) << "B";
|
3311
|
rdata.append(rs.str());
|
3312
|
}
|
3313
|
rdata.append("\n");
|
3314
|
r = 0;
|
3315
|
} else if (prefix == "osd crush rule list" ||
|
3316
|
prefix == "osd crush rule ls") {
|
3317
|
string format;
|
3318
|
cmd_getval(g_ceph_context, cmdmap, "format", format);
|
3319
|
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
|
3320
|
f->open_array_section("rules");
|
3321
|
osdmap.crush->list_rules(f.get());
|
3322
|
f->close_section();
|
3323
|
ostringstream rs;
|
3324
|
f->flush(rs);
|
3325
|
rs << "\n";
|
3326
|
rdata.append(rs.str());
|
3327
|
} else if (prefix == "osd crush rule dump") {
|
3328
|
string name;
|
3329
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
3330
|
string format;
|
3331
|
cmd_getval(g_ceph_context, cmdmap, "format", format);
|
3332
|
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
|
3333
|
if (name == "") {
|
3334
|
f->open_array_section("rules");
|
3335
|
osdmap.crush->dump_rules(f.get());
|
3336
|
f->close_section();
|
3337
|
} else {
|
3338
|
int ruleno = osdmap.crush->get_rule_id(name);
|
3339
|
if (ruleno < 0) {
|
3340
|
ss << "unknown crush ruleset '" << name << "'";
|
3341
|
r = ruleno;
|
3342
|
goto reply;
|
3343
|
}
|
3344
|
osdmap.crush->dump_rule(ruleno, f.get());
|
3345
|
}
|
3346
|
ostringstream rs;
|
3347
|
f->flush(rs);
|
3348
|
rs << "\n";
|
3349
|
rdata.append(rs.str());
|
3350
|
} else if (prefix == "osd crush dump") {
|
3351
|
string format;
|
3352
|
cmd_getval(g_ceph_context, cmdmap, "format", format);
|
3353
|
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
|
3354
|
f->open_object_section("crush_map");
|
3355
|
osdmap.crush->dump(f.get());
|
3356
|
f->close_section();
|
3357
|
ostringstream rs;
|
3358
|
f->flush(rs);
|
3359
|
rs << "\n";
|
3360
|
rdata.append(rs.str());
|
3361
|
} else if (prefix == "osd crush show-tunables") {
|
3362
|
string format;
|
3363
|
cmd_getval(g_ceph_context, cmdmap, "format", format);
|
3364
|
boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
|
3365
|
f->open_object_section("crush_map_tunables");
|
3366
|
osdmap.crush->dump_tunables(f.get());
|
3367
|
f->close_section();
|
3368
|
ostringstream rs;
|
3369
|
f->flush(rs);
|
3370
|
rs << "\n";
|
3371
|
rdata.append(rs.str());
|
3372
|
} else if (prefix == "osd erasure-code-profile ls") {
|
3373
|
const map<string,map<string,string> > &profiles =
|
3374
|
osdmap.get_erasure_code_profiles();
|
3375
|
if (f)
|
3376
|
f->open_array_section("erasure-code-profiles");
|
3377
|
for(map<string,map<string,string> >::const_iterator i = profiles.begin();
|
3378
|
i != profiles.end();
|
3379
|
++i) {
|
3380
|
if (f)
|
3381
|
f->dump_string("profile", i->first.c_str());
|
3382
|
else
|
3383
|
rdata.append(i->first + "\n");
|
3384
|
}
|
3385
|
if (f) {
|
3386
|
f->close_section();
|
3387
|
ostringstream rs;
|
3388
|
f->flush(rs);
|
3389
|
rs << "\n";
|
3390
|
rdata.append(rs.str());
|
3391
|
}
|
3392
|
} else if (prefix == "osd erasure-code-profile get") {
|
3393
|
string name;
|
3394
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
3395
|
if (!osdmap.has_erasure_code_profile(name)) {
|
3396
|
ss << "unknown erasure code profile '" << name << "'";
|
3397
|
r = -ENOENT;
|
3398
|
goto reply;
|
3399
|
}
|
3400
|
const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
|
3401
|
if (f)
|
3402
|
f->open_object_section("profile");
|
3403
|
for (map<string,string>::const_iterator i = profile.begin();
|
3404
|
i != profile.end();
|
3405
|
++i) {
|
3406
|
if (f)
|
3407
|
f->dump_string(i->first.c_str(), i->second.c_str());
|
3408
|
else
|
3409
|
rdata.append(i->first + "=" + i->second + "\n");
|
3410
|
}
|
3411
|
if (f) {
|
3412
|
f->close_section();
|
3413
|
ostringstream rs;
|
3414
|
f->flush(rs);
|
3415
|
rs << "\n";
|
3416
|
rdata.append(rs.str());
|
3417
|
}
|
3418
|
} else {
|
3419
|
|
3420
|
return false;
|
3421
|
}
|
3422
|
|
3423
|
reply:
|
3424
|
string rs;
|
3425
|
getline(ss, rs);
|
3426
|
mon->reply_command(m, r, rs, rdata, get_last_committed());
|
3427
|
return true;
|
3428
|
}
|
3429
|
|
3430
|
void OSDMonitor::update_pool_flags(int64_t pool_id, uint64_t flags)
|
3431
|
{
|
3432
|
const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
|
3433
|
pending_inc.get_new_pool(pool_id, pool)->flags = flags;
|
3434
|
}
|
3435
|
|
3436
|
// Reconcile each pool's FULL flag with its current usage vs. quota.
//
// Walks every pool with PG stats available and:
//  - clears FLAG_FULL (and logs at info) when a pool marked full has
//    dropped back under both its byte and object quotas;
//  - sets FLAG_FULL (and logs at warn) when an unflagged pool has
//    reached either quota.
//
// Returns true if any pool flag was staged in the pending incremental
// (via update_pool_flags), false otherwise. Requires readable PG stats;
// bails out early returning false when the PG monitor is not readable.
bool OSDMonitor::update_pools_status()
{
  // PG stats are the source of truth for usage; without them we cannot
  // make a correct decision, so do nothing this round.
  if (!mon->pgmon()->is_readable())
    return false;

  bool ret = false;

  const map<int64_t,pg_pool_t>& pools = osdmap.get_pools();
  for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin();
       it != pools.end();
       ++it) {
    // Skip pools that have no accumulated PG stats yet.
    if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
      continue;
    pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
    object_stat_sum_t& sum = stats.stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // A quota of 0 means "no quota"; only positive quotas can make a
    // pool full.
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
      // Flag already set and still justified: nothing to do.
      if (pool_is_full)
        continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer full; removing FULL flag";

      // Clear only the FULL bit, preserving all other pool flags.
      update_pool_flags(it->first, pool.get_flags() & ~pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // Flag not set and pool not full: nothing to do.
      if (!pool_is_full)
        continue;

      // Log which quota was hit (bytes checked first, mirroring the
      // pool_is_full computation above).
      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << si_t(pool.quota_max_bytes) << ")";
      } else if (pool.quota_max_objects > 0 &&
                 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      } else {
        // pool_is_full guarantees one of the two branches above fires.
        assert(0 == "we shouldn't reach this");
      }
      // Set the FULL bit, preserving all other pool flags.
      update_pool_flags(it->first, pool.get_flags() | pg_pool_t::FLAG_FULL);
      ret = true;
    }
  }
  return ret;
}
|
3490
|
|
3491
|
// Report per-pool health issues: full pools and quota thresholds crossed.
//
// For every pool with PG stats available, appends:
//  - a HEALTH_WARN entry when the pool carries FLAG_FULL;
//  - a HEALTH_ERR entry when object or byte usage has crossed the
//    critical fraction of the corresponding quota;
//  - a HEALTH_WARN entry when usage has crossed the warning fraction.
// Thresholds come from mon_pool_quota_warn_threshold /
// mon_pool_quota_crit_threshold, configured as percentages; a
// threshold of 0 disables that check. Usage at/over quota itself is
// intentionally not reported here again — it is covered by the FULL
// flag entry above.
//
// @param summary health entries are appended here
// @param detail  if non-NULL, the same entries are appended here too
void OSDMonitor::get_pools_health(
    list<pair<health_status_t,string> >& summary,
    list<pair<health_status_t,string> > *detail) const
{
  // Config values are percentages; convert to fractions once — they are
  // loop-invariant, so hoist them out of the per-pool loop.
  float warn_threshold = g_conf->mon_pool_quota_warn_threshold/100;
  float crit_threshold = g_conf->mon_pool_quota_crit_threshold/100;

  const map<int64_t,pg_pool_t>& pools = osdmap.get_pools();
  for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin();
       it != pools.end(); ++it) {
    // Skip pools that have no accumulated PG stats yet.
    if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
      continue;
    pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
    object_stat_sum_t& sum = stats.stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
      stringstream ss;
      ss << "pool '" << pool_name << "' is full";
      summary.push_back(make_pair(HEALTH_WARN, ss.str()));
      if (detail)
        detail->push_back(make_pair(HEALTH_WARN, ss.str()));
    }

    if (pool.quota_max_objects > 0) {
      stringstream ss;
      health_status_t status = HEALTH_OK;
      if ((uint64_t)sum.num_objects >= pool.quota_max_objects) {
        // At/over quota: already reported via the FULL flag above.
      } else if (crit_threshold > 0 &&
                 sum.num_objects >= pool.quota_max_objects*crit_threshold) {
        ss << "pool '" << pool_name
           << "' has " << sum.num_objects << " objects"
           << " (max " << pool.quota_max_objects << ")";
        status = HEALTH_ERR;
      } else if (warn_threshold > 0 &&
                 sum.num_objects >= pool.quota_max_objects*warn_threshold) {
        ss << "pool '" << pool_name
           << "' has " << sum.num_objects << " objects"
           << " (max " << pool.quota_max_objects << ")";
        status = HEALTH_WARN;
      }
      if (status != HEALTH_OK) {
        pair<health_status_t,string> s(status, ss.str());
        summary.push_back(s);
        if (detail)
          detail->push_back(s);
      }
    }

    if (pool.quota_max_bytes > 0) {
      health_status_t status = HEALTH_OK;
      stringstream ss;
      if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        // At/over quota: already reported via the FULL flag above.
      } else if (crit_threshold > 0 &&
                 sum.num_bytes >= pool.quota_max_bytes*crit_threshold) {
        ss << "pool '" << pool_name
           << "' has " << si_t(sum.num_bytes) << " bytes"
           << " (max " << si_t(pool.quota_max_bytes) << ")";
        status = HEALTH_ERR;
      } else if (warn_threshold > 0 &&
                 sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
        // Fixed: this message previously said "objects" for a byte
        // count; it now matches the critical branch above.
        ss << "pool '" << pool_name
           << "' has " << si_t(sum.num_bytes) << " bytes"
           << " (max " << si_t(pool.quota_max_bytes) << ")";
        status = HEALTH_WARN;
      }
      if (status != HEALTH_OK) {
        pair<health_status_t,string> s(status, ss.str());
        summary.push_back(s);
        if (detail)
          detail->push_back(s);
      }
    }
  }
}
|
3574
|
|
3575
|
|
3576
|
// Create a new replicated pool on behalf of a client MPoolOp request.
//
// The pool is owned by the auid supplied in the message when non-zero,
// otherwise by the auid of the requesting session. Delegates all real
// work to the long-form prepare_new_pool() overload with replicated
// defaults (no explicit pg/pgp counts, no erasure code profile).
//
// @return result of the long-form overload, or -EPERM if the message
//         carries no session
int OSDMonitor::prepare_new_pool(MPoolOp *m)
{
  dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
  MonSession *session = m->get_session();
  if (!session)
    return -EPERM;

  string erasure_code_profile;
  stringstream ss;
  string ruleset_name;
  // Message-supplied auid wins; fall back to the session's identity.
  uint64_t auid = m->auid ? m->auid : session->auid;
  return prepare_new_pool(m->name, auid, m->crush_rule, ruleset_name,
                          0, 0,
                          erasure_code_profile,
                          pg_pool_t::TYPE_REPLICATED, 0, ss);
}
|
3596
|
|
3597
|
int OSDMonitor::crush_rename_bucket(const string& srcname,
|
3598
|
const string& dstname,
|
3599
|
ostream *ss)
|
3600
|
{
|
3601
|
int ret;
|
3602
|
|
3603
|
|
3604
|
|
3605
|
|
3606
|
if (!_have_pending_crush()) {
|
3607
|
ret = _get_stable_crush().can_rename_bucket(srcname,
|
3608
|
dstname,
|
3609
|
ss);
|
3610
|
if (ret)
|
3611
|
return ret;
|
3612
|
}
|
3613
|
|
3614
|
CrushWrapper newcrush;
|
3615
|
_get_pending_crush(newcrush);
|
3616
|
|
3617
|
ret = newcrush.rename_bucket(srcname,
|
3618
|
dstname,
|
3619
|
ss);
|
3620
|
if (ret)
|
3621
|
return ret;
|
3622
|
|
3623
|
pending_inc.crush.clear();
|
3624
|
newcrush.encode(pending_inc.crush);
|
3625
|
*ss << "renamed bucket " << srcname << " into " << dstname;
|
3626
|
return 0;
|
3627
|
}
|
3628
|
|
3629
|
int OSDMonitor::crush_ruleset_create_erasure(const string &name,
|
3630
|
const string &profile,
|
3631
|
int *ruleset,
|
3632
|
stringstream &ss)
|
3633
|
{
|
3634
|
int ruleid = osdmap.crush->get_rule_id(name);
|
3635
|
if (ruleid != -ENOENT) {
|
3636
|
*ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
|
3637
|
return -EEXIST;
|
3638
|
}
|
3639
|
|
3640
|
CrushWrapper newcrush;
|
3641
|
_get_pending_crush(newcrush);
|
3642
|
|
3643
|
ruleid = newcrush.get_rule_id(name);
|
3644
|
if (ruleid != -ENOENT) {
|
3645
|
*ruleset = newcrush.get_rule_mask_ruleset(ruleid);
|
3646
|
return -EALREADY;
|
3647
|
} else {
|
3648
|
ErasureCodeInterfaceRef erasure_code;
|
3649
|
int err = get_erasure_code(profile, &erasure_code, ss);
|
3650
|
if (err) {
|
3651
|
ss << "failed to load plugin using profile " << profile << std::endl;
|
3652
|
return err;
|
3653
|
}
|
3654
|
|
3655
|
err = erasure_code->create_ruleset(name, newcrush, &ss);
|
3656
|
erasure_code.reset();
|
3657
|
if (err < 0)
|
3658
|
return err;
|
3659
|
*ruleset = err;
|
3660
|
pending_inc.crush.clear();
|
3661
|
newcrush.encode(pending_inc.crush);
|
3662
|
return 0;
|
3663
|
}
|
3664
|
}
|
3665
|
|
3666
|
int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
|
3667
|
ErasureCodeInterfaceRef *erasure_code,
|
3668
|
stringstream &ss) const
|
3669
|
{
|
3670
|
if (pending_inc.has_erasure_code_profile(erasure_code_profile))
|
3671
|
return -EAGAIN;
|
3672
|
const map<string,string> &profile =
|
3673
|
osdmap.get_erasure_code_profile(erasure_code_profile);
|
3674
|
map<string,string>::const_iterator plugin =
|
3675
|
profile.find("plugin");
|
3676
|
if (plugin == profile.end()) {
|
3677
|
ss << "cannot determine the erasure code plugin"
|
3678
|
<< " because there is no 'plugin' entry in the erasure_code_profile "
|
3679
|
<< profile << std::endl;
|
3680
|
return -EINVAL;
|
3681
|
}
|
3682
|
ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
|
3683
|
return instance.factory(plugin->second, profile, erasure_code, ss);
|
3684
|
}
|
3685
|
|
3686
|
int OSDMonitor::check_cluster_features(uint64_t features,
|
3687
|
stringstream &ss)
|
3688
|
{
|
3689
|
stringstream unsupported_ss;
|
3690
|
int unsupported_count = 0;
|
3691
|
if ((mon->get_quorum_features() & features) != features) {
|
3692
|
unsupported_ss << "the monitor cluster";
|
3693
|
++unsupported_count;
|
3694
|
}
|
3695
|
|
3696
|
set<int32_t> up_osds;
|
3697
|
osdmap.get_up_osds(up_osds);
|
3698
|
for (set<int32_t>::iterator it = up_osds.begin();
|
3699
|
it != up_osds.end(); ++it) {
|
3700
|
const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
|
3701
|
if ((xi.features & features) != features) {
|
3702
|
if (unsupported_count > 0)
|
3703
|
unsupported_ss << ", ";
|
3704
|
unsupported_ss << "osd." << *it;
|
3705
|
unsupported_count ++;
|
3706
|
}
|
3707
|
}
|
3708
|
|
3709
|
if (unsupported_count > 0) {
|
3710
|
ss << "features " << features << " unsupported by: "
|
3711
|
<< unsupported_ss.str();
|
3712
|
return -ENOTSUP;
|
3713
|
}
|
3714
|
|
3715
|
|
3716
|
for (map<int32_t,osd_xinfo_t>::const_iterator p =
|
3717
|
pending_inc.new_xinfo.begin();
|
3718
|
p != pending_inc.new_xinfo.end(); ++p) {
|
3719
|
const osd_xinfo_t &xi = p->second;
|
3720
|
if ((xi.features & features) != features) {
|
3721
|
dout(10) << __func__ << " pending osd." << p->first
|
3722
|
<< " features are insufficient; retry" << dendl;
|
3723
|
return -EAGAIN;
|
3724
|
}
|
3725
|
}
|
3726
|
|
3727
|
return 0;
|
3728
|
}
|
3729
|
|
3730
|
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
|
3731
|
stringstream& ss)
|
3732
|
{
|
3733
|
OSDMap::Incremental new_pending = pending_inc;
|
3734
|
::encode(*newcrush, new_pending.crush);
|
3735
|
OSDMap newmap;
|
3736
|
newmap.deepish_copy_from(osdmap);
|
3737
|
newmap.apply_incremental(new_pending);
|
3738
|
|
3739
|
uint64_t features =
|
3740
|
newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
|
3741
|
newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
|
3742
|
|
3743
|
stringstream features_ss;
|
3744
|
int r = check_cluster_features(features, features_ss);
|
3745
|
if (!r)
|
3746
|
return true;
|
3747
|
|
3748
|
ss << "Could not change CRUSH: " << features_ss.str();
|
3749
|
return false;
|
3750
|
}
|
3751
|
|
3752
|
bool OSDMonitor::erasure_code_profile_in_use(const map<int64_t, pg_pool_t> &pools,
|
3753
|
const string &profile,
|
3754
|
ostream &ss)
|
3755
|
{
|
3756
|
bool found = false;
|
3757
|
for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
|
3758
|
p != pools.end();
|
3759
|
++p) {
|
3760
|
if (p->second.erasure_code_profile == profile) {
|
3761
|
ss << osdmap.pool_name[p->first] << " ";
|
3762
|
found = true;
|
3763
|
}
|
3764
|
}
|
3765
|
if (found) {
|
3766
|
ss << "pool(s) are using the erasure code profile '" << profile << "'";
|
3767
|
}
|
3768
|
return found;
|
3769
|
}
|
3770
|
|
3771
|
// Build an erasure code profile map from user-supplied "key=value"
// tokens layered over the configured default profile.
//
// Starts from osd_pool_default_erasure_code_profile (parsed as JSON),
// then merges each user token on top. A token without '=' becomes a
// key with an empty value. If the user explicitly chose a different
// plugin than the default, the defaults are discarded entirely and
// only the user-supplied entries are kept (mixing parameters across
// plugins would be meaningless). Finally a "directory" entry is
// defaulted if the user did not provide one.
//
// @param erasure_code_profile user tokens, each "key" or "key=value"
// @param erasure_code_profile_map out: the resulting profile
// @param ss error output from the JSON parse of the default profile
// @return 0 on success, else the get_json_str_map() error
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   stringstream &ss)
{
  // Seed the result with the configured default profile.
  int r = get_json_str_map(g_conf->osd_pool_default_erasure_code_profile,
		      ss,
		      erasure_code_profile_map);
  if (r)
    return r;
  // The default profile is required to name a plugin.
  assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  // Keep the user's entries separately so we can detect a plugin switch.
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // Bare key: record it with an empty value.
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      const string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // A different plugin invalidates the default profile's parameters:
  // replace the merged map with only what the user supplied.
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  // Ensure a plugin search directory is always present.
  if ((*erasure_code_profile_map).count("directory") == 0)
    (*erasure_code_profile_map)["directory"] =
      g_conf->osd_pool_default_erasure_code_directory;

  return 0;
}
|
3808
|
|
3809
|
int OSDMonitor::prepare_pool_size(const unsigned pool_type,
|
3810
|
const string &erasure_code_profile,
|
3811
|
unsigned *size, unsigned *min_size,
|
3812
|
stringstream &ss)
|
3813
|
{
|
3814
|
int err = 0;
|
3815
|
switch (pool_type) {
|
3816
|
case pg_pool_t::TYPE_REPLICATED:
|
3817
|
*size = g_conf->osd_pool_default_size;
|
3818
|
*min_size = g_conf->get_osd_pool_default_min_size();
|
3819
|
break;
|
3820
|
case pg_pool_t::TYPE_ERASURE:
|
3821
|
{
|
3822
|
ErasureCodeInterfaceRef erasure_code;
|
3823
|
err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
|
3824
|
if (err == 0) {
|
3825
|
*size = erasure_code->get_chunk_count();
|
3826
|
*min_size = erasure_code->get_data_chunk_count();
|
3827
|
}
|
3828
|
}
|
3829
|
break;
|
3830
|
default:
|
3831
|
ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
|
3832
|
err = -EINVAL;
|
3833
|
break;
|
3834
|
}
|
3835
|
return err;
|
3836
|
}
|
3837
|
|
3838
|
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
|
3839
|
const string &erasure_code_profile,
|
3840
|
uint32_t *stripe_width,
|
3841
|
stringstream &ss)
|
3842
|
{
|
3843
|
int err = 0;
|
3844
|
switch (pool_type) {
|
3845
|
case pg_pool_t::TYPE_REPLICATED:
|
3846
|
|
3847
|
break;
|
3848
|
case pg_pool_t::TYPE_ERASURE:
|
3849
|
{
|
3850
|
ErasureCodeInterfaceRef erasure_code;
|
3851
|
err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
|
3852
|
uint32_t desired_stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
|
3853
|
if (err == 0)
|
3854
|
*stripe_width = erasure_code->get_data_chunk_count() *
|
3855
|
erasure_code->get_chunk_size(desired_stripe_width);
|
3856
|
}
|
3857
|
break;
|
3858
|
default:
|
3859
|
ss << "prepare_pool_stripe_width: "
|
3860
|
<< pool_type << " is not a known pool type";
|
3861
|
err = -EINVAL;
|
3862
|
break;
|
3863
|
}
|
3864
|
return err;
|
3865
|
}
|
3866
|
|
3867
|
/**
 * Resolve (or create) the CRUSH ruleset for a new pool.
 *
 * If *crush_ruleset is already >= 0 the caller picked an explicit
 * ruleset and we only verify it exists in the committed map.
 * Otherwise the ruleset is derived from the pool type:
 *  - replicated: the configured default ruleset (or look up
 *    ruleset_name when one was given);
 *  - erasure: create a ruleset from the erasure-code profile; creation
 *    is asynchronous, so -EAGAIN is returned until it has committed.
 *
 * @return 0 on success (with *crush_ruleset set), -EAGAIN when the
 *         caller must retry after the pending map commits, or another
 *         negative errno (message in `ss`).
 */
int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
					   const string &erasure_code_profile,
					   const string &ruleset_name,
					   int *crush_ruleset,
					   stringstream &ss)
{
  if (*crush_ruleset < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (ruleset_name == "") {
	  // no name given: fall back to the configured default replicated ruleset
	  *crush_ruleset = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(g_ceph_context);
	  if (*crush_ruleset < 0) {
	    // the default lookup itself failed; propagate its error code
	    ss << "No suitable CRUSH ruleset exists";
	    return *crush_ruleset;
	  }
	} else {
	  return get_crush_ruleset(ruleset_name, crush_ruleset, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_ruleset_create_erasure(ruleset_name,
					       erasure_code_profile,
					       crush_ruleset, ss);
	switch (err) {
	case -EALREADY:
	  // creation already pending: deliberate fallthrough into the
	  // freshly-created case — both must wait for the proposal
	  dout(20) << "prepare_pool_crush_ruleset: ruleset "
		   << ruleset_name << " try again" << dendl;
	case 0:
	  // caller retries once the new ruleset has been committed
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // ruleset is already committed; usable right away
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      ss << "prepare_pool_crush_ruleset: " << pool_type
	 << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // explicit ruleset id supplied by the caller: just validate it
    if (!osdmap.crush->ruleset_exists(*crush_ruleset)) {
      ss << "CRUSH ruleset " << *crush_ruleset << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
|
3925
|
|
3926
|
int OSDMonitor::get_crush_ruleset(const string &ruleset_name,
|
3927
|
int *crush_ruleset,
|
3928
|
stringstream &ss)
|
3929
|
{
|
3930
|
int ret;
|
3931
|
ret = osdmap.crush->get_rule_id(ruleset_name);
|
3932
|
if (ret != -ENOENT) {
|
3933
|
|
3934
|
*crush_ruleset = ret;
|
3935
|
} else {
|
3936
|
CrushWrapper newcrush;
|
3937
|
_get_pending_crush(newcrush);
|
3938
|
|
3939
|
ret = newcrush.get_rule_id(ruleset_name);
|
3940
|
if (ret != -ENOENT) {
|
3941
|
|
3942
|
dout(20) << __func__ << ": ruleset " << ruleset_name
|
3943
|
<< " try again" << dendl;
|
3944
|
return -EAGAIN;
|
3945
|
} else {
|
3946
|
|
3947
|
ss << "specified ruleset " << ruleset_name << " doesn't exist";
|
3948
|
return ret;
|
3949
|
}
|
3950
|
}
|
3951
|
return 0;
|
3952
|
}
|
3953
|
|
3954
|
|
3955
|
|
3956
|
|
3957
|
|
3958
|
|
3959
|
|
3960
|
|
3961
|
|
3962
|
|
3963
|
|
3964
|
|
3965
|
|
3966
|
|
3967
|
|
3968
|
/**
 * Stage creation of a new pool in pending_inc.
 *
 * Resolves the crush ruleset, size/min_size and stripe width for the
 * requested pool type, allocates a new pool id and fills a pg_pool_t
 * with configured defaults.  Nothing is committed here; the pool only
 * exists once the pending incremental is proposed and accepted.
 *
 * @param name pool name (must be non-empty)
 * @param auid owning auid for the new pool
 * @param crush_ruleset explicit ruleset id, or -1 to derive one
 * @param crush_ruleset_name ruleset name to look up/create when id is -1
 * @param pg_num / pgp_num requested counts; 0 means "use config default"
 * @param erasure_code_profile profile for erasure pools
 * @param pool_type pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
 * @param expected_num_objects pre-split hint for filestore
 * @param ss error message on failure
 * @return 0 on success (including the already-pending case),
 *         -EAGAIN if a crush ruleset is still being created,
 *         other negative errno on error.
 */
int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
				 int crush_ruleset,
				 const string &crush_ruleset_name,
                                 unsigned pg_num, unsigned pgp_num,
				 const string &erasure_code_profile,
                                 const unsigned pool_type,
                                 const uint64_t expected_num_objects,
				 stringstream &ss)
{
  if (name.length() == 0)
    return -EINVAL;
  int r;
  r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile,
				 crush_ruleset_name, &crush_ruleset, ss);
  if (r)
    return r;
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
  if (r)
    return r;
  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r)
    return r;

  // idempotent: if this name is already queued for creation in the
  // pending incremental, report success without allocating a new id
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // initialize the pending pool_max from the committed map on first use
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->type = pool_type;
  // start from the configured default flag mask, then apply the
  // individual default-flag toggles
  pi->flags = g_conf->osd_pool_default_flags;
  if (g_conf->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_ruleset = crush_ruleset;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // 0 means "use the configured default" for pg/pgp counts
  pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
  pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
  pi->last_change = pending_inc.epoch;
  pi->auid = auid;
  pi->erasure_code_profile = erasure_code_profile;
  pi->stripe_width = stripe_width;
  // cache-tier defaults (ratios are stored in micro units)
  pi->cache_target_dirty_ratio_micro =
    g_conf->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf->osd_pool_default_cache_min_evict_age;
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
|
4036
|
|
4037
|
// Stage setting a cluster-wide osdmap flag and queue the command reply
// for after the proposal commits.  Always returns true (the update is
// accepted into pending_inc).
bool OSDMonitor::prepare_set_flag(MMonCommand *m, int flag)
{
  // lazily seed pending flags from the committed map
  if (pending_inc.new_flags < 0)
    pending_inc.new_flags = osdmap.get_flags();
  pending_inc.new_flags |= flag;

  ostringstream msg;
  msg << "set " << OSDMap::get_flag_string(flag);
  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, msg.str(),
						    get_last_committed() + 1));
  return true;
}
|
4048
|
|
4049
|
// Stage clearing a cluster-wide osdmap flag and queue the command
// reply for after the proposal commits.  Always returns true.
bool OSDMonitor::prepare_unset_flag(MMonCommand *m, int flag)
{
  // lazily seed pending flags from the committed map
  if (pending_inc.new_flags < 0)
    pending_inc.new_flags = osdmap.get_flags();
  pending_inc.new_flags &= ~flag;

  ostringstream msg;
  msg << "unset " << OSDMap::get_flag_string(flag);
  wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, msg.str(),
						    get_last_committed() + 1));
  return true;
}
|
4060
|
|
4061
|
int OSDMonitor::parse_osd_id(const char *s, stringstream *pss)
|
4062
|
{
|
4063
|
|
4064
|
if (strncmp(s, "osd.", 4) == 0) {
|
4065
|
s += 4;
|
4066
|
}
|
4067
|
|
4068
|
|
4069
|
ostringstream ss;
|
4070
|
long id = parse_pos_long(s, &ss);
|
4071
|
if (id < 0) {
|
4072
|
*pss << ss.str();
|
4073
|
return id;
|
4074
|
}
|
4075
|
if (id > 0xffff) {
|
4076
|
*pss << "osd id " << id << " is too large";
|
4077
|
return -ERANGE;
|
4078
|
}
|
4079
|
return id;
|
4080
|
}
|
4081
|
|
4082
|
|
4083
|
|
4084
|
|
4085
|
|
4086
|
|
4087
|
|
4088
|
int OSDMonitor::set_crash_replay_interval(const int64_t pool_id, const uint32_t cri)
|
4089
|
{
|
4090
|
pg_pool_t p;
|
4091
|
if (pending_inc.new_pools.count(pool_id)) {
|
4092
|
p = pending_inc.new_pools[pool_id];
|
4093
|
} else {
|
4094
|
const pg_pool_t *p_ptr = osdmap.get_pg_pool(pool_id);
|
4095
|
if (p_ptr == NULL) {
|
4096
|
return -ENOENT;
|
4097
|
} else {
|
4098
|
p = *p_ptr;
|
4099
|
}
|
4100
|
}
|
4101
|
|
4102
|
dout(10) << "Set pool " << pool_id << " crash_replay_interval=" << cri << dendl;
|
4103
|
p.crash_replay_interval = cri;
|
4104
|
p.last_change = pending_inc.epoch;
|
4105
|
pending_inc.new_pools[pool_id] = p;
|
4106
|
|
4107
|
return 0;
|
4108
|
}
|
4109
|
|
4110
|
|
4111
|
/**
 * Apply an "osd pool set <pool> <var> <val>" command to a pool,
 * staging the updated pg_pool_t in pending_inc.new_pools.
 *
 * @param cmdmap parsed command arguments: "pool", "var", "val", and
 *               optionally "force"
 * @param ss     human-readable success/error message
 * @return 0 on success, negative errno on failure
 *
 * Fixes vs. previous revision:
 *  - the min_size/erasure error path streamed the stringstream object
 *    itself (`<< tmp`), producing garbage instead of the plugin error
 *    text; now uses tmp.str()
 *  - removed a stray double semicolon and initialized `k`
 */
int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(g_ceph_context, cmdmap, "var", var);

  // start from the pending copy if one exists so multiple sets in the
  // same proposal window compose
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // "val" normally arrives as a string; parse integer and float
  // interpretations eagerly and let each variable consume whichever it
  // needs (interr/floaterr record which parses failed).  An older mon
  // forwarding the command may send a bare int instead of a string.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0;  // f scaled to micro units
  if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
    if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
      return -EINVAL;
  } else {
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // cache-tier-only variables are rejected on non-tier pools
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
    ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      // EC pool size is dictated by the erasure-code profile
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    p.size = n;
    // keep min_size consistent when size shrinks below it
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // for EC pools the floor is the number of data chunks
      ErasureCodeInterfaceRef erasure_code;
      int k = 0;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	// BUGFIX: stream the message text, not the stream object
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "auid") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.auid = n;
  } else if (var == "crash_replay_interval") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.crash_replay_interval = n;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= (int)p.get_pg_num()) {
      // pg_num can only grow; an equal value is a no-op success
      ss << "specified pg_num " << n << " <= current " << p.get_pg_num();
      if (n < (int)p.get_pg_num())
	return -EEXIST;
      return 0;
    }
    string force;
    cmd_getval(g_ceph_context,cmdmap, "force", force);
    if (p.cache_mode != pg_pool_t::CACHEMODE_NONE &&
	force != "--yes-i-really-mean-it") {
      ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
      return -EPERM;
    }
    // limit the number of new PGs created per OSD by one command
    int expected_osds = MAX(1, MIN(p.get_pg_num(), osdmap.get_num_osds()));
    int64_t new_pgs = n - p.get_pg_num();
    int64_t pgs_per_osd = new_pgs / expected_osds;
    if (pgs_per_osd > g_conf->mon_osd_max_split_count) {
      ss << "specified pg_num " << n << " is too large (creating "
	 << new_pgs << " new PGs on ~" << expected_osds
	 << " OSDs exceeds per-OSD max of " << g_conf->mon_osd_max_split_count
	 << ')';
      return -E2BIG;
    }
    // don't split while PGs of this pool are still being created
    for(set<pg_t>::iterator i = mon->pgmon()->pg_map.creating_pgs.begin();
	i != mon->pgmon()->pg_map.creating_pgs.end();
	++i) {
      if (i->m_pool == static_cast<uint64_t>(pool)) {
	ss << "currently creating pgs, wait";
	return -EBUSY;
      }
    }
    p.set_pg_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    // don't change placement while PGs of this pool are being created
    for(set<pg_t>::iterator i = mon->pgmon()->pg_map.creating_pgs.begin();
	i != mon->pgmon()->pg_map.creating_pgs.end();
	++i) {
      if (i->m_pool == static_cast<uint64_t>(pool)) {
	ss << "currently creating pgs, wait";
	return -EBUSY;
      }
    }
    p.set_pgp_num(n);
  } else if (var == "crush_ruleset") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (!osdmap.crush->ruleset_exists(n)) {
      ss << "crush ruleset " << n << " does not exist";
      return -ENOENT;
    }
    p.crush_ruleset = n;
  } else if (var == "hashpspool" || var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange") {
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // boolean flags accept true/false or 1/0
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      // hit sets require cache-pool support cluster-wide
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "debug_fake_ec_pool") {
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_DEBUG_FAKE_EC_POOL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "write_fadvise_dontneed") {
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.flags &= ~pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  ss << "set pool " << pool << " " << var << " to " << val;
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
|
4430
|
|
4431
|
// Entry point for write ("prepare") commands: parse the JSON command,
// reject sessionless requests, and dispatch to prepare_command_impl.
// Returns true when the message was fully handled (including error
// replies).
bool OSDMonitor::prepare_command(MMonCommand *m)
{
  map<string, cmd_vartype> cmdmap;
  stringstream parse_err;
  if (!cmdmap_from_json(m->cmd, &cmdmap, parse_err)) {
    // malformed command; reply immediately
    mon->reply_command(m, -EINVAL, parse_err.str(), get_last_committed());
    return true;
  }

  if (m->get_session() == NULL) {
    mon->reply_command(m, -EACCES, "access denied", get_last_committed());
    return true;
  }

  return prepare_command_impl(m, cmdmap);
}
|
4449
|
|
4450
|
bool OSDMonitor::prepare_command_impl(MMonCommand *m,
|
4451
|
map<string,cmd_vartype> &cmdmap)
|
4452
|
{
|
4453
|
bool ret = false;
|
4454
|
stringstream ss;
|
4455
|
string rs;
|
4456
|
bufferlist rdata;
|
4457
|
int err = 0;
|
4458
|
|
4459
|
string format;
|
4460
|
cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
|
4461
|
boost::scoped_ptr<Formatter> f(Formatter::create(format));
|
4462
|
|
4463
|
string prefix;
|
4464
|
cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
|
4465
|
|
4466
|
int64_t osdid;
|
4467
|
string name;
|
4468
|
bool osdid_present = cmd_getval(g_ceph_context, cmdmap, "id", osdid);
|
4469
|
if (osdid_present) {
|
4470
|
ostringstream oss;
|
4471
|
oss << "osd." << osdid;
|
4472
|
name = oss.str();
|
4473
|
}
|
4474
|
|
4475
|
|
4476
|
|
4477
|
|
4478
|
|
4479
|
|
4480
|
|
4481
|
|
4482
|
|
4483
|
|
4484
|
|
4485
|
|
4486
|
|
4487
|
|
4488
|
|
4489
|
|
4490
|
|
4491
|
|
4492
|
|
4493
|
|
4494
|
|
4495
|
|
4496
|
|
4497
|
|
4498
|
|
4499
|
|
4500
|
|
4501
|
|
4502
|
|
4503
|
|
4504
|
|
4505
|
if (prefix == "osd setcrushmap" ||
|
4506
|
(prefix == "osd crush set" && !osdid_present)) {
|
4507
|
dout(10) << "prepare_command setting new crush map" << dendl;
|
4508
|
bufferlist data(m->get_data());
|
4509
|
CrushWrapper crush;
|
4510
|
try {
|
4511
|
bufferlist::iterator bl(data.begin());
|
4512
|
crush.decode(bl);
|
4513
|
}
|
4514
|
catch (const std::exception &e) {
|
4515
|
err = -EINVAL;
|
4516
|
ss << "Failed to parse crushmap: " << e.what();
|
4517
|
goto reply;
|
4518
|
}
|
4519
|
|
4520
|
if (!validate_crush_against_features(&crush, ss)) {
|
4521
|
err = -EINVAL;
|
4522
|
goto reply;
|
4523
|
}
|
4524
|
|
4525
|
|
4526
|
dout(10) << " testing map" << dendl;
|
4527
|
stringstream ess;
|
4528
|
CrushTester tester(crush, ess);
|
4529
|
int r = tester.test_with_crushtool(g_conf->crushtool,
|
4530
|
g_conf->mon_lease);
|
4531
|
if (r < 0) {
|
4532
|
if (r == -EINTR) {
|
4533
|
ss << "(note: crushtool tests not run because they took too long) ";
|
4534
|
} else {
|
4535
|
derr << "error on crush map: " << ess.str() << dendl;
|
4536
|
ss << "Failed to parse crushmap: " << ess.str();
|
4537
|
err = r;
|
4538
|
goto reply;
|
4539
|
}
|
4540
|
}
|
4541
|
|
4542
|
dout(10) << " result " << ess.str() << dendl;
|
4543
|
|
4544
|
pending_inc.crush = data;
|
4545
|
ss << "set crush map";
|
4546
|
goto update;
|
4547
|
} else if (prefix == "osd crush add-bucket") {
|
4548
|
|
4549
|
string name, typestr;
|
4550
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
4551
|
cmd_getval(g_ceph_context, cmdmap, "type", typestr);
|
4552
|
|
4553
|
if (!_have_pending_crush() &&
|
4554
|
_get_stable_crush().name_exists(name)) {
|
4555
|
ss << "bucket '" << name << "' already exists";
|
4556
|
goto reply;
|
4557
|
}
|
4558
|
|
4559
|
CrushWrapper newcrush;
|
4560
|
_get_pending_crush(newcrush);
|
4561
|
|
4562
|
if (newcrush.name_exists(name)) {
|
4563
|
ss << "bucket '" << name << "' already exists";
|
4564
|
goto update;
|
4565
|
}
|
4566
|
int type = newcrush.get_type_id(typestr);
|
4567
|
if (type < 0) {
|
4568
|
ss << "type '" << typestr << "' does not exist";
|
4569
|
err = -EINVAL;
|
4570
|
goto reply;
|
4571
|
}
|
4572
|
if (type == 0) {
|
4573
|
ss << "type '" << typestr << "' is for devices, not buckets";
|
4574
|
err = -EINVAL;
|
4575
|
goto reply;
|
4576
|
}
|
4577
|
int bucketno;
|
4578
|
err = newcrush.add_bucket(0, 0,
|
4579
|
CRUSH_HASH_DEFAULT, type, 0, NULL,
|
4580
|
NULL, &bucketno);
|
4581
|
if (err < 0) {
|
4582
|
ss << "add_bucket error: '" << cpp_strerror(err) << "'";
|
4583
|
goto reply;
|
4584
|
}
|
4585
|
err = newcrush.set_item_name(bucketno, name);
|
4586
|
if (err < 0) {
|
4587
|
ss << "error setting bucket name to '" << name << "'";
|
4588
|
goto reply;
|
4589
|
}
|
4590
|
|
4591
|
pending_inc.crush.clear();
|
4592
|
newcrush.encode(pending_inc.crush);
|
4593
|
ss << "added bucket " << name << " type " << typestr
|
4594
|
<< " to crush map";
|
4595
|
goto update;
|
4596
|
} else if (prefix == "osd crush rename-bucket") {
|
4597
|
string srcname, dstname;
|
4598
|
cmd_getval(g_ceph_context, cmdmap, "srcname", srcname);
|
4599
|
cmd_getval(g_ceph_context, cmdmap, "dstname", dstname);
|
4600
|
|
4601
|
err = crush_rename_bucket(srcname, dstname, &ss);
|
4602
|
if (err == -EALREADY)
|
4603
|
err = 0;
|
4604
|
if (err)
|
4605
|
goto reply;
|
4606
|
else
|
4607
|
goto update;
|
4608
|
} else if (osdid_present &&
|
4609
|
(prefix == "osd crush set" || prefix == "osd crush add")) {
|
4610
|
|
4611
|
|
4612
|
|
4613
|
|
4614
|
if (!osdmap.exists(osdid)) {
|
4615
|
err = -ENOENT;
|
4616
|
ss << name << " does not exist. create it before updating the crush map";
|
4617
|
goto reply;
|
4618
|
}
|
4619
|
|
4620
|
double weight;
|
4621
|
if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
|
4622
|
ss << "unable to parse weight value '"
|
4623
|
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
|
4624
|
err = -EINVAL;
|
4625
|
goto reply;
|
4626
|
}
|
4627
|
|
4628
|
string args;
|
4629
|
vector<string> argvec;
|
4630
|
cmd_getval(g_ceph_context, cmdmap, "args", argvec);
|
4631
|
map<string,string> loc;
|
4632
|
CrushWrapper::parse_loc_map(argvec, &loc);
|
4633
|
|
4634
|
if (prefix == "osd crush set"
|
4635
|
&& !_get_stable_crush().item_exists(osdid)) {
|
4636
|
err = -ENOENT;
|
4637
|
ss << "unable to set item id " << osdid << " name '" << name
|
4638
|
<< "' weight " << weight << " at location " << loc
|
4639
|
<< ": does not exist";
|
4640
|
goto reply;
|
4641
|
}
|
4642
|
|
4643
|
dout(5) << "adding/updating crush item id " << osdid << " name '"
|
4644
|
<< name << "' weight " << weight << " at location "
|
4645
|
<< loc << dendl;
|
4646
|
CrushWrapper newcrush;
|
4647
|
_get_pending_crush(newcrush);
|
4648
|
|
4649
|
string action;
|
4650
|
if (prefix == "osd crush set" ||
|
4651
|
newcrush.check_item_loc(g_ceph_context, osdid, loc, (int *)NULL)) {
|
4652
|
action = "set";
|
4653
|
err = newcrush.update_item(g_ceph_context, osdid, weight, name, loc);
|
4654
|
} else {
|
4655
|
action = "add";
|
4656
|
err = newcrush.insert_item(g_ceph_context, osdid, weight, name, loc);
|
4657
|
if (err == 0)
|
4658
|
err = 1;
|
4659
|
}
|
4660
|
|
4661
|
if (err < 0)
|
4662
|
goto reply;
|
4663
|
|
4664
|
if (err == 0 && !_have_pending_crush()) {
|
4665
|
ss << action << " item id " << osdid << " name '" << name << "' weight "
|
4666
|
<< weight << " at location " << loc << ": no change";
|
4667
|
goto reply;
|
4668
|
}
|
4669
|
|
4670
|
pending_inc.crush.clear();
|
4671
|
newcrush.encode(pending_inc.crush);
|
4672
|
ss << action << " item id " << osdid << " name '" << name << "' weight "
|
4673
|
<< weight << " at location " << loc << " to crush map";
|
4674
|
getline(ss, rs);
|
4675
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4676
|
get_last_committed() + 1));
|
4677
|
return true;
|
4678
|
|
4679
|
} else if (prefix == "osd crush create-or-move") {
|
4680
|
do {
|
4681
|
|
4682
|
if (!osdmap.exists(osdid)) {
|
4683
|
err = -ENOENT;
|
4684
|
ss << name << " does not exist. create it before updating the crush map";
|
4685
|
goto reply;
|
4686
|
}
|
4687
|
|
4688
|
double weight;
|
4689
|
if (!cmd_getval(g_ceph_context, cmdmap, "weight", weight)) {
|
4690
|
ss << "unable to parse weight value '"
|
4691
|
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
|
4692
|
err = -EINVAL;
|
4693
|
goto reply;
|
4694
|
}
|
4695
|
|
4696
|
string args;
|
4697
|
vector<string> argvec;
|
4698
|
cmd_getval(g_ceph_context, cmdmap, "args", argvec);
|
4699
|
map<string,string> loc;
|
4700
|
CrushWrapper::parse_loc_map(argvec, &loc);
|
4701
|
|
4702
|
dout(0) << "create-or-move crush item name '" << name << "' initial_weight " << weight
|
4703
|
<< " at location " << loc << dendl;
|
4704
|
|
4705
|
CrushWrapper newcrush;
|
4706
|
_get_pending_crush(newcrush);
|
4707
|
|
4708
|
err = newcrush.create_or_move_item(g_ceph_context, osdid, weight, name, loc);
|
4709
|
if (err == 0) {
|
4710
|
ss << "create-or-move updated item name '" << name << "' weight " << weight
|
4711
|
<< " at location " << loc << " to crush map";
|
4712
|
break;
|
4713
|
}
|
4714
|
if (err > 0) {
|
4715
|
pending_inc.crush.clear();
|
4716
|
newcrush.encode(pending_inc.crush);
|
4717
|
ss << "create-or-move updating item name '" << name << "' weight " << weight
|
4718
|
<< " at location " << loc << " to crush map";
|
4719
|
getline(ss, rs);
|
4720
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4721
|
get_last_committed() + 1));
|
4722
|
return true;
|
4723
|
}
|
4724
|
} while (false);
|
4725
|
|
4726
|
} else if (prefix == "osd crush move") {
|
4727
|
do {
|
4728
|
|
4729
|
|
4730
|
string args;
|
4731
|
vector<string> argvec;
|
4732
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
4733
|
cmd_getval(g_ceph_context, cmdmap, "args", argvec);
|
4734
|
map<string,string> loc;
|
4735
|
CrushWrapper::parse_loc_map(argvec, &loc);
|
4736
|
|
4737
|
dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
|
4738
|
CrushWrapper newcrush;
|
4739
|
_get_pending_crush(newcrush);
|
4740
|
|
4741
|
if (!newcrush.name_exists(name)) {
|
4742
|
err = -ENOENT;
|
4743
|
ss << "item " << name << " does not exist";
|
4744
|
break;
|
4745
|
}
|
4746
|
int id = newcrush.get_item_id(name);
|
4747
|
|
4748
|
if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
|
4749
|
err = newcrush.move_bucket(g_ceph_context, id, loc);
|
4750
|
if (err >= 0) {
|
4751
|
ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
|
4752
|
pending_inc.crush.clear();
|
4753
|
newcrush.encode(pending_inc.crush);
|
4754
|
getline(ss, rs);
|
4755
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4756
|
get_last_committed() + 1));
|
4757
|
return true;
|
4758
|
}
|
4759
|
} else {
|
4760
|
ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
|
4761
|
err = 0;
|
4762
|
}
|
4763
|
} while (false);
|
4764
|
|
4765
|
} else if (prefix == "osd crush link") {
|
4766
|
|
4767
|
string name;
|
4768
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
4769
|
vector<string> argvec;
|
4770
|
cmd_getval(g_ceph_context, cmdmap, "args", argvec);
|
4771
|
map<string,string> loc;
|
4772
|
CrushWrapper::parse_loc_map(argvec, &loc);
|
4773
|
|
4774
|
|
4775
|
|
4776
|
int id = osdmap.crush->get_item_id(name);
|
4777
|
if (!osdmap.crush->name_exists(name)) {
|
4778
|
err = -ENOENT;
|
4779
|
ss << "item " << name << " does not exist";
|
4780
|
goto reply;
|
4781
|
} else {
|
4782
|
dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
|
4783
|
}
|
4784
|
if (osdmap.crush->check_item_loc(g_ceph_context, id, loc, (int*) NULL)) {
|
4785
|
ss << "no need to move item id " << id << " name '" << name
|
4786
|
<< "' to location " << loc << " in crush map";
|
4787
|
err = 0;
|
4788
|
goto reply;
|
4789
|
}
|
4790
|
|
4791
|
dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
|
4792
|
CrushWrapper newcrush;
|
4793
|
_get_pending_crush(newcrush);
|
4794
|
|
4795
|
if (!newcrush.name_exists(name)) {
|
4796
|
err = -ENOENT;
|
4797
|
ss << "item " << name << " does not exist";
|
4798
|
} else {
|
4799
|
int id = newcrush.get_item_id(name);
|
4800
|
if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
|
4801
|
err = newcrush.link_bucket(g_ceph_context, id, loc);
|
4802
|
if (err >= 0) {
|
4803
|
ss << "linked item id " << id << " name '" << name
|
4804
|
<< "' to location " << loc << " in crush map";
|
4805
|
pending_inc.crush.clear();
|
4806
|
newcrush.encode(pending_inc.crush);
|
4807
|
} else {
|
4808
|
ss << "cannot link item id " << id << " name '" << name
|
4809
|
<< "' to location " << loc;
|
4810
|
}
|
4811
|
} else {
|
4812
|
ss << "no need to move item id " << id << " name '" << name
|
4813
|
<< "' to location " << loc << " in crush map";
|
4814
|
err = 0;
|
4815
|
}
|
4816
|
}
|
4817
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, err, ss.str(),
|
4818
|
get_last_committed() + 1));
|
4819
|
return true;
|
4820
|
} else if (prefix == "osd crush rm" ||
|
4821
|
prefix == "osd crush remove" ||
|
4822
|
prefix == "osd crush unlink") {
|
4823
|
do {
|
4824
|
|
4825
|
CrushWrapper newcrush;
|
4826
|
_get_pending_crush(newcrush);
|
4827
|
|
4828
|
string name;
|
4829
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
4830
|
|
4831
|
if (!osdmap.crush->name_exists(name)) {
|
4832
|
err = 0;
|
4833
|
ss << "device '" << name << "' does not appear in the crush map";
|
4834
|
break;
|
4835
|
}
|
4836
|
if (!newcrush.name_exists(name)) {
|
4837
|
err = 0;
|
4838
|
ss << "device '" << name << "' does not appear in the crush map";
|
4839
|
getline(ss, rs);
|
4840
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4841
|
get_last_committed() + 1));
|
4842
|
return true;
|
4843
|
}
|
4844
|
int id = newcrush.get_item_id(name);
|
4845
|
bool unlink_only = prefix == "osd crush unlink";
|
4846
|
string ancestor_str;
|
4847
|
if (cmd_getval(g_ceph_context, cmdmap, "ancestor", ancestor_str)) {
|
4848
|
if (!newcrush.name_exists(ancestor_str)) {
|
4849
|
err = -ENOENT;
|
4850
|
ss << "ancestor item '" << ancestor_str
|
4851
|
<< "' does not appear in the crush map";
|
4852
|
break;
|
4853
|
}
|
4854
|
int ancestor = newcrush.get_item_id(ancestor_str);
|
4855
|
err = newcrush.remove_item_under(g_ceph_context, id, ancestor,
|
4856
|
unlink_only);
|
4857
|
} else {
|
4858
|
err = newcrush.remove_item(g_ceph_context, id, unlink_only);
|
4859
|
}
|
4860
|
if (err == -ENOENT) {
|
4861
|
ss << "item " << id << " does not appear in that position";
|
4862
|
err = 0;
|
4863
|
break;
|
4864
|
}
|
4865
|
if (err == 0) {
|
4866
|
pending_inc.crush.clear();
|
4867
|
newcrush.encode(pending_inc.crush);
|
4868
|
ss << "removed item id " << id << " name '" << name << "' from crush map";
|
4869
|
getline(ss, rs);
|
4870
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4871
|
get_last_committed() + 1));
|
4872
|
return true;
|
4873
|
}
|
4874
|
} while (false);
|
4875
|
|
4876
|
} else if (prefix == "osd crush reweight-all") {
|
4877
|
|
4878
|
CrushWrapper newcrush;
|
4879
|
_get_pending_crush(newcrush);
|
4880
|
|
4881
|
newcrush.reweight(g_ceph_context);
|
4882
|
pending_inc.crush.clear();
|
4883
|
newcrush.encode(pending_inc.crush);
|
4884
|
ss << "reweighted crush hierarchy";
|
4885
|
getline(ss, rs);
|
4886
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4887
|
get_last_committed() + 1));
|
4888
|
return true;
|
4889
|
} else if (prefix == "osd crush reweight") {
|
4890
|
|
4891
|
CrushWrapper newcrush;
|
4892
|
_get_pending_crush(newcrush);
|
4893
|
|
4894
|
string name;
|
4895
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
4896
|
if (!newcrush.name_exists(name)) {
|
4897
|
err = -ENOENT;
|
4898
|
ss << "device '" << name << "' does not appear in the crush map";
|
4899
|
goto reply;
|
4900
|
}
|
4901
|
|
4902
|
int id = newcrush.get_item_id(name);
|
4903
|
if (id < 0) {
|
4904
|
ss << "device '" << name << "' is not a leaf in the crush map";
|
4905
|
err = -EINVAL;
|
4906
|
goto reply;
|
4907
|
}
|
4908
|
double w;
|
4909
|
if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
|
4910
|
ss << "unable to parse weight value '"
|
4911
|
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
|
4912
|
err = -EINVAL;
|
4913
|
goto reply;
|
4914
|
}
|
4915
|
|
4916
|
err = newcrush.adjust_item_weightf(g_ceph_context, id, w);
|
4917
|
if (err < 0)
|
4918
|
goto reply;
|
4919
|
pending_inc.crush.clear();
|
4920
|
newcrush.encode(pending_inc.crush);
|
4921
|
ss << "reweighted item id " << id << " name '" << name << "' to " << w
|
4922
|
<< " in crush map";
|
4923
|
getline(ss, rs);
|
4924
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4925
|
get_last_committed() + 1));
|
4926
|
return true;
|
4927
|
} else if (prefix == "osd crush reweight-subtree") {
|
4928
|
|
4929
|
CrushWrapper newcrush;
|
4930
|
_get_pending_crush(newcrush);
|
4931
|
|
4932
|
string name;
|
4933
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
4934
|
if (!newcrush.name_exists(name)) {
|
4935
|
err = -ENOENT;
|
4936
|
ss << "device '" << name << "' does not appear in the crush map";
|
4937
|
goto reply;
|
4938
|
}
|
4939
|
|
4940
|
int id = newcrush.get_item_id(name);
|
4941
|
if (id >= 0) {
|
4942
|
ss << "device '" << name << "' is not a subtree in the crush map";
|
4943
|
err = -EINVAL;
|
4944
|
goto reply;
|
4945
|
}
|
4946
|
double w;
|
4947
|
if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
|
4948
|
ss << "unable to parse weight value '"
|
4949
|
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
|
4950
|
err = -EINVAL;
|
4951
|
goto reply;
|
4952
|
}
|
4953
|
|
4954
|
err = newcrush.adjust_subtree_weightf(g_ceph_context, id, w);
|
4955
|
if (err < 0)
|
4956
|
goto reply;
|
4957
|
pending_inc.crush.clear();
|
4958
|
newcrush.encode(pending_inc.crush);
|
4959
|
ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
|
4960
|
<< " in crush map";
|
4961
|
getline(ss, rs);
|
4962
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
4963
|
get_last_committed() + 1));
|
4964
|
return true;
|
4965
|
} else if (prefix == "osd crush tunables") {
|
4966
|
CrushWrapper newcrush;
|
4967
|
_get_pending_crush(newcrush);
|
4968
|
|
4969
|
err = 0;
|
4970
|
string profile;
|
4971
|
cmd_getval(g_ceph_context, cmdmap, "profile", profile);
|
4972
|
if (profile == "legacy" || profile == "argonaut") {
|
4973
|
newcrush.set_tunables_legacy();
|
4974
|
} else if (profile == "bobtail") {
|
4975
|
newcrush.set_tunables_bobtail();
|
4976
|
} else if (profile == "firefly") {
|
4977
|
newcrush.set_tunables_firefly();
|
4978
|
} else if (profile == "hammer") {
|
4979
|
newcrush.set_tunables_hammer();
|
4980
|
} else if (profile == "optimal") {
|
4981
|
newcrush.set_tunables_optimal();
|
4982
|
} else if (profile == "default") {
|
4983
|
newcrush.set_tunables_default();
|
4984
|
} else {
|
4985
|
ss << "unrecognized profile '" << profile << "'";
|
4986
|
err = -EINVAL;
|
4987
|
goto reply;
|
4988
|
}
|
4989
|
|
4990
|
if (!validate_crush_against_features(&newcrush, ss)) {
|
4991
|
err = -EINVAL;
|
4992
|
goto reply;
|
4993
|
}
|
4994
|
|
4995
|
pending_inc.crush.clear();
|
4996
|
newcrush.encode(pending_inc.crush);
|
4997
|
ss << "adjusted tunables profile to " << profile;
|
4998
|
getline(ss, rs);
|
4999
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5000
|
get_last_committed() + 1));
|
5001
|
return true;
|
5002
|
} else if (prefix == "osd crush set-tunable") {
|
5003
|
CrushWrapper newcrush;
|
5004
|
_get_pending_crush(newcrush);
|
5005
|
|
5006
|
err = 0;
|
5007
|
string tunable;
|
5008
|
cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
|
5009
|
|
5010
|
int64_t value = -1;
|
5011
|
if (!cmd_getval(g_ceph_context, cmdmap, "value", value)) {
|
5012
|
err = -EINVAL;
|
5013
|
ss << "failed to parse integer value " << cmd_vartype_stringify(cmdmap["value"]);
|
5014
|
goto reply;
|
5015
|
}
|
5016
|
|
5017
|
if (tunable == "straw_calc_version") {
|
5018
|
if (value < 0 || value > 2) {
|
5019
|
ss << "value must be 0 or 1; got " << value;
|
5020
|
err = -EINVAL;
|
5021
|
goto reply;
|
5022
|
}
|
5023
|
newcrush.set_straw_calc_version(value);
|
5024
|
} else {
|
5025
|
ss << "unrecognized tunable '" << tunable << "'";
|
5026
|
err = -EINVAL;
|
5027
|
goto reply;
|
5028
|
}
|
5029
|
|
5030
|
if (!validate_crush_against_features(&newcrush, ss)) {
|
5031
|
err = -EINVAL;
|
5032
|
goto reply;
|
5033
|
}
|
5034
|
|
5035
|
pending_inc.crush.clear();
|
5036
|
newcrush.encode(pending_inc.crush);
|
5037
|
ss << "adjusted tunable " << tunable << " to " << value;
|
5038
|
getline(ss, rs);
|
5039
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5040
|
get_last_committed() + 1));
|
5041
|
return true;
|
5042
|
|
5043
|
} else if (prefix == "osd crush rule create-simple") {
|
5044
|
string name, root, type, mode;
|
5045
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
5046
|
cmd_getval(g_ceph_context, cmdmap, "root", root);
|
5047
|
cmd_getval(g_ceph_context, cmdmap, "type", type);
|
5048
|
cmd_getval(g_ceph_context, cmdmap, "mode", mode);
|
5049
|
if (mode == "")
|
5050
|
mode = "firstn";
|
5051
|
|
5052
|
if (osdmap.crush->rule_exists(name)) {
|
5053
|
|
5054
|
|
5055
|
ss << "ruleset " << name << " already exists";
|
5056
|
err = 0;
|
5057
|
goto reply;
|
5058
|
}
|
5059
|
|
5060
|
CrushWrapper newcrush;
|
5061
|
_get_pending_crush(newcrush);
|
5062
|
|
5063
|
if (newcrush.rule_exists(name)) {
|
5064
|
|
5065
|
|
5066
|
ss << "ruleset " << name << " already exists";
|
5067
|
err = 0;
|
5068
|
} else {
|
5069
|
int ruleno = newcrush.add_simple_ruleset(name, root, type, mode,
|
5070
|
pg_pool_t::TYPE_REPLICATED, &ss);
|
5071
|
if (ruleno < 0) {
|
5072
|
err = ruleno;
|
5073
|
goto reply;
|
5074
|
}
|
5075
|
|
5076
|
pending_inc.crush.clear();
|
5077
|
newcrush.encode(pending_inc.crush);
|
5078
|
}
|
5079
|
getline(ss, rs);
|
5080
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5081
|
get_last_committed() + 1));
|
5082
|
return true;
|
5083
|
|
5084
|
} else if (prefix == "osd erasure-code-profile rm") {
|
5085
|
string name;
|
5086
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
5087
|
|
5088
|
if (erasure_code_profile_in_use(pending_inc.new_pools, name, ss))
|
5089
|
goto wait;
|
5090
|
|
5091
|
if (erasure_code_profile_in_use(osdmap.pools, name, ss)) {
|
5092
|
err = -EBUSY;
|
5093
|
goto reply;
|
5094
|
}
|
5095
|
|
5096
|
if (osdmap.has_erasure_code_profile(name) ||
|
5097
|
pending_inc.new_erasure_code_profiles.count(name)) {
|
5098
|
if (osdmap.has_erasure_code_profile(name)) {
|
5099
|
pending_inc.old_erasure_code_profiles.push_back(name);
|
5100
|
} else {
|
5101
|
dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
|
5102
|
pending_inc.new_erasure_code_profiles.erase(name);
|
5103
|
}
|
5104
|
|
5105
|
getline(ss, rs);
|
5106
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5107
|
get_last_committed() + 1));
|
5108
|
return true;
|
5109
|
} else {
|
5110
|
ss << "erasure-code-profile " << name << " does not exist";
|
5111
|
err = 0;
|
5112
|
goto reply;
|
5113
|
}
|
5114
|
|
5115
|
} else if (prefix == "osd erasure-code-profile set") {
|
5116
|
string name;
|
5117
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
5118
|
vector<string> profile;
|
5119
|
cmd_getval(g_ceph_context, cmdmap, "profile", profile);
|
5120
|
bool force;
|
5121
|
if (profile.size() > 0 && profile.back() == "--force") {
|
5122
|
profile.pop_back();
|
5123
|
force = true;
|
5124
|
} else {
|
5125
|
force = false;
|
5126
|
}
|
5127
|
map<string,string> profile_map;
|
5128
|
err = parse_erasure_code_profile(profile, &profile_map, ss);
|
5129
|
if (err)
|
5130
|
goto reply;
|
5131
|
if (profile_map.find("plugin") == profile_map.end()) {
|
5132
|
ss << "erasure-code-profile " << profile_map
|
5133
|
<< " must contain a plugin entry" << std::endl;
|
5134
|
err = -EINVAL;
|
5135
|
goto reply;
|
5136
|
}
|
5137
|
string plugin = profile_map["plugin"];
|
5138
|
|
5139
|
if (osdmap.has_erasure_code_profile(name)) {
|
5140
|
if (osdmap.get_erasure_code_profile(name) == profile_map) {
|
5141
|
err = 0;
|
5142
|
goto reply;
|
5143
|
}
|
5144
|
if (!force) {
|
5145
|
err = -EPERM;
|
5146
|
ss << "will not override erasure code profile " << name
|
5147
|
<< " because the existing profile "
|
5148
|
<< osdmap.get_erasure_code_profile(name)
|
5149
|
<< " is different from the proposed profile "
|
5150
|
<< profile_map;
|
5151
|
goto reply;
|
5152
|
}
|
5153
|
}
|
5154
|
|
5155
|
if (pending_inc.has_erasure_code_profile(name)) {
|
5156
|
dout(20) << "erasure code profile " << name << " try again" << dendl;
|
5157
|
goto wait;
|
5158
|
} else {
|
5159
|
if (plugin == "isa" || plugin == "lrc") {
|
5160
|
err = check_cluster_features(CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2, ss);
|
5161
|
if (err == -EAGAIN)
|
5162
|
goto wait;
|
5163
|
if (err)
|
5164
|
goto reply;
|
5165
|
} else if (plugin == "shec") {
|
5166
|
if (!g_ceph_context->check_experimental_feature_enabled("shec", &ss)) {
|
5167
|
err = -EINVAL;
|
5168
|
goto reply;
|
5169
|
}
|
5170
|
}
|
5171
|
dout(20) << "erasure code profile " << name << " set" << dendl;
|
5172
|
pending_inc.set_erasure_code_profile(name, profile_map);
|
5173
|
}
|
5174
|
|
5175
|
getline(ss, rs);
|
5176
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5177
|
get_last_committed() + 1));
|
5178
|
return true;
|
5179
|
|
5180
|
} else if (prefix == "osd crush rule create-erasure") {
|
5181
|
err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
|
5182
|
if (err == -EAGAIN)
|
5183
|
goto wait;
|
5184
|
if (err)
|
5185
|
goto reply;
|
5186
|
string name, poolstr;
|
5187
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
5188
|
string profile;
|
5189
|
cmd_getval(g_ceph_context, cmdmap, "profile", profile);
|
5190
|
if (profile == "")
|
5191
|
profile = "default";
|
5192
|
if (profile == "default") {
|
5193
|
if (!osdmap.has_erasure_code_profile(profile)) {
|
5194
|
if (pending_inc.has_erasure_code_profile(profile)) {
|
5195
|
dout(20) << "erasure code profile " << profile << " already pending" << dendl;
|
5196
|
goto wait;
|
5197
|
}
|
5198
|
|
5199
|
map<string,string> profile_map;
|
5200
|
err = osdmap.get_erasure_code_profile_default(g_ceph_context,
|
5201
|
profile_map,
|
5202
|
&ss);
|
5203
|
if (err)
|
5204
|
goto reply;
|
5205
|
dout(20) << "erasure code profile " << profile << " set" << dendl;
|
5206
|
pending_inc.set_erasure_code_profile(profile, profile_map);
|
5207
|
goto wait;
|
5208
|
}
|
5209
|
}
|
5210
|
|
5211
|
int ruleset;
|
5212
|
err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
|
5213
|
if (err < 0) {
|
5214
|
switch(err) {
|
5215
|
case -EEXIST:
|
5216
|
ss << "rule " << name << " already exists";
|
5217
|
err = 0;
|
5218
|
goto reply;
|
5219
|
break;
|
5220
|
case -EALREADY:
|
5221
|
ss << "rule " << name << " already exists";
|
5222
|
err = 0;
|
5223
|
break;
|
5224
|
default:
|
5225
|
goto reply;
|
5226
|
break;
|
5227
|
}
|
5228
|
} else {
|
5229
|
ss << "created ruleset " << name << " at " << ruleset;
|
5230
|
}
|
5231
|
|
5232
|
getline(ss, rs);
|
5233
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5234
|
get_last_committed() + 1));
|
5235
|
return true;
|
5236
|
|
5237
|
} else if (prefix == "osd crush rule rm") {
|
5238
|
string name;
|
5239
|
cmd_getval(g_ceph_context, cmdmap, "name", name);
|
5240
|
|
5241
|
if (!osdmap.crush->rule_exists(name)) {
|
5242
|
ss << "rule " << name << " does not exist";
|
5243
|
err = 0;
|
5244
|
goto reply;
|
5245
|
}
|
5246
|
|
5247
|
CrushWrapper newcrush;
|
5248
|
_get_pending_crush(newcrush);
|
5249
|
|
5250
|
if (!newcrush.rule_exists(name)) {
|
5251
|
ss << "rule " << name << " does not exist";
|
5252
|
err = 0;
|
5253
|
} else {
|
5254
|
int ruleno = newcrush.get_rule_id(name);
|
5255
|
assert(ruleno >= 0);
|
5256
|
|
5257
|
|
5258
|
|
5259
|
|
5260
|
int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
|
5261
|
if (osdmap.crush_ruleset_in_use(ruleset)) {
|
5262
|
ss << "crush ruleset " << name << " " << ruleset << " is in use";
|
5263
|
err = -EBUSY;
|
5264
|
goto reply;
|
5265
|
}
|
5266
|
|
5267
|
err = newcrush.remove_rule(ruleno);
|
5268
|
if (err < 0) {
|
5269
|
goto reply;
|
5270
|
}
|
5271
|
|
5272
|
pending_inc.crush.clear();
|
5273
|
newcrush.encode(pending_inc.crush);
|
5274
|
}
|
5275
|
getline(ss, rs);
|
5276
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5277
|
get_last_committed() + 1));
|
5278
|
return true;
|
5279
|
|
5280
|
} else if (prefix == "osd setmaxosd") {
|
5281
|
int64_t newmax;
|
5282
|
if (!cmd_getval(g_ceph_context, cmdmap, "newmax", newmax)) {
|
5283
|
ss << "unable to parse 'newmax' value '"
|
5284
|
<< cmd_vartype_stringify(cmdmap["newmax"]) << "'";
|
5285
|
err = -EINVAL;
|
5286
|
goto reply;
|
5287
|
}
|
5288
|
|
5289
|
if (newmax > g_conf->mon_max_osd) {
|
5290
|
err = -ERANGE;
|
5291
|
ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
|
5292
|
<< g_conf->mon_max_osd << ")";
|
5293
|
goto reply;
|
5294
|
}
|
5295
|
|
5296
|
|
5297
|
|
5298
|
|
5299
|
if (newmax < osdmap.get_max_osd()) {
|
5300
|
|
5301
|
|
5302
|
|
5303
|
for (int i = newmax; i <= osdmap.get_max_osd(); i++) {
|
5304
|
if (osdmap.exists(i)) {
|
5305
|
err = -EBUSY;
|
5306
|
ss << "cannot shrink max_osd to " << newmax
|
5307
|
<< " because osd." << i << " (and possibly others) still in use";
|
5308
|
goto reply;
|
5309
|
}
|
5310
|
}
|
5311
|
}
|
5312
|
|
5313
|
pending_inc.new_max_osd = newmax;
|
5314
|
ss << "set new max_osd = " << pending_inc.new_max_osd;
|
5315
|
getline(ss, rs);
|
5316
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5317
|
get_last_committed() + 1));
|
5318
|
return true;
|
5319
|
|
5320
|
} else if (prefix == "osd pause") {
|
5321
|
return prepare_set_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
|
5322
|
|
5323
|
} else if (prefix == "osd unpause") {
|
5324
|
return prepare_unset_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
|
5325
|
|
5326
|
} else if (prefix == "osd set") {
|
5327
|
string key;
|
5328
|
cmd_getval(g_ceph_context, cmdmap, "key", key);
|
5329
|
if (key == "full")
|
5330
|
return prepare_set_flag(m, CEPH_OSDMAP_FULL);
|
5331
|
else if (key == "pause")
|
5332
|
return prepare_set_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
|
5333
|
else if (key == "noup")
|
5334
|
return prepare_set_flag(m, CEPH_OSDMAP_NOUP);
|
5335
|
else if (key == "nodown")
|
5336
|
return prepare_set_flag(m, CEPH_OSDMAP_NODOWN);
|
5337
|
else if (key == "noout")
|
5338
|
return prepare_set_flag(m, CEPH_OSDMAP_NOOUT);
|
5339
|
else if (key == "noin")
|
5340
|
return prepare_set_flag(m, CEPH_OSDMAP_NOIN);
|
5341
|
else if (key == "nobackfill")
|
5342
|
return prepare_set_flag(m, CEPH_OSDMAP_NOBACKFILL);
|
5343
|
else if (key == "norebalance")
|
5344
|
return prepare_set_flag(m, CEPH_OSDMAP_NOREBALANCE);
|
5345
|
else if (key == "norecover")
|
5346
|
return prepare_set_flag(m, CEPH_OSDMAP_NORECOVER);
|
5347
|
else if (key == "noscrub")
|
5348
|
return prepare_set_flag(m, CEPH_OSDMAP_NOSCRUB);
|
5349
|
else if (key == "nodeep-scrub")
|
5350
|
return prepare_set_flag(m, CEPH_OSDMAP_NODEEP_SCRUB);
|
5351
|
else if (key == "notieragent")
|
5352
|
return prepare_set_flag(m, CEPH_OSDMAP_NOTIERAGENT);
|
5353
|
else {
|
5354
|
ss << "unrecognized flag '" << key << "'";
|
5355
|
err = -EINVAL;
|
5356
|
}
|
5357
|
|
5358
|
} else if (prefix == "osd unset") {
|
5359
|
string key;
|
5360
|
cmd_getval(g_ceph_context, cmdmap, "key", key);
|
5361
|
if (key == "full")
|
5362
|
return prepare_unset_flag(m, CEPH_OSDMAP_FULL);
|
5363
|
else if (key == "pause")
|
5364
|
return prepare_unset_flag(m, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
|
5365
|
else if (key == "noup")
|
5366
|
return prepare_unset_flag(m, CEPH_OSDMAP_NOUP);
|
5367
|
else if (key == "nodown")
|
5368
|
return prepare_unset_flag(m, CEPH_OSDMAP_NODOWN);
|
5369
|
else if (key == "noout")
|
5370
|
return prepare_unset_flag(m, CEPH_OSDMAP_NOOUT);
|
5371
|
else if (key == "noin")
|
5372
|
return prepare_unset_flag(m, CEPH_OSDMAP_NOIN);
|
5373
|
else if (key == "nobackfill")
|
5374
|
return prepare_unset_flag(m, CEPH_OSDMAP_NOBACKFILL);
|
5375
|
else if (key == "norebalance")
|
5376
|
return prepare_unset_flag(m, CEPH_OSDMAP_NOREBALANCE);
|
5377
|
else if (key == "norecover")
|
5378
|
return prepare_unset_flag(m, CEPH_OSDMAP_NORECOVER);
|
5379
|
else if (key == "noscrub")
|
5380
|
return prepare_unset_flag(m, CEPH_OSDMAP_NOSCRUB);
|
5381
|
else if (key == "nodeep-scrub")
|
5382
|
return prepare_unset_flag(m, CEPH_OSDMAP_NODEEP_SCRUB);
|
5383
|
else if (key == "notieragent")
|
5384
|
return prepare_unset_flag(m, CEPH_OSDMAP_NOTIERAGENT);
|
5385
|
else {
|
5386
|
ss << "unrecognized flag '" << key << "'";
|
5387
|
err = -EINVAL;
|
5388
|
}
|
5389
|
|
5390
|
} else if (prefix == "osd cluster_snap") {
|
5391
|
|
5392
|
ss << "cluster snapshot currently disabled (broken implementation)";
|
5393
|
|
5394
|
|
5395
|
} else if (prefix == "osd down" ||
|
5396
|
prefix == "osd out" ||
|
5397
|
prefix == "osd in" ||
|
5398
|
prefix == "osd rm") {
|
5399
|
|
5400
|
bool any = false;
|
5401
|
|
5402
|
vector<string> idvec;
|
5403
|
cmd_getval(g_ceph_context, cmdmap, "ids", idvec);
|
5404
|
for (unsigned j = 0; j < idvec.size(); j++) {
|
5405
|
long osd = parse_osd_id(idvec[j].c_str(), &ss);
|
5406
|
if (osd < 0) {
|
5407
|
ss << "invalid osd id" << osd;
|
5408
|
err = -EINVAL;
|
5409
|
continue;
|
5410
|
} else if (!osdmap.exists(osd)) {
|
5411
|
ss << "osd." << osd << " does not exist. ";
|
5412
|
continue;
|
5413
|
}
|
5414
|
if (prefix == "osd down") {
|
5415
|
if (osdmap.is_down(osd)) {
|
5416
|
ss << "osd." << osd << " is already down. ";
|
5417
|
} else {
|
5418
|
pending_inc.new_state[osd] = CEPH_OSD_UP;
|
5419
|
ss << "marked down osd." << osd << ". ";
|
5420
|
any = true;
|
5421
|
}
|
5422
|
} else if (prefix == "osd out") {
|
5423
|
if (osdmap.is_out(osd)) {
|
5424
|
ss << "osd." << osd << " is already out. ";
|
5425
|
} else {
|
5426
|
pending_inc.new_weight[osd] = CEPH_OSD_OUT;
|
5427
|
ss << "marked out osd." << osd << ". ";
|
5428
|
any = true;
|
5429
|
}
|
5430
|
} else if (prefix == "osd in") {
|
5431
|
if (osdmap.is_in(osd)) {
|
5432
|
ss << "osd." << osd << " is already in. ";
|
5433
|
} else {
|
5434
|
pending_inc.new_weight[osd] = CEPH_OSD_IN;
|
5435
|
ss << "marked in osd." << osd << ". ";
|
5436
|
any = true;
|
5437
|
}
|
5438
|
} else if (prefix == "osd rm") {
|
5439
|
if (osdmap.is_up(osd)) {
|
5440
|
if (any)
|
5441
|
ss << ", ";
|
5442
|
ss << "osd." << osd << " is still up; must be down before removal. ";
|
5443
|
err = -EBUSY;
|
5444
|
} else {
|
5445
|
pending_inc.new_state[osd] = osdmap.get_state(osd);
|
5446
|
pending_inc.new_uuid[osd] = uuid_d();
|
5447
|
pending_metadata_rm.insert(osd);
|
5448
|
if (any) {
|
5449
|
ss << ", osd." << osd;
|
5450
|
} else {
|
5451
|
ss << "removed osd." << osd;
|
5452
|
}
|
5453
|
any = true;
|
5454
|
}
|
5455
|
}
|
5456
|
}
|
5457
|
if (any) {
|
5458
|
getline(ss, rs);
|
5459
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, err, rs,
|
5460
|
get_last_committed() + 1));
|
5461
|
return true;
|
5462
|
}
|
5463
|
} else if (prefix == "osd pg-temp") {
|
5464
|
string pgidstr;
|
5465
|
if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
|
5466
|
ss << "unable to parse 'pgid' value '"
|
5467
|
<< cmd_vartype_stringify(cmdmap["pgid"]) << "'";
|
5468
|
err = -EINVAL;
|
5469
|
goto reply;
|
5470
|
}
|
5471
|
pg_t pgid;
|
5472
|
if (!pgid.parse(pgidstr.c_str())) {
|
5473
|
ss << "invalid pgid '" << pgidstr << "'";
|
5474
|
err = -EINVAL;
|
5475
|
goto reply;
|
5476
|
}
|
5477
|
PGMap& pg_map = mon->pgmon()->pg_map;
|
5478
|
if (!pg_map.pg_stat.count(pgid)) {
|
5479
|
ss << "pg " << pgid << " does not exist";
|
5480
|
err = -ENOENT;
|
5481
|
goto reply;
|
5482
|
}
|
5483
|
|
5484
|
vector<string> id_vec;
|
5485
|
vector<int32_t> new_pg_temp;
|
5486
|
if (!cmd_getval(g_ceph_context, cmdmap, "id", id_vec)) {
|
5487
|
ss << "unable to parse 'id' value(s) '"
|
5488
|
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
|
5489
|
err = -EINVAL;
|
5490
|
goto reply;
|
5491
|
}
|
5492
|
for (unsigned i = 0; i < id_vec.size(); i++) {
|
5493
|
int32_t osd = parse_osd_id(id_vec[i].c_str(), &ss);
|
5494
|
if (osd < 0) {
|
5495
|
err = -EINVAL;
|
5496
|
goto reply;
|
5497
|
}
|
5498
|
if (!osdmap.exists(osd)) {
|
5499
|
ss << "osd." << osd << " does not exist";
|
5500
|
err = -ENOENT;
|
5501
|
goto reply;
|
5502
|
}
|
5503
|
|
5504
|
new_pg_temp.push_back(osd);
|
5505
|
}
|
5506
|
|
5507
|
pending_inc.new_pg_temp[pgid] = new_pg_temp;
|
5508
|
ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
|
5509
|
goto update;
|
5510
|
} else if (prefix == "osd primary-temp") {
|
5511
|
string pgidstr;
|
5512
|
if (!cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr)) {
|
5513
|
ss << "unable to parse 'pgid' value '"
|
5514
|
<< cmd_vartype_stringify(cmdmap["pgid"]) << "'";
|
5515
|
err = -EINVAL;
|
5516
|
goto reply;
|
5517
|
}
|
5518
|
pg_t pgid;
|
5519
|
if (!pgid.parse(pgidstr.c_str())) {
|
5520
|
ss << "invalid pgid '" << pgidstr << "'";
|
5521
|
err = -EINVAL;
|
5522
|
goto reply;
|
5523
|
}
|
5524
|
PGMap& pg_map = mon->pgmon()->pg_map;
|
5525
|
if (!pg_map.pg_stat.count(pgid)) {
|
5526
|
ss << "pg " << pgid << " does not exist";
|
5527
|
err = -ENOENT;
|
5528
|
goto reply;
|
5529
|
}
|
5530
|
|
5531
|
string id;
|
5532
|
int32_t osd;
|
5533
|
if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
|
5534
|
ss << "unable to parse 'id' value '"
|
5535
|
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
|
5536
|
err = -EINVAL;
|
5537
|
goto reply;
|
5538
|
}
|
5539
|
if (strcmp(id.c_str(), "-1")) {
|
5540
|
osd = parse_osd_id(id.c_str(), &ss);
|
5541
|
if (osd < 0) {
|
5542
|
err = -EINVAL;
|
5543
|
goto reply;
|
5544
|
}
|
5545
|
if (!osdmap.exists(osd)) {
|
5546
|
ss << "osd." << osd << " does not exist";
|
5547
|
err = -ENOENT;
|
5548
|
goto reply;
|
5549
|
}
|
5550
|
} else {
|
5551
|
osd = -1;
|
5552
|
}
|
5553
|
|
5554
|
if (!g_conf->mon_osd_allow_primary_temp) {
|
5555
|
ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
|
5556
|
err = -EPERM;
|
5557
|
goto reply;
|
5558
|
}
|
5559
|
|
5560
|
pending_inc.new_primary_temp[pgid] = osd;
|
5561
|
ss << "set " << pgid << " primary_temp mapping to " << osd;
|
5562
|
goto update;
|
5563
|
} else if (prefix == "osd primary-affinity") {
|
5564
|
int64_t id;
|
5565
|
if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
|
5566
|
ss << "invalid osd id value '"
|
5567
|
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
|
5568
|
err = -EINVAL;
|
5569
|
goto reply;
|
5570
|
}
|
5571
|
double w;
|
5572
|
if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
|
5573
|
ss << "unable to parse 'weight' value '"
|
5574
|
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
|
5575
|
err = -EINVAL;
|
5576
|
goto reply;
|
5577
|
}
|
5578
|
long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
|
5579
|
if (ww < 0L) {
|
5580
|
ss << "weight must be >= 0";
|
5581
|
err = -EINVAL;
|
5582
|
goto reply;
|
5583
|
}
|
5584
|
if (!g_conf->mon_osd_allow_primary_affinity) {
|
5585
|
ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
|
5586
|
err = -EPERM;
|
5587
|
goto reply;
|
5588
|
}
|
5589
|
err = check_cluster_features(CEPH_FEATURE_OSD_PRIMARY_AFFINITY, ss);
|
5590
|
if (err == -EAGAIN)
|
5591
|
goto wait;
|
5592
|
if (err < 0)
|
5593
|
goto reply;
|
5594
|
if (osdmap.exists(id)) {
|
5595
|
pending_inc.new_primary_affinity[id] = ww;
|
5596
|
ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
|
5597
|
getline(ss, rs);
|
5598
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5599
|
get_last_committed() + 1));
|
5600
|
return true;
|
5601
|
}
|
5602
|
} else if (prefix == "osd reweight") {
|
5603
|
int64_t id;
|
5604
|
if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
|
5605
|
ss << "unable to parse osd id value '"
|
5606
|
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
|
5607
|
err = -EINVAL;
|
5608
|
goto reply;
|
5609
|
}
|
5610
|
double w;
|
5611
|
if (!cmd_getval(g_ceph_context, cmdmap, "weight", w)) {
|
5612
|
ss << "unable to parse weight value '"
|
5613
|
<< cmd_vartype_stringify(cmdmap["weight"]) << "'";
|
5614
|
err = -EINVAL;
|
5615
|
goto reply;
|
5616
|
}
|
5617
|
long ww = (int)((double)CEPH_OSD_IN*w);
|
5618
|
if (ww < 0L) {
|
5619
|
ss << "weight must be >= 0";
|
5620
|
err = -EINVAL;
|
5621
|
goto reply;
|
5622
|
}
|
5623
|
if (osdmap.exists(id)) {
|
5624
|
pending_inc.new_weight[id] = ww;
|
5625
|
ss << "reweighted osd." << id << " to " << w << " (" << ios::hex << ww << ios::dec << ")";
|
5626
|
getline(ss, rs);
|
5627
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5628
|
get_last_committed() + 1));
|
5629
|
return true;
|
5630
|
}
|
5631
|
|
5632
|
} else if (prefix == "osd lost") {
|
5633
|
int64_t id;
|
5634
|
if (!cmd_getval(g_ceph_context, cmdmap, "id", id)) {
|
5635
|
ss << "unable to parse osd id value '"
|
5636
|
<< cmd_vartype_stringify(cmdmap["id"]) << "'";
|
5637
|
err = -EINVAL;
|
5638
|
goto reply;
|
5639
|
}
|
5640
|
string sure;
|
5641
|
if (!cmd_getval(g_ceph_context, cmdmap, "sure", sure) || sure != "--yes-i-really-mean-it") {
|
5642
|
ss << "are you SURE? this might mean real, permanent data loss. pass "
|
5643
|
"--yes-i-really-mean-it if you really do.";
|
5644
|
err = -EPERM;
|
5645
|
goto reply;
|
5646
|
} else if (!osdmap.exists(id) || !osdmap.is_down(id)) {
|
5647
|
ss << "osd." << id << " is not down or doesn't exist";
|
5648
|
} else {
|
5649
|
epoch_t e = osdmap.get_info(id).down_at;
|
5650
|
pending_inc.new_lost[id] = e;
|
5651
|
ss << "marked osd lost in epoch " << e;
|
5652
|
getline(ss, rs);
|
5653
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5654
|
get_last_committed() + 1));
|
5655
|
return true;
|
5656
|
}
|
5657
|
|
5658
|
} else if (prefix == "osd create") {
|
5659
|
int i = -1;
|
5660
|
|
5661
|
|
5662
|
uuid_d uuid;
|
5663
|
string uuidstr;
|
5664
|
if (cmd_getval(g_ceph_context, cmdmap, "uuid", uuidstr)) {
|
5665
|
if (!uuid.parse(uuidstr.c_str())) {
|
5666
|
err = -EINVAL;
|
5667
|
goto reply;
|
5668
|
}
|
5669
|
dout(10) << " osd create got uuid " << uuid << dendl;
|
5670
|
i = osdmap.identify_osd(uuid);
|
5671
|
if (i >= 0) {
|
5672
|
|
5673
|
err = 0;
|
5674
|
if (f) {
|
5675
|
f->open_object_section("created_osd");
|
5676
|
f->dump_int("osdid", i);
|
5677
|
f->close_section();
|
5678
|
f->flush(rdata);
|
5679
|
} else {
|
5680
|
ss << i;
|
5681
|
rdata.append(ss);
|
5682
|
}
|
5683
|
goto reply;
|
5684
|
}
|
5685
|
i = pending_inc.identify_osd(uuid);
|
5686
|
if (i >= 0) {
|
5687
|
|
5688
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
5689
|
return true;
|
5690
|
}
|
5691
|
}
|
5692
|
|
5693
|
|
5694
|
for (i=0; i < osdmap.get_max_osd(); i++) {
|
5695
|
if (!osdmap.exists(i) &&
|
5696
|
pending_inc.new_up_client.count(i) == 0 &&
|
5697
|
(pending_inc.new_state.count(i) == 0 ||
|
5698
|
(pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0))
|
5699
|
goto done;
|
5700
|
}
|
5701
|
|
5702
|
|
5703
|
if (pending_inc.new_max_osd < 0)
|
5704
|
pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
|
5705
|
else
|
5706
|
pending_inc.new_max_osd++;
|
5707
|
i = pending_inc.new_max_osd - 1;
|
5708
|
|
5709
|
done:
|
5710
|
dout(10) << " creating osd." << i << dendl;
|
5711
|
pending_inc.new_state[i] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
|
5712
|
if (!uuid.is_zero())
|
5713
|
pending_inc.new_uuid[i] = uuid;
|
5714
|
if (f) {
|
5715
|
f->open_object_section("created_osd");
|
5716
|
f->dump_int("osdid", i);
|
5717
|
f->close_section();
|
5718
|
f->flush(rdata);
|
5719
|
} else {
|
5720
|
ss << i;
|
5721
|
rdata.append(ss);
|
5722
|
}
|
5723
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, rdata,
|
5724
|
get_last_committed() + 1));
|
5725
|
return true;
|
5726
|
|
5727
|
} else if (prefix == "osd blacklist") {
|
5728
|
string addrstr;
|
5729
|
cmd_getval(g_ceph_context, cmdmap, "addr", addrstr);
|
5730
|
entity_addr_t addr;
|
5731
|
if (!addr.parse(addrstr.c_str(), 0))
|
5732
|
ss << "unable to parse address " << addrstr;
|
5733
|
else {
|
5734
|
string blacklistop;
|
5735
|
cmd_getval(g_ceph_context, cmdmap, "blacklistop", blacklistop);
|
5736
|
if (blacklistop == "add") {
|
5737
|
utime_t expires = ceph_clock_now(g_ceph_context);
|
5738
|
double d;
|
5739
|
|
5740
|
cmd_getval(g_ceph_context, cmdmap, "expire", d, double(60*60));
|
5741
|
expires += d;
|
5742
|
|
5743
|
pending_inc.new_blacklist[addr] = expires;
|
5744
|
ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
|
5745
|
getline(ss, rs);
|
5746
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5747
|
get_last_committed() + 1));
|
5748
|
return true;
|
5749
|
} else if (blacklistop == "rm") {
|
5750
|
if (osdmap.is_blacklisted(addr) ||
|
5751
|
pending_inc.new_blacklist.count(addr)) {
|
5752
|
if (osdmap.is_blacklisted(addr))
|
5753
|
pending_inc.old_blacklist.push_back(addr);
|
5754
|
else
|
5755
|
pending_inc.new_blacklist.erase(addr);
|
5756
|
ss << "un-blacklisting " << addr;
|
5757
|
getline(ss, rs);
|
5758
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5759
|
get_last_committed() + 1));
|
5760
|
return true;
|
5761
|
}
|
5762
|
ss << addr << " isn't blacklisted";
|
5763
|
err = 0;
|
5764
|
goto reply;
|
5765
|
}
|
5766
|
}
|
5767
|
} else if (prefix == "osd pool mksnap") {
|
5768
|
string poolstr;
|
5769
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
5770
|
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
|
5771
|
if (pool < 0) {
|
5772
|
ss << "unrecognized pool '" << poolstr << "'";
|
5773
|
err = -ENOENT;
|
5774
|
goto reply;
|
5775
|
}
|
5776
|
string snapname;
|
5777
|
cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
|
5778
|
const pg_pool_t *p = osdmap.get_pg_pool(pool);
|
5779
|
if (p->is_unmanaged_snaps_mode()) {
|
5780
|
ss << "pool " << poolstr << " is in unmanaged snaps mode";
|
5781
|
err = -EINVAL;
|
5782
|
goto reply;
|
5783
|
} else if (p->snap_exists(snapname.c_str())) {
|
5784
|
ss << "pool " << poolstr << " snap " << snapname << " already exists";
|
5785
|
err = 0;
|
5786
|
goto reply;
|
5787
|
}
|
5788
|
pg_pool_t *pp = 0;
|
5789
|
if (pending_inc.new_pools.count(pool))
|
5790
|
pp = &pending_inc.new_pools[pool];
|
5791
|
if (!pp) {
|
5792
|
pp = &pending_inc.new_pools[pool];
|
5793
|
*pp = *p;
|
5794
|
}
|
5795
|
if (pp->snap_exists(snapname.c_str())) {
|
5796
|
ss << "pool " << poolstr << " snap " << snapname << " already exists";
|
5797
|
} else {
|
5798
|
pp->add_snap(snapname.c_str(), ceph_clock_now(g_ceph_context));
|
5799
|
pp->set_snap_epoch(pending_inc.epoch);
|
5800
|
ss << "created pool " << poolstr << " snap " << snapname;
|
5801
|
}
|
5802
|
getline(ss, rs);
|
5803
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5804
|
get_last_committed() + 1));
|
5805
|
return true;
|
5806
|
} else if (prefix == "osd pool rmsnap") {
|
5807
|
string poolstr;
|
5808
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
5809
|
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
|
5810
|
if (pool < 0) {
|
5811
|
ss << "unrecognized pool '" << poolstr << "'";
|
5812
|
err = -ENOENT;
|
5813
|
goto reply;
|
5814
|
}
|
5815
|
string snapname;
|
5816
|
cmd_getval(g_ceph_context, cmdmap, "snap", snapname);
|
5817
|
const pg_pool_t *p = osdmap.get_pg_pool(pool);
|
5818
|
if (p->is_unmanaged_snaps_mode()) {
|
5819
|
ss << "pool " << poolstr << " is in unmanaged snaps mode";
|
5820
|
err = -EINVAL;
|
5821
|
goto reply;
|
5822
|
} else if (!p->snap_exists(snapname.c_str())) {
|
5823
|
ss << "pool " << poolstr << " snap " << snapname << " does not exist";
|
5824
|
err = 0;
|
5825
|
goto reply;
|
5826
|
}
|
5827
|
pg_pool_t *pp = 0;
|
5828
|
if (pending_inc.new_pools.count(pool))
|
5829
|
pp = &pending_inc.new_pools[pool];
|
5830
|
if (!pp) {
|
5831
|
pp = &pending_inc.new_pools[pool];
|
5832
|
*pp = *p;
|
5833
|
}
|
5834
|
snapid_t sn = pp->snap_exists(snapname.c_str());
|
5835
|
if (sn) {
|
5836
|
pp->remove_snap(sn);
|
5837
|
pp->set_snap_epoch(pending_inc.epoch);
|
5838
|
ss << "removed pool " << poolstr << " snap " << snapname;
|
5839
|
} else {
|
5840
|
ss << "already removed pool " << poolstr << " snap " << snapname;
|
5841
|
}
|
5842
|
getline(ss, rs);
|
5843
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5844
|
get_last_committed() + 1));
|
5845
|
return true;
|
5846
|
} else if (prefix == "osd pool create") {
|
5847
|
int64_t pg_num;
|
5848
|
int64_t pgp_num;
|
5849
|
cmd_getval(g_ceph_context, cmdmap, "pg_num", pg_num, int64_t(0));
|
5850
|
if ((pg_num == 0) || (pg_num > g_conf->mon_max_pool_pg_num)) {
|
5851
|
ss << "'pg_num' must be greater than 0 and less than or equal to "
|
5852
|
<< g_conf->mon_max_pool_pg_num
|
5853
|
<< " (you may adjust 'mon max pool pg num' for higher values)";
|
5854
|
err = -ERANGE;
|
5855
|
goto reply;
|
5856
|
}
|
5857
|
|
5858
|
cmd_getval(g_ceph_context, cmdmap, "pgp_num", pgp_num, pg_num);
|
5859
|
if ((pgp_num == 0) || (pgp_num > pg_num)) {
|
5860
|
ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
|
5861
|
<< ", which in this case is " << pg_num;
|
5862
|
err = -ERANGE;
|
5863
|
goto reply;
|
5864
|
}
|
5865
|
|
5866
|
string pool_type_str;
|
5867
|
cmd_getval(g_ceph_context, cmdmap, "pool_type", pool_type_str);
|
5868
|
if (pool_type_str.empty())
|
5869
|
pool_type_str = pg_pool_t::get_default_type();
|
5870
|
|
5871
|
string poolstr;
|
5872
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
5873
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
5874
|
if (pool_id >= 0) {
|
5875
|
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
|
5876
|
if (pool_type_str != p->get_type_name()) {
|
5877
|
ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
|
5878
|
err = -EINVAL;
|
5879
|
} else {
|
5880
|
ss << "pool '" << poolstr << "' already exists";
|
5881
|
err = 0;
|
5882
|
}
|
5883
|
goto reply;
|
5884
|
}
|
5885
|
|
5886
|
int pool_type;
|
5887
|
if (pool_type_str == "replicated") {
|
5888
|
pool_type = pg_pool_t::TYPE_REPLICATED;
|
5889
|
} else if (pool_type_str == "erasure") {
|
5890
|
err = check_cluster_features(CEPH_FEATURE_CRUSH_V2 |
|
5891
|
CEPH_FEATURE_OSD_ERASURE_CODES,
|
5892
|
ss);
|
5893
|
if (err == -EAGAIN)
|
5894
|
goto wait;
|
5895
|
if (err)
|
5896
|
goto reply;
|
5897
|
pool_type = pg_pool_t::TYPE_ERASURE;
|
5898
|
} else {
|
5899
|
ss << "unknown pool type '" << pool_type_str << "'";
|
5900
|
err = -EINVAL;
|
5901
|
goto reply;
|
5902
|
}
|
5903
|
|
5904
|
bool implicit_ruleset_creation = false;
|
5905
|
string ruleset_name;
|
5906
|
cmd_getval(g_ceph_context, cmdmap, "ruleset", ruleset_name);
|
5907
|
string erasure_code_profile;
|
5908
|
cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
|
5909
|
|
5910
|
if (pool_type == pg_pool_t::TYPE_ERASURE) {
|
5911
|
if (erasure_code_profile == "")
|
5912
|
erasure_code_profile = "default";
|
5913
|
|
5914
|
if (erasure_code_profile == "default") {
|
5915
|
if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
|
5916
|
if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
|
5917
|
dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
|
5918
|
goto wait;
|
5919
|
}
|
5920
|
|
5921
|
map<string,string> profile_map;
|
5922
|
err = osdmap.get_erasure_code_profile_default(g_ceph_context,
|
5923
|
profile_map,
|
5924
|
&ss);
|
5925
|
if (err)
|
5926
|
goto reply;
|
5927
|
dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
|
5928
|
pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
|
5929
|
goto wait;
|
5930
|
}
|
5931
|
}
|
5932
|
if (ruleset_name == "") {
|
5933
|
implicit_ruleset_creation = true;
|
5934
|
if (erasure_code_profile == "default") {
|
5935
|
ruleset_name = "erasure-code";
|
5936
|
} else {
|
5937
|
dout(1) << "implicitly use ruleset named after the pool: "
|
5938
|
<< poolstr << dendl;
|
5939
|
ruleset_name = poolstr;
|
5940
|
}
|
5941
|
}
|
5942
|
} else {
|
5943
|
|
5944
|
ruleset_name = erasure_code_profile;
|
5945
|
}
|
5946
|
|
5947
|
if (!implicit_ruleset_creation && ruleset_name != "") {
|
5948
|
int ruleset;
|
5949
|
err = get_crush_ruleset(ruleset_name, &ruleset, ss);
|
5950
|
if (err == -EAGAIN) {
|
5951
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
5952
|
return true;
|
5953
|
}
|
5954
|
if (err)
|
5955
|
goto reply;
|
5956
|
}
|
5957
|
|
5958
|
int64_t expected_num_objects;
|
5959
|
cmd_getval(g_ceph_context, cmdmap, "expected_num_objects", expected_num_objects, int64_t(0));
|
5960
|
if (expected_num_objects < 0) {
|
5961
|
ss << "'expected_num_objects' must be non-negative";
|
5962
|
err = -EINVAL;
|
5963
|
goto reply;
|
5964
|
}
|
5965
|
err = prepare_new_pool(poolstr, 0,
|
5966
|
-1,
|
5967
|
ruleset_name,
|
5968
|
pg_num, pgp_num,
|
5969
|
erasure_code_profile, pool_type,
|
5970
|
(uint64_t)expected_num_objects,
|
5971
|
ss);
|
5972
|
if (err < 0) {
|
5973
|
switch(err) {
|
5974
|
case -EEXIST:
|
5975
|
ss << "pool '" << poolstr << "' already exists";
|
5976
|
break;
|
5977
|
case -EAGAIN:
|
5978
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
5979
|
return true;
|
5980
|
default:
|
5981
|
goto reply;
|
5982
|
break;
|
5983
|
}
|
5984
|
} else {
|
5985
|
ss << "pool '" << poolstr << "' created";
|
5986
|
}
|
5987
|
getline(ss, rs);
|
5988
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
5989
|
get_last_committed() + 1));
|
5990
|
return true;
|
5991
|
|
5992
|
} else if (prefix == "osd pool delete") {
|
5993
|
|
5994
|
string poolstr, poolstr2, sure;
|
5995
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
5996
|
cmd_getval(g_ceph_context, cmdmap, "pool2", poolstr2);
|
5997
|
cmd_getval(g_ceph_context, cmdmap, "sure", sure);
|
5998
|
int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
|
5999
|
if (pool < 0) {
|
6000
|
ss << "pool '" << poolstr << "' does not exist";
|
6001
|
err = 0;
|
6002
|
goto reply;
|
6003
|
}
|
6004
|
|
6005
|
if (poolstr2 != poolstr || sure != "--yes-i-really-really-mean-it") {
|
6006
|
ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
|
6007
|
<< ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
|
6008
|
<< "followed by --yes-i-really-really-mean-it.";
|
6009
|
err = -EPERM;
|
6010
|
goto reply;
|
6011
|
}
|
6012
|
err = _prepare_remove_pool(pool, &ss);
|
6013
|
if (err == -EAGAIN) {
|
6014
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
6015
|
return true;
|
6016
|
}
|
6017
|
if (err < 0)
|
6018
|
goto reply;
|
6019
|
goto update;
|
6020
|
} else if (prefix == "osd pool rename") {
|
6021
|
string srcpoolstr, destpoolstr;
|
6022
|
cmd_getval(g_ceph_context, cmdmap, "srcpool", srcpoolstr);
|
6023
|
cmd_getval(g_ceph_context, cmdmap, "destpool", destpoolstr);
|
6024
|
int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
|
6025
|
int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
|
6026
|
|
6027
|
if (pool_src < 0) {
|
6028
|
if (pool_dst >= 0) {
|
6029
|
|
6030
|
|
6031
|
|
6032
|
|
6033
|
|
6034
|
|
6035
|
ss << "pool '" << srcpoolstr << "' does not exist; pool '"
|
6036
|
<< destpoolstr << "' does -- assuming successful rename";
|
6037
|
err = 0;
|
6038
|
} else {
|
6039
|
ss << "unrecognized pool '" << srcpoolstr << "'";
|
6040
|
err = -ENOENT;
|
6041
|
}
|
6042
|
goto reply;
|
6043
|
} else if (pool_dst >= 0) {
|
6044
|
|
6045
|
ss << "pool '" << destpoolstr << "' already exists";
|
6046
|
err = -EEXIST;
|
6047
|
goto reply;
|
6048
|
}
|
6049
|
|
6050
|
int ret = _prepare_rename_pool(pool_src, destpoolstr);
|
6051
|
if (ret == 0) {
|
6052
|
ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
|
6053
|
} else {
|
6054
|
ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
|
6055
|
<< cpp_strerror(ret);
|
6056
|
}
|
6057
|
getline(ss, rs);
|
6058
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, ret, rs,
|
6059
|
get_last_committed() + 1));
|
6060
|
return true;
|
6061
|
|
6062
|
} else if (prefix == "osd pool set") {
|
6063
|
err = prepare_command_pool_set(cmdmap, ss);
|
6064
|
if (err == -EAGAIN)
|
6065
|
goto wait;
|
6066
|
if (err < 0)
|
6067
|
goto reply;
|
6068
|
|
6069
|
getline(ss, rs);
|
6070
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
6071
|
get_last_committed() + 1));
|
6072
|
return true;
|
6073
|
} else if (prefix == "osd tier add") {
|
6074
|
err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
|
6075
|
if (err == -EAGAIN)
|
6076
|
goto wait;
|
6077
|
if (err)
|
6078
|
goto reply;
|
6079
|
string poolstr;
|
6080
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
6081
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
6082
|
if (pool_id < 0) {
|
6083
|
ss << "unrecognized pool '" << poolstr << "'";
|
6084
|
err = -ENOENT;
|
6085
|
goto reply;
|
6086
|
}
|
6087
|
string tierpoolstr;
|
6088
|
cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
|
6089
|
int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
|
6090
|
if (tierpool_id < 0) {
|
6091
|
ss << "unrecognized pool '" << tierpoolstr << "'";
|
6092
|
err = -ENOENT;
|
6093
|
goto reply;
|
6094
|
}
|
6095
|
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
|
6096
|
assert(p);
|
6097
|
const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
|
6098
|
assert(tp);
|
6099
|
|
6100
|
if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
|
6101
|
goto reply;
|
6102
|
}
|
6103
|
|
6104
|
|
6105
|
string force_nonempty;
|
6106
|
cmd_getval(g_ceph_context, cmdmap, "force_nonempty", force_nonempty);
|
6107
|
const pool_stat_t& tier_stats =
|
6108
|
mon->pgmon()->pg_map.get_pg_pool_sum_stat(tierpool_id);
|
6109
|
if (tier_stats.stats.sum.num_objects != 0 &&
|
6110
|
force_nonempty != "--force-nonempty") {
|
6111
|
ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
|
6112
|
err = -ENOTEMPTY;
|
6113
|
goto reply;
|
6114
|
}
|
6115
|
|
6116
|
pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
|
6117
|
pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
|
6118
|
if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
|
6119
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
6120
|
return true;
|
6121
|
}
|
6122
|
np->tiers.insert(tierpool_id);
|
6123
|
np->set_snap_epoch(pending_inc.epoch);
|
6124
|
ntp->tier_of = pool_id;
|
6125
|
ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
|
6126
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
|
6127
|
get_last_committed() + 1));
|
6128
|
return true;
|
6129
|
} else if (prefix == "osd tier remove") {
|
6130
|
string poolstr;
|
6131
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
6132
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
6133
|
if (pool_id < 0) {
|
6134
|
ss << "unrecognized pool '" << poolstr << "'";
|
6135
|
err = -ENOENT;
|
6136
|
goto reply;
|
6137
|
}
|
6138
|
string tierpoolstr;
|
6139
|
cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
|
6140
|
int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
|
6141
|
if (tierpool_id < 0) {
|
6142
|
ss << "unrecognized pool '" << tierpoolstr << "'";
|
6143
|
err = -ENOENT;
|
6144
|
goto reply;
|
6145
|
}
|
6146
|
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
|
6147
|
assert(p);
|
6148
|
const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
|
6149
|
assert(tp);
|
6150
|
|
6151
|
if (!_check_remove_tier(pool_id, p, &err, &ss)) {
|
6152
|
goto reply;
|
6153
|
}
|
6154
|
|
6155
|
if (p->tiers.count(tierpool_id) == 0) {
|
6156
|
ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
|
6157
|
err = 0;
|
6158
|
goto reply;
|
6159
|
}
|
6160
|
if (tp->tier_of != pool_id) {
|
6161
|
ss << "tier pool '" << tierpoolstr << "' is a tier of '"
|
6162
|
<< osdmap.get_pool_name(tp->tier_of) << "': "
|
6163
|
|
6164
|
<< "THIS SHOULD NOT HAVE HAPPENED AT ALL";
|
6165
|
err = -EINVAL;
|
6166
|
goto reply;
|
6167
|
}
|
6168
|
if (p->read_tier == tierpool_id) {
|
6169
|
ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
|
6170
|
err = -EBUSY;
|
6171
|
goto reply;
|
6172
|
}
|
6173
|
|
6174
|
pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
|
6175
|
pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
|
6176
|
if (np->tiers.count(tierpool_id) == 0 ||
|
6177
|
ntp->tier_of != pool_id ||
|
6178
|
np->read_tier == tierpool_id) {
|
6179
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
6180
|
return true;
|
6181
|
}
|
6182
|
np->tiers.erase(tierpool_id);
|
6183
|
ntp->clear_tier();
|
6184
|
ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
|
6185
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
|
6186
|
get_last_committed() + 1));
|
6187
|
return true;
|
6188
|
} else if (prefix == "osd tier set-overlay") {
|
6189
|
err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
|
6190
|
if (err == -EAGAIN)
|
6191
|
goto wait;
|
6192
|
if (err)
|
6193
|
goto reply;
|
6194
|
string poolstr;
|
6195
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
6196
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
6197
|
if (pool_id < 0) {
|
6198
|
ss << "unrecognized pool '" << poolstr << "'";
|
6199
|
err = -ENOENT;
|
6200
|
goto reply;
|
6201
|
}
|
6202
|
string overlaypoolstr;
|
6203
|
cmd_getval(g_ceph_context, cmdmap, "overlaypool", overlaypoolstr);
|
6204
|
int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
|
6205
|
if (overlaypool_id < 0) {
|
6206
|
ss << "unrecognized pool '" << overlaypoolstr << "'";
|
6207
|
err = -ENOENT;
|
6208
|
goto reply;
|
6209
|
}
|
6210
|
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
|
6211
|
assert(p);
|
6212
|
if (p->tiers.count(overlaypool_id) == 0) {
|
6213
|
ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
|
6214
|
err = -EINVAL;
|
6215
|
goto reply;
|
6216
|
}
|
6217
|
if (p->read_tier == overlaypool_id) {
|
6218
|
err = 0;
|
6219
|
ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
|
6220
|
goto reply;
|
6221
|
}
|
6222
|
if (p->has_read_tier()) {
|
6223
|
ss << "pool '" << poolstr << "' has overlay '"
|
6224
|
<< osdmap.get_pool_name(p->read_tier)
|
6225
|
<< "'; please remove-overlay first";
|
6226
|
err = -EINVAL;
|
6227
|
goto reply;
|
6228
|
}
|
6229
|
|
6230
|
|
6231
|
pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
|
6232
|
np->read_tier = overlaypool_id;
|
6233
|
np->write_tier = overlaypool_id;
|
6234
|
np->last_force_op_resend = pending_inc.epoch;
|
6235
|
ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
|
6236
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
|
6237
|
get_last_committed() + 1));
|
6238
|
return true;
|
6239
|
} else if (prefix == "osd tier remove-overlay") {
|
6240
|
string poolstr;
|
6241
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
6242
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
6243
|
if (pool_id < 0) {
|
6244
|
ss << "unrecognized pool '" << poolstr << "'";
|
6245
|
err = -ENOENT;
|
6246
|
goto reply;
|
6247
|
}
|
6248
|
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
|
6249
|
assert(p);
|
6250
|
if (!p->has_read_tier()) {
|
6251
|
err = 0;
|
6252
|
ss << "there is now (or already was) no overlay for '" << poolstr << "'";
|
6253
|
goto reply;
|
6254
|
}
|
6255
|
|
6256
|
if (!_check_remove_tier(pool_id, p, &err, &ss)) {
|
6257
|
goto reply;
|
6258
|
}
|
6259
|
|
6260
|
|
6261
|
pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
|
6262
|
np->clear_read_tier();
|
6263
|
np->clear_write_tier();
|
6264
|
np->last_force_op_resend = pending_inc.epoch;
|
6265
|
ss << "there is now (or already was) no overlay for '" << poolstr << "'";
|
6266
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
|
6267
|
get_last_committed() + 1));
|
6268
|
return true;
|
6269
|
} else if (prefix == "osd tier cache-mode") {
|
6270
|
err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
|
6271
|
if (err == -EAGAIN)
|
6272
|
goto wait;
|
6273
|
if (err)
|
6274
|
goto reply;
|
6275
|
string poolstr;
|
6276
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
6277
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
6278
|
if (pool_id < 0) {
|
6279
|
ss << "unrecognized pool '" << poolstr << "'";
|
6280
|
err = -ENOENT;
|
6281
|
goto reply;
|
6282
|
}
|
6283
|
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
|
6284
|
assert(p);
|
6285
|
if (!p->is_tier()) {
|
6286
|
ss << "pool '" << poolstr << "' is not a tier";
|
6287
|
err = -EINVAL;
|
6288
|
goto reply;
|
6289
|
}
|
6290
|
string modestr;
|
6291
|
cmd_getval(g_ceph_context, cmdmap, "mode", modestr);
|
6292
|
pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
|
6293
|
if (mode < 0) {
|
6294
|
ss << "'" << modestr << "' is not a valid cache mode";
|
6295
|
err = -EINVAL;
|
6296
|
goto reply;
|
6297
|
}
|
6298
|
|
6299
|
|
6300
|
if (p->cache_mode == mode &&
|
6301
|
(pending_inc.new_pools.count(pool_id) == 0 ||
|
6302
|
pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
|
6303
|
ss << "set cache-mode for pool '" << poolstr << "'"
|
6304
|
<< " to " << pg_pool_t::get_cache_mode_name(mode);
|
6305
|
err = 0;
|
6306
|
goto reply;
|
6307
|
}
|
6308
|
|
6309
|
|
6310
|
|
6311
|
|
6312
|
|
6313
|
|
6314
|
|
6315
|
|
6316
|
|
6317
|
|
6318
|
|
6319
|
|
6320
|
|
6321
|
|
6322
|
|
6323
|
|
6324
|
|
6325
|
|
6326
|
|
6327
|
|
6328
|
|
6329
|
|
6330
|
|
6331
|
|
6332
|
if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
|
6333
|
(mode != pg_pool_t::CACHEMODE_FORWARD &&
|
6334
|
mode != pg_pool_t::CACHEMODE_READFORWARD &&
|
6335
|
mode != pg_pool_t::CACHEMODE_READPROXY)) {
|
6336
|
ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
|
6337
|
<< "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
|
6338
|
<< "' pool; only '"
|
6339
|
<< pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
|
6340
|
<< "','"
|
6341
|
<< pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
|
6342
|
<< "','"
|
6343
|
<< pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
|
6344
|
<< "' allowed.";
|
6345
|
err = -EINVAL;
|
6346
|
goto reply;
|
6347
|
}
|
6348
|
if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
|
6349
|
(mode != pg_pool_t::CACHEMODE_WRITEBACK &&
|
6350
|
mode != pg_pool_t::CACHEMODE_FORWARD &&
|
6351
|
mode != pg_pool_t::CACHEMODE_READPROXY)) ||
|
6352
|
|
6353
|
(p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
|
6354
|
(mode != pg_pool_t::CACHEMODE_WRITEBACK &&
|
6355
|
mode != pg_pool_t::CACHEMODE_FORWARD &&
|
6356
|
mode != pg_pool_t::CACHEMODE_READFORWARD)) ||
|
6357
|
|
6358
|
(p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
|
6359
|
(mode != pg_pool_t::CACHEMODE_WRITEBACK &&
|
6360
|
mode != pg_pool_t::CACHEMODE_READFORWARD &&
|
6361
|
mode != pg_pool_t::CACHEMODE_READPROXY))) {
|
6362
|
|
6363
|
const pool_stat_t& tier_stats =
|
6364
|
mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
|
6365
|
|
6366
|
if (tier_stats.stats.sum.num_objects_dirty > 0) {
|
6367
|
ss << "unable to set cache-mode '"
|
6368
|
<< pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
|
6369
|
<< "': dirty objects found";
|
6370
|
err = -EBUSY;
|
6371
|
goto reply;
|
6372
|
}
|
6373
|
}
|
6374
|
|
6375
|
pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
|
6376
|
np->cache_mode = mode;
|
6377
|
|
6378
|
|
6379
|
np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
|
6380
|
ss << "set cache-mode for pool '" << poolstr
|
6381
|
<< "' to " << pg_pool_t::get_cache_mode_name(mode);
|
6382
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
|
6383
|
get_last_committed() + 1));
|
6384
|
return true;
|
6385
|
} else if (prefix == "osd tier add-cache") {
|
6386
|
err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
|
6387
|
if (err == -EAGAIN)
|
6388
|
goto wait;
|
6389
|
if (err)
|
6390
|
goto reply;
|
6391
|
string poolstr;
|
6392
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
6393
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
6394
|
if (pool_id < 0) {
|
6395
|
ss << "unrecognized pool '" << poolstr << "'";
|
6396
|
err = -ENOENT;
|
6397
|
goto reply;
|
6398
|
}
|
6399
|
string tierpoolstr;
|
6400
|
cmd_getval(g_ceph_context, cmdmap, "tierpool", tierpoolstr);
|
6401
|
int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
|
6402
|
if (tierpool_id < 0) {
|
6403
|
ss << "unrecognized pool '" << tierpoolstr << "'";
|
6404
|
err = -ENOENT;
|
6405
|
goto reply;
|
6406
|
}
|
6407
|
const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
|
6408
|
assert(p);
|
6409
|
const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
|
6410
|
assert(tp);
|
6411
|
|
6412
|
if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
|
6413
|
goto reply;
|
6414
|
}
|
6415
|
|
6416
|
int64_t size = 0;
|
6417
|
if (!cmd_getval(g_ceph_context, cmdmap, "size", size)) {
|
6418
|
ss << "unable to parse 'size' value '"
|
6419
|
<< cmd_vartype_stringify(cmdmap["size"]) << "'";
|
6420
|
err = -EINVAL;
|
6421
|
goto reply;
|
6422
|
}
|
6423
|
|
6424
|
const pool_stat_t& tier_stats =
|
6425
|
mon->pgmon()->pg_map.get_pg_pool_sum_stat(tierpool_id);
|
6426
|
if (tier_stats.stats.sum.num_objects != 0) {
|
6427
|
ss << "tier pool '" << tierpoolstr << "' is not empty";
|
6428
|
err = -ENOTEMPTY;
|
6429
|
goto reply;
|
6430
|
}
|
6431
|
string modestr = g_conf->osd_tier_default_cache_mode;
|
6432
|
pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
|
6433
|
if (mode < 0) {
|
6434
|
ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
|
6435
|
err = -EINVAL;
|
6436
|
goto reply;
|
6437
|
}
|
6438
|
HitSet::Params hsp;
|
6439
|
if (g_conf->osd_tier_default_cache_hit_set_type == "bloom") {
|
6440
|
BloomHitSet::Params *bsp = new BloomHitSet::Params;
|
6441
|
bsp->set_fpp(g_conf->osd_pool_default_hit_set_bloom_fpp);
|
6442
|
hsp = HitSet::Params(bsp);
|
6443
|
} else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_hash") {
|
6444
|
hsp = HitSet::Params(new ExplicitHashHitSet::Params);
|
6445
|
}
|
6446
|
else if (g_conf->osd_tier_default_cache_hit_set_type == "explicit_object") {
|
6447
|
hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
|
6448
|
} else {
|
6449
|
ss << "osd tier cache default hit set type '" <<
|
6450
|
g_conf->osd_tier_default_cache_hit_set_type << "' is not a known type";
|
6451
|
err = -EINVAL;
|
6452
|
goto reply;
|
6453
|
}
|
6454
|
|
6455
|
pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
|
6456
|
pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
|
6457
|
if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
|
6458
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
6459
|
return true;
|
6460
|
}
|
6461
|
np->tiers.insert(tierpool_id);
|
6462
|
np->read_tier = np->write_tier = tierpool_id;
|
6463
|
np->set_snap_epoch(pending_inc.epoch);
|
6464
|
ntp->tier_of = pool_id;
|
6465
|
ntp->cache_mode = mode;
|
6466
|
ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
|
6467
|
ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
|
6468
|
ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
|
6469
|
ntp->hit_set_params = hsp;
|
6470
|
ntp->target_max_bytes = size;
|
6471
|
ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
|
6472
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
|
6473
|
get_last_committed() + 1));
|
6474
|
return true;
|
6475
|
} else if (prefix == "osd pool set-quota") {
|
6476
|
string poolstr;
|
6477
|
cmd_getval(g_ceph_context, cmdmap, "pool", poolstr);
|
6478
|
int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
|
6479
|
if (pool_id < 0) {
|
6480
|
ss << "unrecognized pool '" << poolstr << "'";
|
6481
|
err = -ENOENT;
|
6482
|
goto reply;
|
6483
|
}
|
6484
|
|
6485
|
string field;
|
6486
|
cmd_getval(g_ceph_context, cmdmap, "field", field);
|
6487
|
if (field != "max_objects" && field != "max_bytes") {
|
6488
|
ss << "unrecognized field '" << field << "'; max_bytes of max_objects";
|
6489
|
err = -EINVAL;
|
6490
|
goto reply;
|
6491
|
}
|
6492
|
|
6493
|
|
6494
|
string val;
|
6495
|
cmd_getval(g_ceph_context, cmdmap, "val", val);
|
6496
|
stringstream tss;
|
6497
|
int64_t value = unit_to_bytesize(val, &tss);
|
6498
|
if (value < 0) {
|
6499
|
ss << "error parsing value '" << value << "': " << tss.str();
|
6500
|
err = value;
|
6501
|
goto reply;
|
6502
|
}
|
6503
|
|
6504
|
pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
|
6505
|
if (field == "max_objects") {
|
6506
|
pi->quota_max_objects = value;
|
6507
|
} else if (field == "max_bytes") {
|
6508
|
pi->quota_max_bytes = value;
|
6509
|
} else {
|
6510
|
assert(0 == "unrecognized option");
|
6511
|
}
|
6512
|
ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
|
6513
|
rs = ss.str();
|
6514
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
6515
|
get_last_committed() + 1));
|
6516
|
return true;
|
6517
|
|
6518
|
} else if (prefix == "osd reweight-by-utilization") {
|
6519
|
int64_t oload;
|
6520
|
cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
|
6521
|
string out_str;
|
6522
|
err = reweight_by_utilization(oload, out_str, false, NULL);
|
6523
|
if (err < 0) {
|
6524
|
ss << "FAILED reweight-by-utilization: " << out_str;
|
6525
|
} else if (err == 0) {
|
6526
|
ss << "no change: " << out_str;
|
6527
|
} else {
|
6528
|
ss << "SUCCESSFUL reweight-by-utilization: " << out_str;
|
6529
|
getline(ss, rs);
|
6530
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
6531
|
get_last_committed() + 1));
|
6532
|
return true;
|
6533
|
}
|
6534
|
} else if (prefix == "osd reweight-by-pg") {
|
6535
|
int64_t oload;
|
6536
|
cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
|
6537
|
set<int64_t> pools;
|
6538
|
vector<string> poolnamevec;
|
6539
|
cmd_getval(g_ceph_context, cmdmap, "pools", poolnamevec);
|
6540
|
for (unsigned j = 0; j < poolnamevec.size(); j++) {
|
6541
|
int64_t pool = osdmap.lookup_pg_pool_name(poolnamevec[j]);
|
6542
|
if (pool < 0) {
|
6543
|
ss << "pool '" << poolnamevec[j] << "' does not exist";
|
6544
|
err = -ENOENT;
|
6545
|
goto reply;
|
6546
|
}
|
6547
|
pools.insert(pool);
|
6548
|
}
|
6549
|
string out_str;
|
6550
|
err = reweight_by_utilization(oload, out_str, true,
|
6551
|
pools.empty() ? NULL : &pools);
|
6552
|
if (err < 0) {
|
6553
|
ss << "FAILED reweight-by-pg: " << out_str;
|
6554
|
} else if (err == 0) {
|
6555
|
ss << "no change: " << out_str;
|
6556
|
} else {
|
6557
|
ss << "SUCCESSFUL reweight-by-pg: " << out_str;
|
6558
|
getline(ss, rs);
|
6559
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
6560
|
get_last_committed() + 1));
|
6561
|
return true;
|
6562
|
}
|
6563
|
} else if (prefix == "osd thrash") {
|
6564
|
int64_t num_epochs;
|
6565
|
cmd_getval(g_ceph_context, cmdmap, "num_epochs", num_epochs, int64_t(0));
|
6566
|
|
6567
|
thrash_map = num_epochs;
|
6568
|
ss << "will thrash map for " << thrash_map << " epochs";
|
6569
|
ret = thrash();
|
6570
|
err = 0;
|
6571
|
} else {
|
6572
|
err = -EINVAL;
|
6573
|
}
|
6574
|
|
6575
|
reply:
|
6576
|
getline(ss, rs);
|
6577
|
if (err < 0 && rs.length() == 0)
|
6578
|
rs = cpp_strerror(err);
|
6579
|
mon->reply_command(m, err, rs, rdata, get_last_committed());
|
6580
|
return ret;
|
6581
|
|
6582
|
update:
|
6583
|
getline(ss, rs);
|
6584
|
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
|
6585
|
get_last_committed() + 1));
|
6586
|
return true;
|
6587
|
|
6588
|
wait:
|
6589
|
wait_for_finished_proposal(new C_RetryMessage(this, m));
|
6590
|
return true;
|
6591
|
}
|
6592
|
|
6593
|
// Read-side handling of a pool op: answer immediately from the committed
// osdmap when the request is invalid or already satisfied, and return false
// when prepare_pool_op() must stage an osdmap change.
bool OSDMonitor::preprocess_pool_op(MPoolOp *m)
{
  // Pool creation has a dedicated read-side check.
  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(m);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (!p) {
    dout(10) << "attempt to delete non-existent pool id " << m->pool << dendl;
    _pool_op_reply(m, 0, osdmap.get_epoch());
    return true;
  }

  // Whether a pool snapshot with this name already exists in the
  // committed map.
  const bool snap_exists = p->snap_exists(m->name.c_str());

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    // Pool snaps are mutually exclusive with self-managed snaps.
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
      return true;
    }
    // Creating a snap that exists, or deleting one that does not,
    // is an idempotent no-op.
    if (snap_exists == (m->op == POOL_OP_CREATE_SNAP)) {
      _pool_op_reply(m, 0, osdmap.get_epoch());
      return true;
    }
    return false;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
      return true;
    }
    // Snap already removed: nothing left to do.
    if (p->is_removed_snap(m->snapid)) {
      _pool_op_reply(m, 0, osdmap.get_epoch());
      return true;
    }
    return false;

  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with this name still
    // exists in the committed map; preserved exactly as the original logic
    // behaved -- confirm intent against the MPoolOp delete path.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(m, 0, osdmap.get_epoch());
      return true;
    }
    return false;

  case POOL_OP_AUID_CHANGE:
    // Always requires a map update.
    return false;

  default:
    assert(0);
    break;
  }

  return false;
}
|
6662
|
|
6663
|
// Read-side check for POOL_OP_CREATE: reject unauthenticated or
// under-privileged requests, and short-circuit when a pool with the
// requested name already exists.  Returns false when the write path
// (prepare_pool_op_create) must run.
bool OSDMonitor::preprocess_pool_op_create(MPoolOp *m)
{
  // The request must come from a session holding monitor write caps
  // on the "osd" service.
  MonSession *session = m->get_session();
  bool authorized = (session != NULL);
  if (authorized && !session->is_capable("osd", MON_CAP_W)) {
    dout(5) << "attempt to create new pool without sufficient auid privileges!"
	    << "message: " << *m << std::endl
	    << "caps: " << session->caps << dendl;
    authorized = false;
  }
  if (!authorized) {
    _pool_op_reply(m, -EPERM, osdmap.get_epoch());
    return true;
  }

  // Creating a pool whose name already exists is an idempotent no-op.
  if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
    _pool_op_reply(m, 0, osdmap.get_epoch());
    return true;
  }

  // Name is free: fall through to the write path.
  return false;
}
|
6686
|
|
6687
|
// Write-side handler for pool ops other than create/delete (those are
// delegated to their own prepare helpers).  Validates the op first against
// the committed map, then against the pending (uncommitted) pool state,
// applies the change to a copy of the pool, and stages it in pending_inc.
// Returns true when a proposal was staged (client reply deferred until the
// map commits), false when a reply has already been sent.
bool OSDMonitor::prepare_pool_op(MPoolOp *m)
{
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(m);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(m);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(m, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // Pass 1: validate against the *committed* pool state; ops that are
  // invalid or already satisfied get an immediate reply.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // Create-existing / delete-missing are idempotent successes;
      // otherwise break out to actually apply the change below.
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
	  || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // Pool snaps cannot be used on a self-managed-snaps pool.
      ret = -EINVAL;
    }
    _pool_op_reply(m, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // Deleting a self-managed snap only makes sense in unmanaged mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(m, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    // deliberate fall-through: share the pool-snaps-mode rejection below

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // Self-managed snap ops are rejected on a pool-snaps pool.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(m, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // Work on a copy of the pool: prefer any already-pending update so we
  // accumulate changes within one proposal, else start from the committed
  // state.
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // Pass 2: re-validate snap-mode against the pending copy, which may have
  // switched modes earlier in this same proposal.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Pass 3: apply the op to the copy, tracking whether anything changed.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now(g_ceph_context));
      dout(10) << "create snap in pool " << m->pool << " " << m->name << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // add_unmanaged_snap() fills in the newly allocated snap id, which is
      // returned to the client in the reply payload.
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);
      ::encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      pp.remove_unmanaged_snap(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    if (pp.auid != m->auid) {
      pp.auid = m->auid;
      changed = true;
    }
    break;

  default:
    assert(0);
    break;
  }

  // Stage the modified pool in the pending incremental.
  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // Reply (with ret and any payload) once the proposal commits.
  wait_for_finished_proposal(new OSDMonitor::C_PoolOp(this, m, ret, pending_inc.epoch, &reply_data));
  return true;
}
|
6823
|
|
6824
|
// Write-side handler for POOL_OP_CREATE: stage the new pool in pending_inc
// via prepare_new_pool(), then defer the client reply (carrying the result
// code) until the pending map commits.
bool OSDMonitor::prepare_pool_op_create(MPoolOp *m)
{
  const int r = prepare_new_pool(m);
  wait_for_finished_proposal(
    new OSDMonitor::C_PoolOp(this, m, r, pending_inc.epoch));
  return true;
}
|
6830
|
|
6831
|
int OSDMonitor::_check_remove_pool(int64_t pool, const pg_pool_t *p,
|
6832
|
ostream *ss)
|
6833
|
{
|
6834
|
const string& poolstr = osdmap.get_pool_name(pool);
|
6835
|
|
6836
|
|
6837
|
MDSMap const &pending_mdsmap = mon->mdsmon()->pending_mdsmap;
|
6838
|
if (pending_mdsmap.pool_in_use(pool)) {
|
6839
|
*ss << "pool '" << poolstr << "' is in use by CephFS";
|
6840
|
return -EBUSY;
|
6841
|
}
|
6842
|
|
6843
|
if (p->tier_of >= 0) {
|
6844
|
*ss << "pool '" << poolstr << "' is a tier of '"
|
6845
|
<< osdmap.get_pool_name(p->tier_of) << "'";
|
6846
|
return -EBUSY;
|
6847
|
}
|
6848
|
if (!p->tiers.empty()) {
|
6849
|
*ss << "pool '" << poolstr << "' has tiers";
|
6850
|
for(std::set<uint64_t>::iterator i = p->tiers.begin(); i != p->tiers.end(); ++i) {
|
6851
|
*ss << " " << osdmap.get_pool_name(*i);
|
6852
|
}
|
6853
|
return -EBUSY;
|
6854
|
}
|
6855
|
|
6856
|
if (!g_conf->mon_allow_pool_delete) {
|
6857
|
*ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
|
6858
|
return -EPERM;
|
6859
|
}
|
6860
|
|
6861
|
if (p->has_flag(pg_pool_t::FLAG_NODELETE)) {
|
6862
|
*ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
|
6863
|
return -EPERM;
|
6864
|
}
|
6865
|
|
6866
|
*ss << "pool '" << poolstr << "' removed";
|
6867
|
return 0;
|
6868
|
}
|
6869
|
|
6870
|
|
6871
|
|
6872
|
|
6873
|
|
6874
|
|
6875
|
|
6876
|
|
6877
|
bool OSDMonitor::_check_become_tier(
|
6878
|
const int64_t tier_pool_id, const pg_pool_t *tier_pool,
|
6879
|
const int64_t base_pool_id, const pg_pool_t *base_pool,
|
6880
|
int *err,
|
6881
|
ostream *ss) const
|
6882
|
{
|
6883
|
const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
|
6884
|
const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
|
6885
|
|
6886
|
const MDSMap &pending_mdsmap = mon->mdsmon()->pending_mdsmap;
|
6887
|
if (pending_mdsmap.pool_in_use(tier_pool_id)) {
|
6888
|
*ss << "pool '" << tier_pool_name << "' is in use by CephFS";
|
6889
|
*err = -EBUSY;
|
6890
|
return false;
|
6891
|
}
|
6892
|
|
6893
|
if (base_pool->tiers.count(tier_pool_id)) {
|
6894
|
assert(tier_pool->tier_of == base_pool_id);
|
6895
|
*err = 0;
|
6896
|
*ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
|
6897
|
<< base_pool_name << "'";
|
6898
|
return false;
|
6899
|
}
|
6900
|
|
6901
|
if (tier_pool->is_tier()) {
|
6902
|
*ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
|
6903
|
<< osdmap.get_pool_name(tier_pool->tier_of) << "'";
|
6904
|
*err = -EINVAL;
|
6905
|
return false;
|
6906
|
}
|
6907
|
|
6908
|
*err = 0;
|
6909
|
return true;
|
6910
|
}
|
6911
|
|
6912
|
|
6913
|
|
6914
|
|
6915
|
|
6916
|
|
6917
|
|
6918
|
|
6919
|
|
6920
|
bool OSDMonitor::_check_remove_tier(
|
6921
|
const int64_t base_pool_id, const pg_pool_t *base_pool,
|
6922
|
int *err, ostream *ss) const
|
6923
|
{
|
6924
|
const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
|
6925
|
|
6926
|
|
6927
|
|
6928
|
const MDSMap &pending_mdsmap = mon->mdsmon()->pending_mdsmap;
|
6929
|
if (pending_mdsmap.pool_in_use(base_pool_id)) {
|
6930
|
*ss << "pool '" << base_pool_name << "' is in use by CephFS via its tier";
|
6931
|
*err = -EBUSY;
|
6932
|
return false;
|
6933
|
}
|
6934
|
|
6935
|
*err = 0;
|
6936
|
return true;
|
6937
|
}
|
6938
|
|
6939
|
int OSDMonitor::_prepare_remove_pool(int64_t pool, ostream *ss)
|
6940
|
{
|
6941
|
dout(10) << "_prepare_remove_pool " << pool << dendl;
|
6942
|
const pg_pool_t *p = osdmap.get_pg_pool(pool);
|
6943
|
int r = _check_remove_pool(pool, p, ss);
|
6944
|
if (r < 0)
|
6945
|
return r;
|
6946
|
|
6947
|
if (pending_inc.new_pools.count(pool)) {
|
6948
|
|
6949
|
|
6950
|
pg_pool_t *p = &pending_inc.new_pools[pool];
|
6951
|
int r = _check_remove_pool(pool, p, ss);
|
6952
|
if (r < 0)
|
6953
|
return -EAGAIN;
|
6954
|
}
|
6955
|
|
6956
|
if (pending_inc.old_pools.count(pool)) {
|
6957
|
dout(10) << "_prepare_remove_pool " << pool << " already pending removal"
|
6958
|
<< dendl;
|
6959
|
return 0;
|
6960
|
}
|
6961
|
|
6962
|
|
6963
|
pending_inc.old_pools.insert(pool);
|
6964
|
|
6965
|
|
6966
|
for (map<pg_t,vector<int32_t> >::iterator p = osdmap.pg_temp->begin();
|
6967
|
p != osdmap.pg_temp->end();
|
6968
|
++p) {
|
6969
|
if (p->first.pool() == (uint64_t)pool) {
|
6970
|
dout(10) << "_prepare_remove_pool " << pool << " removing obsolete pg_temp "
|
6971
|
<< p->first << dendl;
|
6972
|
pending_inc.new_pg_temp[p->first].clear();
|
6973
|
}
|
6974
|
}
|
6975
|
for (map<pg_t,int32_t>::iterator p = osdmap.primary_temp->begin();
|
6976
|
p != osdmap.primary_temp->end();
|
6977
|
++p) {
|
6978
|
if (p->first.pool() == (uint64_t)pool) {
|
6979
|
dout(10) << "_prepare_remove_pool " << pool
|
6980
|
<< " removing obsolete primary_temp" << p->first << dendl;
|
6981
|
pending_inc.new_primary_temp[p->first] = -1;
|
6982
|
}
|
6983
|
}
|
6984
|
return 0;
|
6985
|
}
|
6986
|
|
6987
|
int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
|
6988
|
{
|
6989
|
dout(10) << "_prepare_rename_pool " << pool << dendl;
|
6990
|
if (pending_inc.old_pools.count(pool)) {
|
6991
|
dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
|
6992
|
return -ENOENT;
|
6993
|
}
|
6994
|
for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
|
6995
|
p != pending_inc.new_pool_names.end();
|
6996
|
++p) {
|
6997
|
if (p->second == newname && p->first != pool) {
|
6998
|
return -EEXIST;
|
6999
|
}
|
7000
|
}
|
7001
|
|
7002
|
pending_inc.new_pool_names[pool] = newname;
|
7003
|
return 0;
|
7004
|
}
|
7005
|
|
7006
|
// Write-side handler for POOL_OP_DELETE: stage the removal, then defer the
// client reply until the map change commits.  On -EAGAIN (pending state
// blocked the removal) the whole message is retried after the current
// proposal lands.
bool OSDMonitor::prepare_pool_op_delete(MPoolOp *m)
{
  ostringstream ss;
  const int r = _prepare_remove_pool(m->pool, &ss);

  if (r == -EAGAIN) {
    wait_for_finished_proposal(new C_RetryMessage(this, m));
    return true;
  }

  if (r < 0)
    dout(10) << __func__ << " got " << r << " " << ss.str() << dendl;
  wait_for_finished_proposal(
    new OSDMonitor::C_PoolOp(this, m, r, pending_inc.epoch));
  return true;
}
|
7020
|
|
7021
|
// Send an MPoolOpReply for request m, carrying result `ret`, the epoch the
// client should wait for, and an optional data payload *blp; then drop our
// reference on the request message.
void OSDMonitor::_pool_op_reply(MPoolOp *m, int ret, epoch_t epoch, bufferlist *blp)
{
  dout(20) << "_pool_op_reply " << ret << dendl;
  MPoolOpReply *ack = new MPoolOpReply(m->fsid, m->get_tid(), ret, epoch,
				       get_last_committed(), blp);
  mon->send_reply(m, ack);
  m->put();  // consume the request reference
}
|