Bug #18162 » ec-handle-error-in-backfill-read.patch
src/osd/ECBackend.cc | ||
---|---|---|
// Make sure we don't do redundant reads for recovery
|
||
assert(!for_recovery || !do_redundant_reads);
|
||
map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::const_iterator miter =
|
||
get_parent()->get_missing_loc_shards().find(hoid);
|
||
set<int> have;
|
||
map<shard_id_t, pg_shard_t> shards;
|
||
... | ... | |
}
|
||
}
|
||
if (miter != get_parent()->get_missing_loc_shards().end()) {
|
||
bool miter_first = true;
|
||
for (map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::const_iterator miter =
|
||
get_parent()->get_missing_loc_shards().find(hoid);
|
||
miter != get_parent()->get_missing_loc_shards().end();
|
||
miter++) {
|
||
if (miter_first) {
|
||
dout(20) << __func__ << hoid
|
||
<< " has missing_loc, resetting have" << dendl;
|
||
miter_first = false;
|
||
have.clear();
|
||
}
|
||
dout(20) << __func__ << hoid
|
||
<< " presumed available at " << miter->second
|
||
<< dendl;
|
||
for (set<pg_shard_t>::iterator i = miter->second.begin();
|
||
i != miter->second.end();
|
||
++i) {
|
src/osd/ReplicatedPG.cc | ||
---|---|---|
r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
|
||
if (r < 0) {
|
||
// try _snapset
|
||
if (!(oid.is_snapdir() && !oid_existed))
|
||
r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
|
||
if (!(oid.is_snapdir() && !oid_existed))
|
||
r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
|
||
if (r < 0 && !can_create)
|
||
return NULL;
|
||
}
|
||
... | ... | |
obc->drop_recovery_read(&blocked_ops);
|
||
requeue_ops(blocked_ops);
|
||
}
|
||
recovering.erase(soid);
|
||
for (list<pg_shard_t>::const_iterator i = from.begin(); i != from.end(); i++)
|
||
if (missing_loc.get_locations(soid).empty()) {
|
||
dout(20) << __func__ << ": " << soid
|
||
<< " had an empty location list, reconstructing" << dendl;
|
||
assert(!actingbackfill.empty());
|
||
for (set<pg_shard_t>::iterator i = actingbackfill.begin();
|
||
i != actingbackfill.end(); ++i) {
|
||
pg_shard_t peer = *i;
|
||
if (!peer_missing[peer].is_missing(soid)) {
|
||
missing_loc.add_location(soid, peer);
|
||
dout(20) << __func__ << ": " << soid
|
||
<< " assumed to be available in " << peer << dendl;
|
||
}
|
||
}
|
||
}
|
||
for (list<pg_shard_t>::const_iterator i = from.begin();
|
||
i != from.end(); i++) {
|
||
dout(20) << __func__ << ": " << soid
|
||
<< " marked as not available in " << *i
|
||
<< dendl;
|
||
missing_loc.remove_location(soid, *i);
|
||
}
|
||
/* If we were backfilling, we will retry the push from
|
||
recover_backfill, and its backfills_in_flight entry will only be
|
||
cleared when we succeed. */
|
||
recovering.erase(soid);
|
||
dout(0) << "_failed_push " << soid << " from shard " << from
|
||
<< ", reps on " << missing_loc.get_locations(soid)
|
||
<< " unfound? " << missing_loc.is_unfound(soid) << dendl;
|
||
... | ... | |
last_backfill_started = hobject_t();
|
||
set<hobject_t, hobject_t::Comparator>::iterator i = backfills_in_flight.begin();
|
||
while (i != backfills_in_flight.end()) {
|
||
assert(recovering.count(*i));
|
||
if(!recovering.count(*i))
|
||
dout(10) << __func__ << ": " << *i
|
||
<< " is still pending backfill retry" << dendl;
|
||
backfills_in_flight.erase(i++);
|
||
}
|
||
... | ... | |
// this shouldn't happen!
|
||
// We already checked num_missing() so we must have missing replicas
|
||
osd->clog->error() << info.pgid << " recovery ending with missing replicas\n";
|
||
set<pg_shard_t>::const_iterator end = actingbackfill.end();
|
||
set<pg_shard_t>::const_iterator a = actingbackfill.begin();
|
||
for (; a != end; ++a) {
|
||
if (*a == get_primary()) continue;
|
||
pg_shard_t peer = *a;
|
||
map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
|
||
if (pm == peer_missing.end()) {
|
||
continue;
|
||
}
|
||
if (pm->second.num_missing()) {
|
||
dout(10) << __func__ << " osd." << peer << " has "
|
||
<< pm->second.num_missing() << " missing: "
|
||
<< pm->second << dendl;
|
||
}
|
||
}
|
||
return work_in_progress;
|
||
}
|
||
... | ... | |
const hobject_t soid(p->second);
|
||
if (cmp(soid, pi->second.last_backfill, get_sort_bitwise()) > 0) {
|
||
if (!recovering.count(soid)) {
|
||
if (!recovering.count(soid) && !backfills_in_flight.count(soid)) {
|
||
derr << __func__ << ": object added to missing set for backfill, but "
|
||
<< "is not in recovering, error!" << dendl;
|
||
assert(0);
|
||
... | ... | |
<< dendl;
|
||
}
|
||
bool trim = true;
|
||
bool retrying = recovering.empty() && !backfills_in_flight.empty();
|
||
if (retrying) {
|
||
/* If we had any errors, arrange for us to retain the information
|
||
about the range before the first failed object, and to retry
|
||
it. */
|
||
map<hobject_t,eversion_t,hobject_t::Comparator>::iterator p;
|
||
p = backfill_info.objects.find(*backfills_in_flight.begin());
|
||
if (p-- == backfill_info.objects.end() ||
|
||
p == backfill_info.objects.end()) {
|
||
last_backfill_started = *backfills_in_flight.begin();
|
||
trim = false;
|
||
dout(20) << "backfill retry: adjusting last_backfill_started to "
|
||
<< last_backfill_started
|
||
<< " and disabling trimming" << dendl;
|
||
} else {
|
||
last_backfill_started = MIN_HOBJ(last_backfill_started, p->first,
|
||
get_sort_bitwise());
|
||
dout(20) << "backfill retry: adjusting last_backfill_started to "
|
||
<< last_backfill_started << dendl;
|
||
}
|
||
}
|
||
// update our local interval to cope with recent changes
|
||
backfill_info.begin = last_backfill_started;
|
||
update_range(&backfill_info, handle);
|
||
... | ... | |
vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
|
||
set<hobject_t, hobject_t::BitwiseComparator> add_to_stat;
|
||
for (set<pg_shard_t>::iterator i = backfill_targets.begin();
|
||
i != backfill_targets.end();
|
||
++i) {
|
||
peer_backfill_info[*i].trim_to(
|
||
MAX_HOBJ(peer_info[*i].last_backfill, last_backfill_started,
|
||
get_sort_bitwise()));
|
||
if (trim) {
|
||
dout(20) << "trimming backfill interval up to "
|
||
<< last_backfill_started << dendl;
|
||
for (set<pg_shard_t>::iterator i = backfill_targets.begin();
|
||
i != backfill_targets.end();
|
||
++i) {
|
||
peer_backfill_info[*i].trim_to(
|
||
MAX_HOBJ(peer_info[*i].last_backfill, last_backfill_started,
|
||
get_sort_bitwise()));
|
||
}
|
||
backfill_info.trim_to(last_backfill_started);
|
||
}
|
||
backfill_info.trim_to(last_backfill_started);
|
||
hobject_t backfill_pos = MIN_HOBJ(backfill_info.begin,
|
||
earliest_peer_backfill(),
|
||
get_sort_bitwise());
|
||
while (ops < max) {
|
||
if (cmp(backfill_info.begin, earliest_peer_backfill(),
|
||
get_sort_bitwise()) <= 0 &&
|
||
... | ... | |
backfill_info.end = hobject_t::get_max();
|
||
update_range(&backfill_info, handle);
|
||
backfill_info.trim();
|
||
dout(20) << "resetting and trimming backfill interval" << dendl;
|
||
}
|
||
backfill_pos = MIN_HOBJ(backfill_info.begin, earliest_peer_backfill(),
|
||
get_sort_bitwise());
|
||
dout(20) << " my backfill interval " << backfill_info << dendl;
|
||
... | ... | |
// Get object within set of peers to operate on and
|
||
// the set of targets for which that object applies.
|
||
hobject_t check = earliest_peer_backfill();
|
||
if (cmp(check, backfill_info.begin, get_sort_bitwise()) < 0) {
|
||
set<pg_shard_t> check_targets;
|
||
for (set<pg_shard_t>::iterator i = backfill_targets.begin();
|
||
i != backfill_targets.end();
|
||
... | ... | |
to_push.push_back(
|
||
boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<pg_shard_t> >
|
||
(backfill_info.begin, obj_v, obc, all_push));
|
||
if (retrying && backfills_in_flight.count(backfill_info.begin))
|
||
dout(20) << " BACKFILL retrying " << backfill_info.begin
|
||
<< " with locations "
|
||
<< missing_loc.get_locations(backfill_info.begin)
|
||
<< dendl;
|
||
// Count all simultaneous pushes of the same object as a single op
|
||
ops++;
|
||
} else {
|
||
... | ... | |
}
|
||
}
|
||
}
|
||
backfill_pos = MIN_HOBJ(backfill_info.begin, earliest_peer_backfill(),
|
||
get_sort_bitwise());
|
||
hobject_t backfill_pos = MIN_HOBJ(backfill_info.begin,
|
||
earliest_peer_backfill(),
|
||
get_sort_bitwise());
|
||
for (set<hobject_t, hobject_t::BitwiseComparator>::iterator i = add_to_stat.begin();
|
||
i != add_to_stat.end();
|
||
... | ... | |
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
|
||
for (unsigned i = 0; i < to_push.size(); ++i) {
|
||
handle.reset_tp_timeout();
|
||
if (backfills_in_flight.count(to_push[i].get<0>()))
|
||
dout(0) << "retrying backfill of " << to_push[i].get<0>()
|
||
<< " with locations "
|
||
<< missing_loc.get_locations(to_push[i].get<0>()) << dendl;
|
||
prep_backfill_object_push(to_push[i].get<0>(), to_push[i].get<1>(),
|
||
to_push[i].get<2>(), to_push[i].get<3>(), h);
|
||
}
|