//------------------------------------------------------------------------------
//! @file FsckEntry.hh
//! @author Elvin Sindrilaru - CERN
//------------------------------------------------------------------------------
/************************************************************************
* EOS - the CERN Disk Storage System *
* Copyright (C) 2019 CERN/Switzerland *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>.*
************************************************************************/
#include "mgm/fsck/FsckEntry.hh"
#include "mgm/fsck/Fsck.hh"
#include "mgm/XrdMgmOfs.hh"
#include "mgm/FsView.hh"
#include "mgm/Stat.hh"
#include "mgm/proc/proc_fs.hh"
#include "namespace/interface/IView.hh"
#include "namespace/interface/IFileMDSvc.hh"
#include "common/StringConversion.hh"
#include "common/LayoutId.hh"
#include "namespace/Prefetcher.hh"
#include "namespace/ns_quarkdb/persistency/MetadataFetcher.hh"
using eos::common::StringConversion;
using eos::common::LayoutId;
EOSMGMNAMESPACE_BEGIN
//----------------------------------------------------------------------------
//! Constructor
//!
//! Stores the file id, the file system ids reported in error and the client
//! handle, converts the textual error through ConvertToFsckErr, fills the
//! error-to-repair-method dispatch table and installs the default factory
//! used to create repair jobs.
//!
//! @param fid file identifier
//! @param fsid_err set of file system ids for which the error was reported
//! @param expected_err error given as string, converted to an FsckErr value
//! @param qcl client handle, used by CollectMgmInfo to fetch the file metadata
//----------------------------------------------------------------------------
FsckEntry::FsckEntry(eos::IFileMD::id_t fid,
const std::set& fsid_err,
const std::string& expected_err,
std::shared_ptr qcl):
mFid(fid), mFsidErr(fsid_err),
mReportedErr(eos::common::ConvertToFsckErr(expected_err)), mRepairFactory(),
mQcl(qcl)
{
using namespace eos::common;
// Dispatch table consulted by Repair(): several error types share the same
// repair method
mMapRepairOps = {
{FsckErr::MgmXsDiff, &FsckEntry::RepairMgmXsSzDiff},
{FsckErr::MgmSzDiff, &FsckEntry::RepairMgmXsSzDiff},
{FsckErr::FstXsDiff, &FsckEntry::RepairFstXsSzDiff},
{FsckErr::FstSzDiff, &FsckEntry::RepairFstXsSzDiff},
{FsckErr::BlockxsErr, &FsckEntry::RepairFstXsSzDiff},
{FsckErr::UnregRepl, &FsckEntry::RepairInconsistencies},
{FsckErr::DiffRepl, &FsckEntry::RepairInconsistencies},
{FsckErr::MissRepl, &FsckEntry::RepairInconsistencies},
{FsckErr::StripeErr, &FsckEntry::RepairInconsistencies}
};
// Default repair job factory: forwards all its arguments and additionally
// passes a fixed `false` and the root identity between app_tag and
// repair_excluded
mRepairFactory = [](eos::common::FileId::fileid_t fid,
eos::common::FileSystem::fsid_t fsid_src,
eos::common::FileSystem::fsid_t fsid_trg ,
std::set exclude_srcs,
std::set exclude_dsts,
bool drop_src, const std::string & app_tag,
bool repair_excluded) {
return std::make_shared(
fid, fsid_src, fsid_trg, exclude_srcs, exclude_dsts, drop_src, app_tag,
false, eos::common::VirtualIdentity::Root(), repair_excluded);
};
}
//------------------------------------------------------------------------------
// Destructor - removes the file id of this entry from gOFS->mFidTracker when
// the MGM OFS object is available
//------------------------------------------------------------------------------
FsckEntry::~FsckEntry()
{
  // Nothing to release when there is no MGM OFS object
  if (gOFS == nullptr) {
    return;
  }

  gOFS->mFidTracker.RemoveEntry(mFid);
}
//------------------------------------------------------------------------------
// Collect MGM file metadata information
//------------------------------------------------------------------------------
bool
FsckEntry::CollectMgmInfo()
{
if (mQcl == nullptr) {
return false;
}
try {
mMgmFmd = eos::MetadataFetcher::getFileFromId(*mQcl.get(),
FileIdentifier(mFid)).get();
} catch (const eos::MDException& e) {
return false;
}
if (mMgmFmd.cont_id()) {
// Double check that the parent exists, if not, this is a detached entry and
// we need to clean it up and mark the parentId with 0 otherwise the fsck
// mechanism gets confused.
try {
eos::common::RWMutexReadLock ns_rd_lock(gOFS->eosViewRWMutex);
(void) gOFS->eosDirectoryService->getContainerMD(mMgmFmd.cont_id());
} catch (const eos::MDException& e) {
mMgmFmd.set_cont_id(0ull);
}
}
return true;
}
//------------------------------------------------------------------------------
// Collect FST file metadata information for every location listed in the
// MGM file metadata
//------------------------------------------------------------------------------
void
FsckEntry::CollectAllFstInfo()
{
  const auto& locations = mMgmFmd.locations();

  for (auto it = locations.begin(); it != locations.end(); ++it) {
    CollectFstInfo(*it);
  }
}
//------------------------------------------------------------------------------
// Collect FST file metadata information
//
// Records one entry in mFstFileInfo for the given file system id, unless the
// id is 0 or an entry already exists. The entry is created with
// FstErr::NotExist when the file system has no "hostport" value or no path,
// FstErr::NoContact when the URL is invalid or the stat fails for a reason
// other than ENOENT, and FstErr::NotOnDisk when the stat reports ENOENT. When
// the stat succeeds the entry starts with FstErr::None, gets the size
// reported by the stat and is then handed to GetFstFmd, which may still
// overwrite the error code.
//
// @param fsid file system id to query
//------------------------------------------------------------------------------
void
FsckEntry::CollectFstInfo(eos::common::FileSystem::fsid_t fsid)
{
using eos::common::FileId;
// Skip the invalid id 0 and file systems that were already collected
if ((fsid == 0ull) || (mFstFileInfo.find(fsid) != mFstFileInfo.end())) {
return;
}
std::string host_port;
std::string fst_local_path;
{
// Look up the endpoint and local prefix while holding the view read lock;
// the lock is released at the end of this scope
eos::common::RWMutexReadLock fs_rd_lock(FsView::gFsView.ViewMutex);
FileSystem* fs = FsView::gFsView.mIdView.lookupByID(fsid);
if (fs) {
host_port = fs->GetString("hostport");
fst_local_path = fs->GetPath();
}
}
if (host_port.empty() || fst_local_path.empty()) {
eos_err("msg=\"missing or misconfigured file system\" fsid=%lu", fsid);
mFstFileInfo.emplace(fsid, std::make_unique("",
FstErr::NotExist));
return;
}
// Build the URL of the FST; only the host and port part is relevant, the
// path component is a placeholder
std::ostringstream oss;
oss << "root://" << host_port << "//dummy";
std::string surl = oss.str();
XrdCl::URL url(surl);
if (!url.IsValid()) {
eos_err("msg=\"invalid url\" url=\"%s\"", surl.c_str());
mFstFileInfo.emplace(fsid, std::make_unique("",
FstErr::NoContact));
return;
}
// Full local path of the file derived from the hex file id and the local
// prefix of the file system
std::string fpath_local = FileId::FidPrefix2FullPath
(FileId::Fid2Hex(mFid).c_str(), fst_local_path.c_str());
// Check that the file exists on disk
XrdCl::StatInfo* stat_info_raw {nullptr};
std::unique_ptr stat_info;
uint16_t timeout = 10;
XrdCl::FileSystem fs(url);
XrdCl::XRootDStatus status = fs.Stat(fpath_local.c_str(), stat_info_raw,
timeout);
// Take ownership of whatever the stat call returned
stat_info.reset(stat_info_raw);
if (!status.IsOK()) {
eos_err("msg=\"failed stat\" fxid=%08llx fsid=%lu local_path=%s "
"xrd_code=%u xrd_errno=%u", mFid, fsid, fpath_local.c_str(),
status.code, status.errNo);
// An expired operation and any error other than ENOENT are recorded as
// NoContact, ENOENT is recorded as NotOnDisk
if (status.code == XrdCl::errOperationExpired) {
mFstFileInfo.emplace(fsid, std::make_unique("",
FstErr::NoContact));
} else {
if (XProtocol::toErrno(status.errNo) == ENOENT) {
mFstFileInfo.emplace(fsid, std::make_unique("",
FstErr::NotOnDisk));
} else {
mFstFileInfo.emplace(fsid, std::make_unique("",
FstErr::NoContact));
}
}
return;
}
// Collect file metadata stored on the FST about the current file
auto ret_pair = mFstFileInfo.emplace(fsid, std::make_unique
(fpath_local.c_str(), FstErr::None));
auto& finfo = ret_pair.first->second;
finfo->mDiskSize = stat_info->GetSize();
// The return value is ignored on purpose, failures are recorded by
// GetFstFmd in finfo->mFstErr
(void) GetFstFmd(finfo, fs, fsid);
}
//------------------------------------------------------------------------------
// Method to repair an mgm checksum difference error
//
// Only acts on non-RAIN layouts; for RAIN layouts it returns true right away.
// The collected replica information is scanned once to establish whether
//  - at least one replica matches the MGM checksum and size (fmd size and
//    disk size), and
//  - all replicas agree among themselves on checksum and size.
// If a replica matches the MGM values, the replicas whose disk size differs
// from the MGM size are dropped and, when there are fewer good replicas than
// the nominal number, a repair job is run for each of them. Otherwise, if all
// replicas agree and the size is not zero, the MGM metadata is updated with
// the replica checksum and size.
//
// @return true if the layout is RAIN or the repair succeeded, otherwise false
//         (see also the review note on the final return statement)
//------------------------------------------------------------------------------
bool
FsckEntry::RepairMgmXsSzDiff()
{
// This only makes sense for replica layouts
if (LayoutId::IsRain(mMgmFmd.layout_id())) {
return true;
}
// MGM checksum converted through BinData2HexString so that it can be
// compared with the values reported by the FSTs
std::string mgm_xs_val =
StringConversion::BinData2HexString(mMgmFmd.checksum().c_str(),
SHA256_DIGEST_LENGTH,
LayoutId::GetChecksumLen(mMgmFmd.layout_id()));
// Make sure the disk xs and size values match between all the replicas
uint64_t sz_val {0ull};
std::string xs_val;
bool mgm_xs_sz_match = false; // one of the disk xs matches the mgm one
bool disk_xs_sz_match = true; // flag to mark that all disk xs match
for (auto it = mFstFileInfo.cbegin(); it != mFstFileInfo.cend(); ++it) {
auto& finfo = it->second;
// A replica without usable information rules out the "all replicas agree"
// case
if (finfo->mFstErr != FstErr::None) {
eos_err("msg=\"unavailable replica info\" fxid=%08llx fsid=%lu",
mFid, it->first);
disk_xs_sz_match = false;
continue;
}
// No reference values recorded yet (both are still empty/zero): take them
// from this replica
if (xs_val.empty() && (sz_val == 0ull)) {
xs_val = finfo->mFstFmd.mProtoFmd.diskchecksum();
sz_val = finfo->mFstFmd.mProtoFmd.size();
if ((mgm_xs_val == xs_val) &&
(mMgmFmd.size() == sz_val) &&
(mMgmFmd.size() == finfo->mDiskSize)) {
mgm_xs_sz_match = true;
continue;
}
} else {
// Compare this replica first against the MGM values and then against the
// reference replica
uint64_t current_sz_val = finfo->mFstFmd.mProtoFmd.size();
std::string current_xs_val = finfo->mFstFmd.mProtoFmd.diskchecksum();
if ((mgm_xs_val == current_xs_val) &&
(mMgmFmd.size() == current_sz_val) &&
(mMgmFmd.size() == finfo->mDiskSize)) {
mgm_xs_sz_match = true;
continue;
}
if ((xs_val != current_xs_val) ||
(sz_val != current_sz_val) ||
(sz_val != finfo->mDiskSize)) {
// There is a xs/size diff between two replicas, we can not fix
disk_xs_sz_match = false;
continue;
}
}
}
// Case 1: the MGM values are confirmed by at least one replica, so the MGM
// metadata is left untouched and the replicas are sorted out instead
if (mgm_xs_sz_match) {
std::set good_fsids;
std::set bad_fsids;
for (auto it = mFstFileInfo.cbegin(); it != mFstFileInfo.cend(); ++it) {
auto& finfo = it->second;
if ((mMgmFmd.size() != finfo->mFstFmd.mProtoFmd.size()) ||
(mMgmFmd.size() != finfo->mDiskSize) ||
(mgm_xs_val != finfo->mFstFmd.mProtoFmd.diskchecksum())) {
// Only a replica whose size on disk differs from the MGM size is
// considered bad
if ((mMgmFmd.size() != finfo->mDiskSize)) {
bad_fsids.insert(it->first);
} else {
// Trigger a resync of the FST info as it looks to be out of sync
ResyncFstMd(false);
return false;
}
} else {
good_fsids.insert(it->first);
}
}
if (good_fsids.empty()) {
eos_err("msg=\"mgm xs/size repair failed, no correct replicas\" "
"fxid=%08llx", mFid);
return false;
}
for (const auto bad_fsid : bad_fsids) {
DropReplica(bad_fsid);
}
bool all_repaired = true;
// Attempt repair only if we don't have enough good replicas
size_t num_nominal_rep = LayoutId::GetStripeNumber(mMgmFmd.layout_id()) + 1;
if (good_fsids.size() < num_nominal_rep) {
for (auto bad_fsid : bad_fsids) {
// Trigger an fsck repair job (much like a drain job) doing a TPC
auto repair_job = mRepairFactory(mFid, bad_fsid, 0, bad_fsids,
bad_fsids, true, "fsck", false);
repair_job->DoIt();
if (repair_job->GetStatus() != FsckRepairJob::Status::OK) {
eos_err("msg=\"mgm xs/size repair failed\" fxid=%08llx bad_fsid=%lu",
mFid, bad_fsid);
all_repaired = false;
} else {
eos_info("msg=\"mgm xs/size repair replica successful\" "
"fxid=%08llx bad_fsid=%lu", mFid, bad_fsid);
}
}
}
if (all_repaired) {
eos_info("msg=\"mgm xs/size repair successful\" fxid=%08llx", mFid);
} else {
eos_warning("msg=\"mgm xs/size repair failed\" fxid=%08llx", mFid);
}
return all_repaired;
}
// Case 2: no replica confirms the MGM values but all replicas agree among
// themselves, so the MGM metadata is updated with the replica values
if (disk_xs_sz_match && sz_val) {
size_t out_sz;
auto xs_binary = StringConversion::Hex2BinDataChar(xs_val, out_sz,
SHA256_DIGEST_LENGTH);
if (xs_binary == nullptr) {
eos_err("msg=\"mgm xs/size repair failed due to disk checksum conversion "
"error\" fxid=%08llx disk_xs=\"%s\"", mFid, xs_val.c_str());
return false;
}
eos::Buffer xs_buff;
xs_buff.putData(xs_binary.get(), SHA256_DIGEST_LENGTH);
if (gOFS) {
try {
eos::Prefetcher::prefetchFileMDAndWait(gOFS->eosView, mFid);
// Grab the file metadata object and update it
eos::common::RWMutexReadLock ns_rd_lock(gOFS->eosViewRWMutex);
auto fmd = gOFS->eosFileService->getFileMD(mFid);
fmd->setChecksum(xs_buff);
fmd->setSize(sz_val);
gOFS->eosView->updateFileStore(fmd.get());
// Update also the MGM fmd object
mMgmFmd.set_checksum(xs_buff.getDataPtr(), xs_buff.getSize());
mMgmFmd.set_size(sz_val);
} catch (const eos::MDException& e) {
eos_err("msg=\"mgm xs/size repair failed - no such filemd\" fxid=%08llx", mFid);
return false;
}
} else {
// For testing we just update the MGM fmd object
mMgmFmd.set_checksum(xs_buff.getDataPtr(), xs_buff.getSize());
mMgmFmd.set_size(sz_val);
}
eos_info("msg=\"mgm xs/size repair successful\" fxid=%08llx old_mgm_xs=\"%s\" "
"new_mgm_xs=\"%s\"", mFid, mgm_xs_val.c_str(), xs_val.c_str());
} else {
eos_err("msg=\"mgm xs/size repair failed - not all disk xs/size match\" "
"fxid=%08llx", mFid);
}
// NOTE(review): when all replicas agree but sz_val is 0 the error above is
// logged and true is still returned - confirm this is intended
return disk_xs_sz_match;
}
//----------------------------------------------------------------------------
// Method to repair an FST checksum and/or size difference error
//
// For RAIN layouts the first file system id of mFsidErr is treated as the
// bad stripe. For replica layouts every collected replica is classified as
// good (disk checksum/size and fmd checksum/size all equal to the MGM values)
// or bad. If enough good replicas exist the bad ones are simply dropped,
// otherwise a repair job is run for each bad replica (only the first one for
// RAIN layouts). A resync of the FST metadata is triggered before returning
// from the repair path.
//
// @return true if there was nothing to repair or all repair jobs succeeded,
//         otherwise false
//----------------------------------------------------------------------------
bool
FsckEntry::RepairFstXsSzDiff()
{
std::set bad_fsids;
std::set good_fsids;
if (LayoutId::IsRain(mMgmFmd.layout_id())) {
// NOTE(review): mFsidErr is dereferenced without an emptiness check -
// presumably callers always provide at least one id, confirm
bad_fsids.insert(*mFsidErr.begin());
} else { // for replica layouts
std::string mgm_xs_val =
StringConversion::BinData2HexString(mMgmFmd.checksum().c_str(),
SHA256_DIGEST_LENGTH,
LayoutId::GetChecksumLen(mMgmFmd.layout_id()));
// Make sure at least one disk xs and size match the MGM ones
uint64_t sz_val {0ull};
std::string xs_val;
for (auto it = mFstFileInfo.cbegin(); it != mFstFileInfo.cend(); ++it) {
auto& finfo = it->second;
// Replicas for which no information could be collected count as bad
if (finfo->mFstErr != FstErr::None) {
eos_err("msg=\"unavailable replica info\" fxid=%08llx fsid=%lu",
mFid, it->first);
bad_fsids.insert(it->first);
continue;
}
xs_val = finfo->mFstFmd.mProtoFmd.diskchecksum();
sz_val = finfo->mFstFmd.mProtoFmd.disksize();
eos_static_debug("mgm_sz=%llu mgm_xs=%s fst_sz_sz=%llu fst_sz_disk=%llu, "
"fst_xs=%s", mMgmFmd.size(), mgm_xs_val.c_str(),
finfo->mFstFmd.mProtoFmd.size(),
finfo->mFstFmd.mProtoFmd.disksize(),
finfo->mFstFmd.mProtoFmd.checksum().c_str());
// The disksize/xs must also match the original reference size/xs
if ((mgm_xs_val == xs_val) && (mMgmFmd.size() == sz_val) &&
(finfo->mFstFmd.mProtoFmd.size() == sz_val) &&
(finfo->mFstFmd.mProtoFmd.checksum() == xs_val)) {
good_fsids.insert(finfo->mFstFmd.mProtoFmd.fsid());
} else {
// It could be that the diskchecksum for the replica was not yet
// computed - this does not mean the replica is bad
if (!finfo->mFstFmd.mProtoFmd.diskchecksum().empty()) {
std::string hex_xs_val = StringConversion::BinData2HexString(
finfo->mFstFmd.mProtoFmd.diskchecksum().c_str(),
SHA256_DIGEST_LENGTH,
LayoutId::GetChecksumLen(finfo->mFstFmd.mProtoFmd.lid()));
// The replica is marked bad only if the conversion yields a value
if (!hex_xs_val.empty()) {
bad_fsids.insert(finfo->mFstFmd.mProtoFmd.fsid());
}
}
}
}
if (bad_fsids.empty()) {
eos_warning("msg=\"fst xs/size repair skip - no bad replicas\" fxid=%08llx",
mFid);
return true;
}
if (good_fsids.empty()) {
eos_err("msg=\"fst xs/size repair failed - no good replicas\" fxid=%08llx",
mFid);
return false;
}
}
// Have more good stripes then layout requirements
size_t num_nominal_rep = LayoutId::GetStripeNumber(mMgmFmd.layout_id()) + 1;
if (good_fsids.size() >= num_nominal_rep) {
// NOTE(review): good_fsids is only filled in the replica branch above, so
// for RAIN layouts this check appears to be unreachable - confirm
if (LayoutId::IsRain(mMgmFmd.layout_id()) &&
(good_fsids.size() > num_nominal_rep)) {
eos_crit("msg=\"more stripes than RAIN layout\" fxid=%08llx", mFid);
return false;
}
// Surplus good replicas are moved to the bad set so that they are dropped
// together with the bad ones
while (good_fsids.size() > num_nominal_rep) {
bad_fsids.insert(*good_fsids.begin());
good_fsids.erase(good_fsids.begin());
}
for (auto bad_fsid : bad_fsids) {
// If we have enough stripes - just drop it
DropReplica(bad_fsid);
}
// Nothing left to repair, the loop below becomes a no-op
bad_fsids.clear();
}
bool all_repaired {true};
for (auto bad_fsid : bad_fsids) {
// Trigger an fsck repair job (much like a drain job) doing a TPC
auto repair_job = mRepairFactory(mFid, bad_fsid, 0, bad_fsids,
bad_fsids, true, "fsck", false);
repair_job->DoIt();
if (repair_job->GetStatus() != FsckRepairJob::Status::OK) {
eos_err("msg=\"fst xs/size repair failed\" fxid=%08llx bad_fsid=%lu",
mFid, bad_fsid);
all_repaired = false;
} else {
eos_info("msg=\"fst xs/size repair successful\" fxid=%08llx bad_fsid=%lu",
mFid, bad_fsid);
}
// For RAIN layouts only one repair job is run
if (LayoutId::IsRain(mMgmFmd.layout_id())) {
break;
}
}
// Trigger an MGM resync on all the replicas so that the locations get
// updated properly
ResyncFstMd(true);
return all_repaired;
}
//------------------------------------------------------------------------------
// Method to repair file inconsistencies - dispatches to the RAIN or to the
// replica specific implementation depending on the layout of the file
//------------------------------------------------------------------------------
bool
FsckEntry::RepairInconsistencies()
{
  return LayoutId::IsRain(mMgmFmd.layout_id())
         ? RepairRainInconsistencies()
         : RepairReplicaInconsistencies();
}
//------------------------------------------------------------------------------
// Method to repair RAIN file inconsistencies
//------------------------------------------------------------------------------
bool
FsckEntry::RepairRainInconsistencies()
{
using namespace eos::common;
if (mReportedErr == FsckErr::UnregRepl) {
if (static_cast(mMgmFmd.locations_size()) >=
LayoutId::GetStripeNumber(mMgmFmd.layout_id()) + 1) {
// If we have enough stripes and current error refers to a stripe which
// is not in the list of locations then drop it
bool found = false;
for (const auto loc : mMgmFmd.locations()) {
if (*mFsidErr.begin() == loc) {
found = true;
break;
}
}
if (!found) {
DropReplica(*mFsidErr.begin());
}
return true;
} else {
// If not enough stripes then register it and trigger a check
if (gOFS) {
try {
// Grab the file metadata object and update it
eos::Prefetcher::prefetchFileMDAndWait(gOFS->eosView, mFid);
eos::common::RWMutexReadLock ns_rd_lock(gOFS->eosViewRWMutex);
auto fmd = gOFS->eosFileService->getFileMD(mFid);
fmd->addLocation(*mFsidErr.begin());
gOFS->eosView->updateFileStore(fmd.get());
} catch (const eos::MDException& e) {
eos_err("msg=\"unregistered stripe repair failed - no such filemd\" "
"fxid=%08llx", mFid);
return false;
}
} else {
// For testing just update the MGM fmd object
mMgmFmd.mutable_locations()->Add(*mFsidErr.begin());
}
}
}
if (mMgmFmd.locations().empty()) {
eos_err("msg=\"failed repair, no location available\" fxid=%08llx", mFid);
return false;
}
// Trigger a fsck repair job to make sure all the remaining stripes are
// recovered and new ones are created if need be. By default pick the
// first stripe as "source" unless we have a better candidate
bool drop_src_fsid = false;
bool repair_excluded = false;
eos::common::FileSystem::fsid_t src_fsid = mMgmFmd.locations(0);
std::set bad_fsids;
if (mReportedErr == FsckErr::MissRepl) {
src_fsid = *mFsidErr.begin();
drop_src_fsid = true;
} else if (mReportedErr == FsckErr::DiffRepl) {
// For rep_diff_n errors the source file systems is not to be dropped
// or skipped during the scheduling process as it's a valid stripe
// useful when doing the transfer.
src_fsid = 0;
// Over-replication should never happend for RAIN files
if (static_cast(mMgmFmd.locations_size()) >
LayoutId::GetStripeNumber(mMgmFmd.layout_id()) + 1) {
eos_err("msg=\"RAIN file over-replicated, to be handled manually\" "
"fxid=%08llu fsid_err=%lu", mFid, *mFsidErr.begin());
return false;
} else if (static_cast(mMgmFmd.locations_size()) ==
LayoutId::GetStripeNumber(mMgmFmd.layout_id()) + 1) {
eos_info("msg=\"stripe inconsistency repair successful\" fxid=%08llx "
"src_fsid=%lu", mFid, src_fsid);
return true;
}
} else if (mReportedErr == FsckErr::StripeErr) {
// File has too many corrupted stripes, we can't recover
if (mFsidErr.find(0) != mFsidErr.end()) {
eos_err("msg=\"RAIN file has too many corrupted stripes, unable to "
"reconstruct\" fxid=%08llu", mFid);
return false;
}
bad_fsids = mFsidErr;
// If there is over replication, drop replicas until we have the right
// number of stripes
while ((mMgmFmd.locations_size() >
LayoutId::GetStripeNumber(mMgmFmd.layout_id()) + 1) &&
!bad_fsids.empty()) {
const FileSystem::fsid_t drop_fsid = *bad_fsids.begin();
bad_fsids.erase(drop_fsid);
eos_info("msg=\"drop over-replicated stripe\" fxid=%08llx dfsid=%lu",
mFid, drop_fsid);
(void)DropReplica(drop_fsid);
mFstFileInfo.erase(drop_fsid);
auto* mutable_loc = mMgmFmd.mutable_locations();
for (auto it = mutable_loc->begin(); it != mutable_loc->end(); ++it) {
if (*it == drop_fsid) {
mutable_loc->erase(it);
break;
}
}
}
if (bad_fsids.empty()) {
eos_info("msg=\"stripe inconsistency repair successful\" fxid=%08llx",
mFid);
return true;
}
src_fsid = *bad_fsids.begin();
repair_excluded = true;
}
auto repair_job = mRepairFactory(mFid, src_fsid, 0, bad_fsids, bad_fsids,
drop_src_fsid, "fsck", repair_excluded);
repair_job->DoIt();
if (repair_job->GetStatus() != FsckRepairJob::Status::OK) {
eos_err("msg=\"stripe inconsistency repair failed\" fxid=%08llx "
"src_fsid=%lu", mFid, src_fsid);
return false;
} else {
eos_info("msg=\"stripe inconsistency repair successful\" fxid=%08llx "
"src_fsid=%lu", mFid, src_fsid);
return true;
}
}
//------------------------------------------------------------------------------
// Method to repair replica file inconsistencies
//
// Steps performed, in this order:
//  1. classify the registered locations that have no collected info or are
//     not on disk as missing;
//  2. classify the collected replicas as "to drop" or "unregistered";
//  3. remove the missing locations from the MGM metadata;
//  4. drop the inconsistent replicas;
//  5. if the file has no parent container, delete it altogether and return;
//  6. depending on the number of remaining locations either drop surplus
//     replicas or attach unregistered ones and create new replicas through
//     repair jobs;
//  7. drop whatever is left in the drop list and trigger a metadata resync.
//
// @return true if the replicas were made consistent or the detached file was
//         deleted, false if a namespace update or a repair job failed or the
//         file is still under-replicated
//------------------------------------------------------------------------------
bool
FsckEntry::RepairReplicaInconsistencies()
{
std::string mgm_xs_val =
StringConversion::BinData2HexString(mMgmFmd.checksum().c_str(),
SHA256_DIGEST_LENGTH,
LayoutId::GetChecksumLen(mMgmFmd.layout_id()));
std::set to_drop;
std::set unreg_fsids;
std::set repmiss_fsids;
// Account for missing replicas from MGM's perspective
for (const auto& fsid : mMgmFmd.locations()) {
eos_info("fxid=%08llx fsid=%lu", mFid, fsid);
auto it = mFstFileInfo.find(fsid);
if ((it == mFstFileInfo.end()) ||
(it->second->mFstErr == FstErr::NotOnDisk)) {
eos_info("msg=\"mark as missing\" fxid=%08llx fsid=%lu", mFid, fsid);
repmiss_fsids.insert(fsid);
}
}
// Account for unregistered replicas and other replicas to be dropped
for (const auto& elem : mFstFileInfo) {
// Check whether this replica is a registered location of the file
bool found = false;
for (const auto& loc : mMgmFmd.locations()) {
if (elem.first == loc) {
found = true;
break;
}
}
auto& finfo = elem.second;
if (found) {
// Registered replicas which are not on disk or whose file system does
// not exist are dropped
if ((finfo->mFstErr == FstErr::NotOnDisk) ||
(finfo->mFstErr == FstErr::NotExist)) {
to_drop.insert(elem.first);
}
} else {
// The file system id does not exist
if (finfo->mFstErr == FstErr::NotExist) {
to_drop.insert(elem.first);
} else {
// Make sure the FST size/xs match the MGM ones
if ((finfo->mFstFmd.mProtoFmd.disksize() != mMgmFmd.size()) ||
(finfo->mFstFmd.mProtoFmd.diskchecksum() != mgm_xs_val)) {
to_drop.insert(elem.first);
} else {
unreg_fsids.insert(elem.first);
}
}
}
}
// First drop any missing replicas from the MGM
for (const auto& drop_fsid : repmiss_fsids) {
// Update the local MGM fmd object
auto mutable_loc = mMgmFmd.mutable_locations();
for (auto it = mutable_loc->begin(); it != mutable_loc->end(); ++it) {
if (*it == drop_fsid) {
mutable_loc->erase(it);
break;
}
}
if (gOFS) {
try { // Update the MGM file md object
eos::Prefetcher::prefetchFileMDWithParentsAndWait(gOFS->eosView, mFid);
eos::common::RWMutexReadLock ns_rd_lock(gOFS->eosViewRWMutex);
auto fmd = gOFS->eosFileService->getFileMD(mFid);
fmd->unlinkLocation(drop_fsid);
fmd->removeLocation(drop_fsid);
gOFS->eosView->updateFileStore(fmd.get());
eos_info("msg=\"remove missing replica\" fxid=%08llx drop_fsid=%lu",
mFid, drop_fsid);
} catch (const eos::MDException& e) {
eos_err("msg=\"replica inconsistency repair failed, no file metadata\" "
"fxid=%08llx", mFid);
return false;
}
}
}
// Then drop any other inconsistent replicas from both the MGM and the FST
for (auto fsid : to_drop) {
(void) DropReplica(fsid);
// Drop also from the local map of FST fmd info
mFstFileInfo.erase(fsid);
auto mutable_loc = mMgmFmd.mutable_locations();
for (auto it = mutable_loc->begin(); it != mutable_loc->end(); ++it) {
if (*it == fsid) {
mutable_loc->erase(it);
break;
}
}
}
to_drop.clear();
// A container id of 0 marks a detached file (see CollectMgmInfo), which is
// deleted with all its replicas instead of being repaired
bool to_delete = (mMgmFmd.cont_id() == 0ull);
if (to_delete) {
XrdOucErrInfo err;
eos::common::VirtualIdentity vid = eos::common::VirtualIdentity::Root();
XrdOucEnv env(SSTR("mgm.fid=" << eos::common::FileId::Fid2Hex(mFid)
<< "&mgm.fsid=" << 0
<< "&mgm.dropall=1").c_str());
// NOTE(review): gOFS is dereferenced here without the null check used
// elsewhere in this method - confirm this path can not run without gOFS
gOFS->Drop("", nullptr, env, err, vid, nullptr);
eos_info("msg=\"deleted detached file md\" fxid=%08llx", mFid);
return true;
}
// Decide if we need to attach or discard any replicas
uint32_t num_expected_rep = LayoutId::GetStripeNumber(mMgmFmd.layout_id()) + 1;
uint32_t num_actual_rep = mMgmFmd.locations().size();
// This branch also covers the case of exactly the expected number of
// replicas, for which over_replicated is 0 and only the unregistered
// replicas are dropped
if (num_actual_rep >= num_expected_rep) { // over-replicated
int over_replicated = num_actual_rep - num_expected_rep;
// All the unregistered replicas can be dropped
to_drop.insert(unreg_fsids.begin(), unreg_fsids.end());
// Surplus registered replicas are taken from the front of the locations
while ((over_replicated > 0) && !mMgmFmd.locations().empty()) {
to_drop.insert(mMgmFmd.locations(0));
mMgmFmd.mutable_locations()->erase(mMgmFmd.locations().begin());
--over_replicated;
}
} else {
// This condition always holds in the else branch of the check above
if (num_actual_rep < num_expected_rep) { // under-replicated
// While under-replicated and we still have unregistered replicas then
// attach them
while ((num_actual_rep < num_expected_rep) && !unreg_fsids.empty()) {
eos::common::FileSystem::fsid_t new_fsid = *unreg_fsids.begin();
unreg_fsids.erase(unreg_fsids.begin());
mMgmFmd.add_locations(new_fsid);
if (gOFS) {
try {
eos::Prefetcher::prefetchFileMDWithParentsAndWait(gOFS->eosView, mFid);
eos::common::RWMutexReadLock ns_rd_lock(gOFS->eosViewRWMutex);
auto fmd = gOFS->eosFileService->getFileMD(mFid);
fmd->addLocation(new_fsid);
gOFS->eosView->updateFileStore(fmd.get());
eos_info("msg=\"attached unregistered replica\" fxid=%08llx "
"new_fsid=%lu", mFid, new_fsid);
} catch (const eos::MDException& e) {
eos_err("msg=\"unregistered replica repair failed, no file metadata\" "
"fxid=%08llx", mFid);
return false;
}
}
++num_actual_rep;
}
// Drop any remaining unregistered replicas
to_drop.insert(unreg_fsids.begin(), unreg_fsids.end());
// If still under-replicated then start creating new replicas
while ((num_actual_rep < num_expected_rep) && mMgmFmd.locations_size()) {
// Trigger a fsck repair job but without dropping the source, this is
// similar to adjust replica
eos::common::FileSystem::fsid_t good_fsid = mMgmFmd.locations(0);
auto repair_job = mRepairFactory(mFid, good_fsid, 0, {}, to_drop,
false, "fsck", false);
repair_job->DoIt();
if (repair_job->GetStatus() != FsckRepairJob::Status::OK) {
eos_err("msg=\"replica inconsistency repair failed\" fxid=%08llx "
"src_fsid=%lu", mFid, good_fsid);
return false;
} else {
eos_info("msg=\"replica inconsistency repair successful\" fxid=%08llx "
"src_fsid=%lu", mFid, good_fsid);
}
++num_actual_rep;
}
// Still under-replicated, which happens when there is no location left
// to be used as source
if (num_actual_rep < num_expected_rep) {
eos_err("msg=\"replica inconsistency repair failed\" fxid=%08llx", mFid);
return false;
}
}
}
// Discard unregistered/bad replicas
for (auto fsid : to_drop) {
eos_info("msg=\"droping replica\" fxid=%08llx fsid=%lu", mFid, fsid);
(void) DropReplica(fsid);
// Drop also from the local map of FST fmd info
mFstFileInfo.erase(fsid);
}
// Refresh the MGM metadata and ask all the locations to resync
ResyncFstMd(true);
eos_info("msg=\"file replicas consistent\" fxid=%08llx", mFid);
return true;
}
//----------------------------------------------------------------------------
// Ask every location of the file to resync its metadata.
//
// @param refresh_mgm_md if true, the MGM metadata is collected again before
//        the locations are walked; this is needed whenever an FsckRepairJob
//        was run beforehand
//----------------------------------------------------------------------------
void
FsckEntry::ResyncFstMd(bool refresh_mgm_md)
{
  if (refresh_mgm_md) {
    CollectMgmInfo();
  }

  // Without an MGM OFS object there is nobody to send the resync query to
  if (gOFS == nullptr) {
    return;
  }

  const auto& locations = mMgmFmd.locations();

  for (auto it = locations.begin(); it != locations.end(); ++it) {
    (void) gOFS->QueryResync(mFid, *it);
  }
}
//------------------------------------------------------------------------------
// Drop replica from the FST and also update the namespace view for the given
// file system id
//
// @param fsid file system id of the replica, 0 is accepted and ignored
//
// @return false if the deletion request to the FST failed, otherwise true; a
//         failure to drop the stripe from the namespace is only logged
//------------------------------------------------------------------------------
bool
FsckEntry::DropReplica(eos::common::FileSystem::fsid_t fsid) const
{
  if (fsid == 0ull) {
    return true;
  }

  eos_info("msg=\"drop (unregistered) replica\" fxid=%08llx fsid=%lu",
           mFid, fsid);
  bool unlink_sent = true;

  // Send external deletion to the FST
  if (gOFS && !gOFS->DeleteExternal(fsid, mFid, true)) {
    eos_err("msg=\"failed to send unlink to FST\" fxid=%08llx fsid=%lu",
            mFid, fsid);
    unlink_sent = false;
  }

  // Drop from the namespace, we don't need the path as root can drop by fid
  XrdOucErrInfo err;
  eos::common::VirtualIdentity vid = eos::common::VirtualIdentity::Root();

  if (gOFS && gOFS->_dropstripe("", mFid, err, vid, fsid, true)) {
    eos_err("msg=\"failed to drop replicas from ns\" fxid=%08llx fsid=%lu",
            mFid, fsid);
  }

  return unlink_sent;
}
//------------------------------------------------------------------------------
// Repair entry
//------------------------------------------------------------------------------
bool
FsckEntry::Repair()
{
using namespace eos::common;
bool success = false;
// If no MGM object then we are in testing mode
if (gOFS) {
gOFS->MgmStats.Add("FsckRepairStarted", 0, 0, 1);
if (CollectMgmInfo() == false) {
eos_err("msg=\"no repair action, file is orphan\" fxid=%08llx fsid=%lu "
"err=%s", mFid, *mFsidErr.begin(), FsckErrToString(mReportedErr).c_str());
success = true;
NotifyOutcome(success);
(void) DropReplica(*mFsidErr.begin());
// This could be a ghost fid entry still present in the file system map
// and we need to also drop it from there
std::string out, err;
auto root_vid = eos::common::VirtualIdentity::Root();
(void) proc_fs_dropghosts(*mFsidErr.begin(), {mFid}, root_vid, out, err);
return success;
}
if (mMgmFmd.cont_id() == 0ull) {
eos_info("msg=\"force remove detached file\" fxid=%08llx", mFid);
std::string err_msg;
if (!gOFS->RemoveDetached(mFid, false, true, err_msg)) {
eos_err("msg=\"operation failed due to: %s\"", err_msg.c_str());
}
NotifyOutcome(true);
return true;
}
CollectAllFstInfo();
CollectFstInfo(*mFsidErr.begin());
}
if (mReportedErr != FsckErr::None) {
auto it = mMapRepairOps.find(mReportedErr);
if (it == mMapRepairOps.end()) {
eos_err("msg=\"unknown type of error\" errr=%i", mReportedErr);
NotifyOutcome(success);
return success;
}
eos_static_info("msg=\"fsck repair\" fxid=%08llx err_type=%i fsid_err=%lu",
mFid, mReportedErr, *mFsidErr.begin());
auto fn_with_obj = std::bind(it->second, this);
success = fn_with_obj();
NotifyOutcome(success);
return success;
}
// If no explicit error given then try to repair all types of errors, we put
// the ones with higher priority first
std::list repair_ops {
&FsckEntry::RepairMgmXsSzDiff,
&FsckEntry::RepairFstXsSzDiff,
&FsckEntry::RepairInconsistencies};
for (const auto& op : repair_ops) {
auto fn_with_obj = std::bind(op, this);
if (!fn_with_obj()) {
NotifyOutcome(success);
return success;
}
}
success = true;
NotifyOutcome(success);
return success;
}
//------------------------------------------------------------------------------
// Get file metadata info stored at the FST
//
// Sends a "getfmd" opaque query for the current file id to the given FST and
// parses the reply into finfo->mFstFmd. On failure finfo->mFstErr is set to
// FstErr::NoContact if the query expired and to FstErr::NoFmdInfo in all the
// other cases (failed query, missing or "ERROR" reply, unparsable reply).
//
// @param finfo object to be updated with the FST metadata or the error
// @param fs xrootd file system object used to contact the FST
// @param fsid file system id holding the replica
//
// @return true if the metadata was retrieved and parsed, otherwise false
//------------------------------------------------------------------------------
bool
FsckEntry::GetFstFmd(std::unique_ptr& finfo,
                     XrdCl::FileSystem& fs,
                     eos::common::FileSystem::fsid_t fsid)
{
  XrdCl::Buffer* raw_response {nullptr};
  // Create query command for file metadata
  std::ostringstream oss;
  oss << "/?fst.pcmd=getfmd&fst.getfmd.fsid=" << fsid
      << "&fst.getfmd.fid=" << std::hex << mFid;
  XrdCl::Buffer arg;
  arg.FromString(oss.str().c_str());
  uint16_t timeout = 10;
  XrdCl::XRootDStatus status = fs.Query(XrdCl::QueryCode::OpaqueFile, arg,
                                        raw_response, timeout);
  // Take ownership of the reply so that it is released on every return path
  std::unique_ptr<XrdCl::Buffer> response(raw_response);

  if (!status.IsOK()) {
    if (status.code == XrdCl::errOperationExpired) {
      eos_err("msg=\"timeout file metadata query\" fxid=%08llx fsid=%lu",
              mFid, fsid);
      finfo->mFstErr = FstErr::NoContact;
    } else {
      eos_err("msg=\"failed file metadata query\" fxid=%08llx fsid=%lu",
              mFid, fsid);
      finfo->mFstErr = FstErr::NoFmdInfo;
    }

    return false;
  }

  // A reply without any data is handled like a missing reply, this also keeps
  // strncmp from being called with a null pointer
  if ((response == nullptr) || (response->GetBuffer() == nullptr) ||
      (strncmp(response->GetBuffer(), "ERROR", 5) == 0)) {
    eos_err("msg=\"no local fst metadata present\" fxid=%08llx fsid=%lu",
            mFid, fsid);
    finfo->mFstErr = FstErr::NoFmdInfo;
    return false;
  }

  // Parse in the file metadata info
  XrdOucEnv fmd_env(response->GetBuffer());

  if (!eos::common::EnvToFstFmd(fmd_env, finfo->mFstFmd)) {
    eos_err("msg=\"failed parsing fmd env\" fsid=%lu", fsid);
    finfo->mFstErr = FstErr::NoFmdInfo;
    return false;
  }

  return true;
}
//------------------------------------------------------------------------------
// Update MGM stats and backend depending on the final outcome
//
// Does nothing without an MGM OFS object. A failure only bumps the
// "FsckRepairFailed" counter. A success bumps "FsckRepairSuccessful" and
// notifies the fsck engine about the fixed error for the concerned file
// system ids.
//
// @param success true if the repair was successful, otherwise false
//------------------------------------------------------------------------------
void
FsckEntry::NotifyOutcome(bool success) const
{
  if (!gOFS) {
    return;
  }

  if (!success) {
    gOFS->MgmStats.Add("FsckRepairFailed", 0, 0, 1);
    return;
  }

  // Update the MGM statistics and QDB backend in case of success
  gOFS->MgmStats.Add("FsckRepairSuccessful", 0, 0, 1);
  const std::string sfsck_err = eos::common::FsckErrToString(mReportedErr);

  if (mReportedErr != eos::common::FsckErr::StripeErr) {
    // If error is not stripe error, only the first fsid has been fixed
    gOFS->mFsckEngine->NotifyFixedErr(mFid, *mFsidErr.begin(), sfsck_err);
  } else {
    for (auto fsid : mFsidErr) {
      gOFS->mFsckEngine->NotifyFixedErr(mFid, fsid, sfsck_err);
    }
  }

  // Such errors are reported by all the attached locations so when they
  // are fixed we need to update the fsck info for all of them
  if (mReportedErr == eos::common::FsckErr::DiffRepl) {
    for (const auto& loc : mMgmFmd.locations()) {
      gOFS->mFsckEngine->NotifyFixedErr(mFid, loc, sfsck_err);
    }
  }
}
EOSMGMNAMESPACE_END