//------------------------------------------------------------------------------
// File: ScanDir.hh
// Author: Elvin Sindrilaru - CERN
//------------------------------------------------------------------------------
/************************************************************************
* EOS - the CERN Disk Storage System *
* Copyright (C) 2019 CERN/Switzerland *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>.*
************************************************************************/
#pragma once
#include "fst/Namespace.hh"
#include "common/Logging.hh"
#include "common/FileSystem.hh"
#include "common/FileId.hh"
#include "common/AssistedThread.hh"
#include "common/SteadyClock.hh"
#include "common/RateLimit.hh"
#include "common/LayoutId.hh"
#include "namespace/interface/IFileMD.hh"
#include "namespace/ns_quarkdb/persistency/MetadataFetcher.hh"
#include
EOSFSTNAMESPACE_BEGIN
// Forward declarations of types from the FST namespace. Load is only used
// through a pointer in this header; FileIo and CheckSum are presumably the
// element types of the smart pointers used further down - TODO confirm.
class Load;
class FileIo;
class CheckSum;
//! Default rain rescan interval: 4 weeks expressed in seconds
constexpr uint64_t DEFAULT_RAIN_RESCAN_INTERVAL {4 * 7 * 24 * 3600};
//! Default disk scan interval: 4 hours expressed in seconds
constexpr uint64_t DEFAULT_DISK_INTERVAL {4 * 3600};
//! Default namespace scan interval: 3 days expressed in seconds
constexpr uint64_t DEFAULT_NS_INTERVAL {3 * 24 * 3600};
//------------------------------------------------------------------------------
//! Class ScanDir
//! @brief Scans a directory tree and verifies checksums (and block checksums
//! if present) at a regular interval using limited bandwidth
//------------------------------------------------------------------------------
class ScanDir: public eos::common::LogId
{
public:
//----------------------------------------------------------------------------
//! Constructor
//!
//! NOTE(review): the parameter descriptions below are inferred from the
//! comments attached to the data members of this class; confirm them against
//! the constructor implementation.
//!
//! @param dirpath root directory used by the scanner
//! @param fsid corresponding file system id
//! @param fstload object providing load information
//! @param bgthread if true run as a background thread inside the FST
//! @param file_rescan_interval interval after which a file is rescanned,
//!        presumably in seconds
//! @param ratebandwidth max scan IO rate, presumably in MB/s
//! @param fake_clock presumably selects a fake clock for testing purposes
//----------------------------------------------------------------------------
ScanDir(const char* dirpath, eos::common::FileSystem::fsid_t fsid,
eos::fst::Load* fstload, bool bgthread = true,
long int file_rescan_interval = 60, int ratebandwidth = 50,
bool fake_clock = false);
//----------------------------------------------------------------------------
//! Destructor
//----------------------------------------------------------------------------
virtual ~ScanDir();
//----------------------------------------------------------------------------
//! Update scanner configuration
//!
//! @param key configuration type (first, unnamed parameter)
//! @param value configuration value
//----------------------------------------------------------------------------
void SetConfig(const std::string&, long long value);
//----------------------------------------------------------------------------
//! Infinite loop doing the scanning and verification of disk entries
//!
//! @param assistant thread running the job
//----------------------------------------------------------------------------
void RunDiskScan(ThreadAssistant& assistant) noexcept;
#ifndef _NOOFS
//----------------------------------------------------------------------------
//! Infinite loop doing the scanning of namespace entries. Only declared when
//! _NOOFS is not defined.
//!
//! @param assistant thread running the job
//----------------------------------------------------------------------------
void RunNsScan(ThreadAssistant& assistant) noexcept;
#endif
//----------------------------------------------------------------------------
//! Method traversing all the files in the subtree and potentially rescanning
//! some of them
//!
//! @param assistant thread running the job
//----------------------------------------------------------------------------
void ScanSubtree(ThreadAssistant& assistant) noexcept;
//----------------------------------------------------------------------------
//! Decide if a rescan is needed based on the timestamp provided and the
//! configured rescan interval
//!
//! @param timestamp_sec timestamp in seconds, given as a string
//! @param rain_ts if true it refers to a rain scan timestamp, otherwise to a
//! regular scan timestamp
//!
//! @return true if file is to be rescanned, otherwise false
//----------------------------------------------------------------------------
bool DoRescan(const std::string& timestamp_sec,
bool rain_ts = false) const;
//----------------------------------------------------------------------------
//! Check the given file for errors and properly account for them both at the
//! scanner level and also by setting the proper xattrs on the file.
//!
//! @param fpath file path
//!
//! @return true if the file was checked, otherwise false
//----------------------------------------------------------------------------
bool CheckFile(const std::string& fpath);
//----------------------------------------------------------------------------
//! Get block checksum object for the given file. First we need to check if
//! there is a block checksum file (.xsmap) corresponding to the given raw
//! file.
//!
//! @param file_path full path to raw file
//!
//! @return block checksum object
//----------------------------------------------------------------------------
std::unique_ptr
GetBlockXS(const std::string& file_path);
//----------------------------------------------------------------------------
//! Check the given file for errors and properly account for them both at the
//! scanner level and also by setting the proper xattrs on the file.
//!
//! @param io io object attached to the file
//! @param fpath file path
//! @param fid file id
//! @param scan_ts_sec time the file was last checked
//! @param mtime time the file contents were last modified
//!
//! @return true if the file was checked, otherwise false
//----------------------------------------------------------------------------
bool ScanFile(const std::unique_ptr& io,
const std::string& fpath, eos::common::FileId::fileid_t fid,
const std::string& scan_ts_sec, time_t mtime);
//----------------------------------------------------------------------------
//! Scan the given file for checksum errors taking the load into consideration
//!
//! @param io io object attached to the file
//! @param scan_size final scan size
//! @param scan_xs_hex scanned file checksum in hex
//! @param filexs_err set to true if file has a checksum error
//! @param blockxs_err set to true if file has a block checksum error
//!
//! @return true if file is correct, otherwise false if file does not exist,
//! or there is any type of checksum error
//----------------------------------------------------------------------------
bool ScanFileLoadAware(const std::unique_ptr& io,
unsigned long long& scan_size,
std::string& scan_xs_hex,
bool& filexs_err, bool& blockxs_err);
#ifndef _NOOFS
//----------------------------------------------------------------------------
//! Check for stripes that are unable to reconstruct the original file
//!
//! @param stripes list of replica index and stripe urls
//! @param xs_val expected checksum
//! @param xs_obj checksum object used to calculate the checksum
//! @param layout layout id
//! @param opaqueInfo opaque information
//!
//! @return true if file has expected checksum, false otherwise
//----------------------------------------------------------------------------
bool IsValidStripeCombination(
const std::vector>& stripes,
const std::string& xs_val, std::unique_ptr& xs_obj,
eos::common::LayoutId::layoutid_t layout, const std::string& opaqueInfo);
//----------------------------------------------------------------------------
//! Check the given file for rain stripes errors
//!
//! @param io io object attached to the file
//! @param fpath file path
//! @param fid file id
//! @param scan_ts_sec time the file was last checked
//!
//! @return true if the file was checked, otherwise false
//----------------------------------------------------------------------------
bool ScanRainFile(const std::unique_ptr& io,
const std::string& fpath,
eos::common::FileId::fileid_t fid,
const std::string& scan_ts_sec);
//----------------------------------------------------------------------------
//! Check each stripe to verify if they can reconstruct the original file
//!
//! @param fid file id
//! @param invalid_fsid fsids of invalid stripes (output parameter)
//!
//! @return true if check happened, false if error occurred
//----------------------------------------------------------------------------
bool
ScanRainFileLoadAware(eos::common::FileId::fileid_t fid,
std::set& invalid_fsid);
#endif
//----------------------------------------------------------------------------
//! Get clock reference for testing purposes
//!
//! @return mutable reference to the internal clock wrapper
//----------------------------------------------------------------------------
inline eos::common::SteadyClock& GetClock()
{
return mClock;
}
//----------------------------------------------------------------------------
//! Get timestamp in seconds smeared +/-20% of
//! mEntryIntervalSec/mRainEntryIntervalSec around the current timestamp value
//!
//! @param rain_ts if true it refers to a rain scan timestamp, otherwise to a
//! regular scan timestamp
//!
//! @return string representing timestamp in seconds since epoch
//----------------------------------------------------------------------------
std::string GetTimestampSmearedSec(bool rain_ts = false) const;
private:
// When IN_TEST_HARNESS is defined everything that follows becomes public so
// that it is accessible from the outside
#ifdef IN_TEST_HARNESS
public:
#endif
//----------------------------------------------------------------------------
//! Enforce the scan rate by throttling the current thread and also adjust it
//! depending on the IO load on the mountpoint
//!
//! @param offset current offset in file
//! @param open_ts time point when file was opened
//! @param scan_rate current scan rate, if 0 then rate limiting is disabled;
//! passed by reference so that it can be adjusted
//----------------------------------------------------------------------------
void EnforceAndAdjustScanRate(const off_t offset,
std::chrono::time_point
open_ts,
int& scan_rate);
#ifndef _NOOFS
//----------------------------------------------------------------------------
//! Collect all file ids present on the current file system from the NS view
//!
//! @param type can be either eos::fsview::sFilesSuffix or
//! eos::fsview::sUnlinkedSuffix
//!
//! @return queue holding the file ids
//----------------------------------------------------------------------------
std::deque CollectNsFids(const std::string& type) const;
//----------------------------------------------------------------------------
//! Account for missing replicas
//----------------------------------------------------------------------------
void AccountMissing();
//----------------------------------------------------------------------------
//! Cleanup unlinked replicas which are older than 1 hour
//----------------------------------------------------------------------------
void CleanupUnlinked();
#endif
//! Default ns scan rate is bound by the number of IO ops a disk can handle
//! and we set it to half the average max IOOPS for HDD which is 100.
static constexpr unsigned long long sDefaultNsScanRate {50};
//----------------------------------------------------------------------------
//! Check if file is unlinked from the namespace and in the process of being
//! deleted from the disk. Files that are unlinked for more than 30 min
//! definitely have a problem and we don't account them as in the process of
//! being deleted.
//!
//! @param fid file identifier
//!
//! @return true if file is being deleted, otherwise false
//----------------------------------------------------------------------------
bool IsBeingDeleted(const eos::IFileMD::id_t fid) const;
//----------------------------------------------------------------------------
//! Drop ghost fid from the given file system id
//!
//! @param fsid file system id
//! @param fid file identifier
//!
//! @return true if successful, otherwise false
//----------------------------------------------------------------------------
bool DropGhostFid(const eos::common::FileSystem::fsid_t fsid,
const eos::IFileMD::id_t fid) const;
//----------------------------------------------------------------------------
//! Print log message - depending on whether or not we run in standalone mode
//! or inside the FST daemon
//!
//! @param log_level log level used for the printout
//! @param args printf-style format string followed by its arguments
//----------------------------------------------------------------------------
template
void LogMsg(int log_level, Args&& ... args)
{
if (mBgThread) {
// Running as a background thread: hand the message to the logging macro
eos_static_log(log_level, std::forward(args) ...);
} else {
// Not a background thread: informational and debug output goes to stdout
// as is, everything else goes to stderr
if ((log_level == LOG_INFO) || (log_level == LOG_DEBUG)) {
fprintf(stdout, std::forward(args) ...);
} else {
fprintf(stderr, std::forward(args) ...);
// A trailing newline is appended only on the stderr path
fprintf(stderr, "%s", "\n");
}
}
}
eos::fst::Load* mFstLoad; ///< Object for providing load information
eos::common::FileSystem::fsid_t mFsId; ///< Corresponding file system id
std::string mDirPath; ///< Root directory used by the scanner
std::atomic mRateBandwidth; ///< Max scan IO rate in MB/s
//! Time interval after which a file is rescanned in seconds, if 0 then
//! rescanning is completely disabled
std::atomic mEntryIntervalSec;
//! Time interval after which a rain file is rescanned in seconds, if 0 then
//! rescanning is completely disabled
std::atomic mRainEntryIntervalSec;
//! Time interval after which the disk scanner will run again, default 4h
std::atomic mDiskIntervalSec;
//! Time interval after which the ns scanner will run again, default 3 days
std::atomic mNsIntervalSec;
//! Configuration variable to track changes in disk scan intervals
uint64_t mConfDiskIntervalSec;
// Statistics - plain (non-atomic) counters
long int mNumScannedFiles;
long int mNumCorruptedFiles;
long int mNumHWCorruptedFiles;
long long int mTotalScanSize;
long int mNumTotalFiles;
long int mNumSkippedFiles;
char* mBuffer; ///< Buffer used for reading
uint32_t mBufferSize; ///< Size of the reading buffer
bool mBgThread; ///< If true running as background thread inside the FST
AssistedThread mDiskThread; ///< Thread doing the scanning of the disk
AssistedThread mNsThread; ///< Thread doing the scanning of NS entries
eos::common::SteadyClock mClock; ///< Clock wrapper used for testing
//! Rate limiter for ns scanning which actually limits the number of stat
//! requests sent across the disks in one FST.
std::unique_ptr mRateLimit;
};
EOSFSTNAMESPACE_END