// ---------------------------------------------------------------------- // File: Scrub.cc // Author: Andreas-Joachim Peters - CERN // ---------------------------------------------------------------------- /************************************************************************ * EOS - the CERN Disk Storage System * * Copyright (C) 2011 CERN/Switzerland * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see .* ************************************************************************/ #include "fst/storage/Storage.hh" #include "fst/storage/FileSystem.hh" #include #ifdef __APPLE__ #define O_DIRECT 0 #endif EOSFSTNAMESPACE_BEGIN /*----------------------------------------------------------------------------*/ void Storage::Scrub() { // create a 1M pattern eos_info("%s", "msg=\"create scrubbing pattern ...\""); for (int i = 0; i < 1024 * 1024 / 8; i += 2) { mScrubPattern[0][i] = 0xaaaa5555aaaa5555ULL; mScrubPattern[0][i + 1] = 0x5555aaaa5555aaaaULL; mScrubPattern[1][i] = 0x5555aaaa5555aaaaULL; mScrubPattern[1][i + 1] = 0xaaaa5555aaaa5555ULL; } eos_info("%s", "msg=\"start scrubbing\""); // this thread reads the oldest files and checks their integrity while (true) { time_t start = time(0); std::set fsids; // Collect all file system ids registered { eos::common::RWMutexReadLock fs_rd_lock(mFsMutex); for (const auto& elem : mFsMap) { fsids.insert(elem.first); } } eos_debug("msg=\"running on %lu file systems\"", fsids.size()); std::string path {""}; uint64_t free {0ull}; uint64_t blocks {0ull}; bool direct_io = false; eos::common::BootStatus boot_st; eos::common::ConfigStatus config_st; for (auto fsid : fsids) { { eos::common::RWMutexReadLock fs_rd_lock(mFsMutex); auto it = mFsMap.find(fsid); if (it == mFsMap.end()) { eos_warning("msg=\"skip removed file system\" fsid=%lu", fsid); continue; } auto fs = it->second; path = fs->GetPath(); if (fs->GetStatfs() == nullptr) { eos_notice("msg=\"statfs failed on file system\" fsid=%lu path=\"%s\"", fsid, path.c_str()); continue; } free = fs->GetStatfs()->GetStatfs()->f_bfree; blocks = fs->GetStatfs()->GetStatfs()->f_blocks; // Disable direct IO for ZFS direct_io = (fs->GetStatfs()->GetStatfs()->f_type != 0x2fc12fc1); boot_st = fs->GetStatus(); config_st = fs->GetConfigStatus(); } // Skip scrubbing file systems for which either of the following // conditions hold: // - not a local file system (i.e. remote) // - not in writable mode // - not booted if (path.empty() || (path[0] != '/') || (config_st < eos::common::ConfigStatus::kWO) || (boot_st != eos::common::BootStatus::kBooted)) { continue; } struct stat buf; std::string no_scrub = path + "/" + ".eosnoscrub"; if (!::stat(no_scrub.c_str(), &buf)) { eos_debug("msg=\"scrub is disabled, remove %s to activate\"", no_scrub.c_str()); continue; } if (ScrubFs(path.c_str(), free, blocks, fsid, direct_io)) { // Filesystem has errors eos::common::RWMutexReadLock fs_rd_lock(mFsMutex); auto it = mFsMap.find(fsid); if (it == mFsMap.end()) { eos_warning("msg=\"skip removed file system\" fsid=%lu", fsid); continue; } it->second->BroadcastError(EIO, "filesystem probe error detected"); } } time_t stop = time(0); int nsleep = ((300) - (stop - start)); if (nsleep > 0) { eos_debug("msg=\"scrubber will pause for %u seconds\"", nsleep); std::this_thread::sleep_for(std::chrono::seconds(nsleep)); } } } //------------------------------------------------------------------------------ // Scrub filesystem //------------------------------------------------------------------------------ int Storage::ScrubFs(const char* path, unsigned long long free, unsigned long long blocks, unsigned long id, bool direct_io) { int MB = 1; // the test files have 1 MB int index = 10 - (int)(10.0 * free / blocks); eos_static_debug("Running Scrubber on filesystem path=%s id=%u free=%llu blocks=%llu index=%d", path, id, free, blocks, index); int fserrors = 0; for (int fs = 1; fs <= index; fs++) { // check if test file exists, if not, write it XrdOucString scrubfile[2]; scrubfile[0] = path; scrubfile[1] = path; scrubfile[0] += "/scrub.write-once."; scrubfile[0] += fs; scrubfile[1] += "/scrub.re-write."; scrubfile[1] += fs; struct stat buf; int dflags = 0; if (direct_io) { dflags = O_DIRECT; } for (int k = 0; k < 2; k++) { eos_static_debug("Scrubbing file %s", scrubfile[k].c_str()); if (((k == 0) && stat(scrubfile[k].c_str(), &buf)) || ((k == 0) && (buf.st_size != (MB * 1024 * 1024))) || ((k == 1))) { // ok, create this file once int ff = 0; if (k == 0) { ff = open(scrubfile[k].c_str(), O_CREAT | O_TRUNC | O_WRONLY | dflags, S_IRWXU); } else { ff = open(scrubfile[k].c_str(), O_CREAT | O_WRONLY | dflags, S_IRWXU); } if (ff < 0) { if (errno == EMFILE) { eos_static_warning("Unable to create/wopen scrubfile %s errno=%d", scrubfile[k].c_str(), errno); // this is not fatal, since it might be a temporary problem return 0; } eos_static_crit("Unable to create/wopen scrubfile %s errno=%d", scrubfile[k].c_str(), errno); fserrors = 1; break; } // select the pattern randomly int rshift = (int)((1.0 * rand() / RAND_MAX) + 0.5); eos_static_debug("rshift is %d", rshift); for (int i = 0; i < MB; i++) { int nwrite = write(ff, mScrubPattern[rshift], 1024 * 1024); if (nwrite != (1024 * 1024)) { eos_static_crit("Unable to write all needed bytes for scrubfile %s errno=%d", scrubfile[k].c_str(), errno); fserrors = 1; break; } if (k != 0) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); } } close(ff); } // do a read verify int ff = open(scrubfile[k].c_str(), dflags | O_RDONLY); if (ff < 0) { if (errno == EMFILE) { eos_static_warning("Unable to create/wopen scrubfile %s errno=%d", scrubfile[k].c_str(), errno); // this is not fatal, since it might be a temporary problem return 0; } else { eos_static_crit("Unable to open static scrubfile %s, errno=%d", scrubfile[k].c_str(), errno); return 1; } } int eberrors = 0; for (int i = 0; i < MB; i++) { int nread = read(ff, mScrubPatternVerify, 1024 * 1024); if (nread != (1024 * 1024)) { eos_static_crit("Unable to read all needed bytes from scrubfile %s errno=%d", scrubfile[k].c_str(), errno); fserrors = 1; break; } unsigned long long* ref = (unsigned long long*) mScrubPattern[0]; unsigned long long* cmp = (unsigned long long*) mScrubPatternVerify; // do a quick check for (int b = 0; b < MB * 1024 / 8; b++) { if ((*ref != *cmp)) { ref = (unsigned long long*) mScrubPattern[1]; if (*(ref) == *cmp) { // ok - pattern shifted } else { // this is real fatal error eberrors++; } } } std::this_thread::sleep_for(std::chrono::milliseconds(100)); } if (eberrors) { eos_static_alert("%d block errors on filesystem %lu scrubfile %s", eberrors, id, scrubfile[k].c_str()); fserrors++; } close(ff); } } if (fserrors) { return 1; } return 0; } EOSFSTNAMESPACE_END