#!/bin/bash -ex
# ******************************************************************************
# EOS - the CERN Disk Storage System
# Copyright (C) 2019 CERN/Switzerland
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ******************************************************************************
# Resolve the directory holding this script (pwd -P prints the physical path)
# and load the helper file shipped next to it.
# NOTE(review): exec_cmd, secs_to_human and cp_file_with_xattr_cmd are not
# defined in this file - presumably they come from eos-test-utils; confirm.
SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
# Quoted so that a checkout path containing spaces or glob characters works
source "${SCRIPTPATH}/eos-test-utils"
# Print the command line synopsis of this script to stdout
function usage() {
  echo "usage: $(basename "$0") [--max-delay <seconds>] --type docker/local/k8s [<k8s_namespace>]"
  echo " --max-delay : optional max delay in seconds"
  echo " docker : script runs in a Docker based setup"
  echo " k8s : script runs in a Kubernetes setup and requires a namespace argument"
  echo " local : script runs locally, needs EOS_MGM_URL to be set"
}
# Create and upload test files to the eos instance. We create a random file and
# upload it multiple times to EOS, one file per type of corruption.
#
# Globals written:
#   EOS_ROOT, EOS_RAIN_DIR, EOS_REPLICA_DIR - EOS directories used by the test
#   FXID_*  - fxid value reported by "eos fileinfo" for each uploaded file
# If any fxid comes back empty the function prints an error, calls cleanup and
# exits with status 1.
function create_test_files() {
  EOS_ROOT=/eos/dockertest/
  EOS_RAIN_DIR=${EOS_ROOT}/fsck/rain
  EOS_REPLICA_DIR=${EOS_ROOT}/fsck/replica
  # One "<global variable>=<EOS path>" entry per type of fsck error. The order
  # is both the upload order and the order in which the fxids are collected.
  local -a fxid_map=(
    "FXID_D_MEM_SZ_DIFF=${EOS_REPLICA_DIR}/file_d_mem_sz_diff.dat"
    "FXID_M_MEM_SZ_DIFF1=${EOS_REPLICA_DIR}/file_m_mem_sz_diff1.dat"
    "FXID_M_MEM_SZ_DIFF2=${EOS_REPLICA_DIR}/file_m_mem_sz_diff2.dat"
    "FXID_D_CX_DIFF=${EOS_REPLICA_DIR}/file_d_cx_diff.dat"
    "FXID_M_CX_DIFF=${EOS_REPLICA_DIR}/file_m_cx_diff.dat"
    "FXID_UNREG=${EOS_REPLICA_DIR}/file_unreg.dat"
    "FXID_REP_MISSING=${EOS_REPLICA_DIR}/file_rep_missing.dat"
    "FXID_REP_DIFF_UNDER=${EOS_REPLICA_DIR}/file_rep_diff_under.dat"
    "FXID_REP_DIFF_OVER=${EOS_REPLICA_DIR}/file_rep_diff_over.dat"
    "FXID_ORPHAN=${EOS_REPLICA_DIR}/file_orphan.dat"
    "FXID_RAIN_BLOCKXS=${EOS_RAIN_DIR}/rain_blockxs_err.dat"
    "FXID_RAIN_DIFF=${EOS_RAIN_DIR}/rain_stripe_diff.dat"
    "FXID_RAIN_STRIPE_SZ=${EOS_RAIN_DIR}/rain_stripe_sz_err.dat"
    "FXID_RAIN_INVALID_STRIPE=${EOS_RAIN_DIR}/rain_invalid_stripe_err.dat"
    "FXID_RAIN_INVALID_STRIPE2=${EOS_RAIN_DIR}/rain_invalid_stripe_err2.dat"
    "FXID_RAIN_DIFF_OVER=${EOS_RAIN_DIR}/rain_stripe_diff_over.dat"
    "FXID_RAIN_DIFF_OVER2=${EOS_RAIN_DIR}/rain_stripe_diff_over2.dat"
  )
  local entry
  # PATH and EOS_MGM_URL are escaped on purpose: they must be expanded by the
  # shell that executes the command, not by the shell running this script.
  local upload_cmd="export PATH=/opt/eos/xrootd/bin/:\${PATH} &&
dd if=/dev/urandom of=/tmp/test_file.dat bs=1M count=10 &&
eos -r 0 0 mkdir -p ${EOS_RAIN_DIR} &&
eos -r 0 0 mkdir -p ${EOS_REPLICA_DIR} &&
eos -r 0 0 attr set default=replica ${EOS_REPLICA_DIR} &&
eos -r 0 0 attr set default=raid6 ${EOS_RAIN_DIR}"
  for entry in "${fxid_map[@]}"; do
    upload_cmd+=" &&
xrdcp -f /tmp/test_file.dat \${EOS_MGM_URL}/${entry#*=}"
  done
  exec_cmd eos-cli1 "${upload_cmd}"
  # Collect the fxid of every uploaded file and trim whitespaces
  local var_name
  local eos_path
  local fxid
  for entry in "${fxid_map[@]}"; do
    var_name=${entry%%=*}
    eos_path=${entry#*=}
    fxid=$(exec_cmd eos-cli1 "eos fileinfo ${eos_path} -m | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '\$1==\"fxid\" {print \$2};' | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*\$//'")
    # If any of the fxids is empty then we have a problem
    if [[ -z "${fxid}" ]]; then
      echo "error: some of the fxids could not be retrieved"
      cleanup
      exit 1
    fi
    # No local of that name exists, so this assigns the global FXID_* variable
    printf -v "${var_name}" '%s' "${fxid}"
  done
  # Cleanup generated test file
  exec_cmd eos-cli1 "rm -rf /tmp/test_file.dat"
}
# Corrupt file to generate d_mem_sz_diff error: a few bytes are appended to
# the physical file of the last replica listed by "eos fileinfo".
function corrupt_d_mem_sz_diff() {
  local query="eos fileinfo fxid:${FXID_D_MEM_SZ_DIFF} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local replica_info=$(exec_cmd eos-cli1 "${query}")
  # First line: file system id (whitespace trimmed); last line: local path
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${replica_info}")
  local local_path=$(tail -n1 <<< "${replica_info}")
  exec_cmd "eos-fst${fs_id}" "echo \"dummy\" >> ${local_path}"
}
# Corrupt file to generate m_mem_sz_diff - where repair just needs to update the ns size
function corrupt_m_mem_sz_diff1() {
  # Use the eos-ns-inspect tool to corrupt the MGM file size. The QDB cluster
  # members and password file are read from the MGM configuration file; the
  # grep patterns are passed with escaped quotes so that they reach the
  # executing shell as quoted words.
  local QDB_CLUSTER=$(exec_cmd eos-mgm1 "grep \"^mgmofs.qdbcluster\" /etc/xrd.cf.mgm | awk '{print \$2;}'")
  local QDB_PWDFILE=$(exec_cmd eos-mgm1 "grep \"^mgmofs.qdbpassword_file\" /etc/xrd.cf.mgm | awk '{print \$2;}'")
  # The fxid is hexadecimal, the tools below are given the decimal file id
  local fid=$(( 16#${FXID_M_MEM_SZ_DIFF1} ))
  exec_cmd eos-cli1 "eos-ns-inspect change-fid --no-dry-run --members ${QDB_CLUSTER} --password-file ${QDB_PWDFILE} --fid ${fid} --new-size 1234568"
  # Best effort: a failure to drop the cached entry is deliberately ignored
  exec_cmd eos-cli1 "eos -r 0 0 ns cache drop-single-file ${fid} || true"
}
# Corrupt file to generate m_mem_sz_diff - where repair involves dropping the
# broken replica and triggering a replication of the correct one
function corrupt_m_mem_sz_diff2() {
  local query="eos fileinfo fxid:${FXID_M_MEM_SZ_DIFF2} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local replica_info=$(exec_cmd eos-cli1 "${query}")
  # First line: file system id (whitespace trimmed); last line: local path
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${replica_info}")
  local local_path=$(tail -n1 <<< "${replica_info}")
  # Append some data to the physical file, then trigger a file verify so that
  # the size and disksize values registered in the db get corrupted
  exec_cmd "eos-fst${fs_id}" "echo \"dummy content\" >> ${local_path}"
  exec_cmd "eos-mgm1" "eos file verify fxid:${FXID_M_MEM_SZ_DIFF2}"
}
# Corrupt file to generate d_cx_diff error
function corrupt_d_cx_diff() {
  local query="eos fileinfo fxid:${FXID_D_CX_DIFF} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local replica_info=$(exec_cmd eos-cli1 "${query}")
  # First line: file system id (whitespace trimmed); last line: local path
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${replica_info}")
  local local_path=$(tail -n1 <<< "${replica_info}")
  # Overwrite the physical file with random bytes so its checksum changes
  exec_cmd "eos-fst${fs_id}" "dd if=/dev/urandom of=${local_path} bs=1M count=10"
}
# Corrupt file to generate m_cx_diff
function corrupt_m_cx_diff() {
  # Use the eos-ns-inspect tool to corrupt the MGM checksum value. The QDB
  # cluster members and password file are read from the MGM configuration
  # file; the grep patterns are passed with escaped quotes so that they reach
  # the executing shell as quoted words.
  local QDB_CLUSTER=$(exec_cmd eos-mgm1 "grep \"^mgmofs.qdbcluster\" /etc/xrd.cf.mgm | awk '{print \$2;}'")
  local QDB_PWDFILE=$(exec_cmd eos-mgm1 "grep \"^mgmofs.qdbpassword_file\" /etc/xrd.cf.mgm | awk '{print \$2;}'")
  # The fxid is hexadecimal, eos-ns-inspect is given the decimal file id
  local fid=$(( 16#${FXID_M_CX_DIFF} ))
  exec_cmd eos-cli1 "eos-ns-inspect change-fid --no-dry-run --members ${QDB_CLUSTER} --password-file ${QDB_PWDFILE} --fid ${fid} --new-checksum 12345678"
}
# Corrupt file to generate rep_missing_n error: the physical file of the last
# replica listed by "eos fileinfo" is removed from its FST.
function corrupt_rep_missing_n() {
  local query="eos fileinfo fxid:${FXID_REP_MISSING} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local replica_info=$(exec_cmd eos-cli1 "${query}")
  # First line: file system id (whitespace trimmed); last line: local path
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${replica_info}")
  local local_path=$(tail -n1 <<< "${replica_info}")
  exec_cmd "eos-fst${fs_id}" "rm -rf ${local_path}"
}
# Corrupt file to generate rep_diff_under error: one replica location is
# dropped from the file.
function corrupt_rep_diff_under() {
  local CMD_OUT=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_REP_DIFF_UNDER} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2")
  # The output holds the fsid and the local path of the last replica; only the
  # fsid (first line, whitespace trimmed) is needed here
  local FSID=$(echo "${CMD_OUT}" | head -n1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
  exec_cmd eos-cli1 "eos -r 0 0 file drop fxid:${FXID_REP_DIFF_UNDER} ${FSID}"
}
# Corrupt file to generate rep_diff_over error: the file is replicated to a
# file system that does not hold a replica yet.
# Exits with status 1 if no such file system is found.
function corrupt_rep_diff_over() {
  local CMD_OUT=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_REP_DIFF_OVER} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\") {print \$2};}' | tail -n2")
  # Collect the fsids in use; word splitting is intended as it also trims the
  # whitespace around every value
  # shellcheck disable=SC2206
  local -a VECT_FSID=( ${CMD_OUT} )
  echo "Used locations: ${VECT_FSID[*]}"
  local NEW_FSID=""
  local FST_ONLINE=$(exec_cmd eos-mgm1 "eos fs ls | grep \"online\" | wc -l")
  # Loop variables are local so that they do not clobber the caller's scope
  local i
  local e
  local found
  # Pick the first fsid in 1..FST_ONLINE that is not used by the file.
  # NOTE(review): this assumes the online file systems are numbered 1..N.
  for (( i=1; i<=FST_ONLINE; i++ )); do
    found=false
    for e in "${VECT_FSID[@]}"; do
      if [[ "$i" == "$e" ]]; then
        found=true
        break
      fi
    done
    if [[ "${found}" == false ]]; then
      NEW_FSID=$i
      break
    fi
  done
  if [[ -z "${NEW_FSID}" ]]; then
    echo "error: no new FSID found for replication command"
    exit 1
  fi
  exec_cmd eos-cli1 "eos -r 0 0 file replicate fxid:${FXID_REP_DIFF_OVER} ${VECT_FSID[0]} ${NEW_FSID}"
}
# Corrupt file to generate file_unreg error: the location of the last replica
# listed by "eos fileinfo" is dropped with the -f flag.
function corrupt_unreg() {
  local query="eos fileinfo fxid:${FXID_UNREG} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local replica_info=$(exec_cmd eos-cli1 "${query}")
  # First line holds the file system id, trimmed of surrounding whitespace
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${replica_info}")
  exec_cmd eos-cli1 "eos -r 0 0 file drop fxid:${FXID_UNREG} ${fs_id} -f"
}
# Corrupt file to generate RAIN block checksum error
function corrupt_rain_blocxs_err() {
  local query="eos fileinfo fxid:${FXID_RAIN_BLOCKXS} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local stripe_info=$(exec_cmd eos-cli1 "${query}")
  # First line: file system id (whitespace trimmed); last line: local path
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${stripe_info}")
  local local_path=$(tail -n1 <<< "${stripe_info}")
  # Overwrite 3 bytes at offset 4 in place (conv=notrunc keeps the file size)
  exec_cmd "eos-fst${fs_id}" "dd if=/dev/urandom of=${local_path} bs=1 seek=4 count=3 conv=notrunc"
}
# Corrupt file to generate rain stripe diff error: the location of the last
# stripe listed by "eos fileinfo" is dropped with the -f flag.
function corrupt_rain_stripe_diff() {
  local query="eos fileinfo fxid:${FXID_RAIN_DIFF} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local stripe_info=$(exec_cmd eos-cli1 "${query}")
  # First line holds the file system id, trimmed of surrounding whitespace
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${stripe_info}")
  exec_cmd eos-cli1 "eos -r 0 0 file drop fxid:${FXID_RAIN_DIFF} ${fs_id} -f"
}
# Corrupt file to generate orphan_n error
function corrupt_orphan() {
  local CMD_OUT=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_ORPHAN} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2")
  # The output holds the fsid and the local path of the last replica; only the
  # fsid (first line, whitespace trimmed) is needed here
  local FSID=$(echo "${CMD_OUT}" | head -n1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
  # Do it twice to force delete the filemd object and set the file system off
  # so that the deletion message is not sent before we drop the list of
  # deletions.
  exec_cmd eos-mgm1 "eos fs config ${FSID} configstatus=off"
  exec_cmd eos-mgm1 "eos rm -F fxid:${FXID_ORPHAN}"
  exec_cmd eos-mgm1 "eos rm -F fxid:${FXID_ORPHAN}"
  exec_cmd eos-mgm1 "eos fs dropdeletion ${FSID}"
  exec_cmd eos-mgm1 "eos fs config ${FSID} configstatus=rw"
}
# Corrupt file to generate RAIN stripe size error
function corrupt_rain_stripe_size() {
  local query="eos fileinfo fxid:${FXID_RAIN_STRIPE_SZ} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local stripe_info=$(exec_cmd eos-cli1 "${query}")
  # First line: file system id (whitespace trimmed); last line: local path
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${stripe_info}")
  local local_path=$(tail -n1 <<< "${stripe_info}")
  # Truncate the stripe to 0 bytes to corrupt its size
  exec_cmd "eos-fst${fs_id}" "truncate --size 0 ${local_path}"
}
# Corrupt stripe to generate RAIN stripe error
function corrupt_rain_invalid_stripe() {
  local query="eos fileinfo fxid:${FXID_RAIN_INVALID_STRIPE} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2"
  local stripe_info=$(exec_cmd eos-cli1 "${query}")
  # First line: file system id (whitespace trimmed); last line: local path
  local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${stripe_info}")
  local local_path=$(tail -n1 <<< "${stripe_info}")
  # Overwrite 10 bytes at offset 5000 in place, then fix the checksum
  exec_cmd "eos-fst${fs_id}" "dd if=/dev/urandom of=${local_path} bs=1 seek=5000 count=10 conv=notrunc"
  exec_cmd "eos-fst${fs_id}" "eos-compute-blockxs ${local_path}"
}
# Corrupt 2 stripes to generate RAIN stripe error
function corrupt_rain_invalid_stripe2() {
  local query="eos fileinfo fxid:${FXID_RAIN_INVALID_STRIPE2} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}'"
  local all_stripes=$(exec_cmd eos-cli1 "${query}")
  # Two (fsid, local path) line pairs are selected from the listing: the last
  # two of its first four lines, and its last two lines
  local -a targets=(
    "$(head -n 4 <<< "${all_stripes}" | tail -n 2)"
    "$(tail -n 2 <<< "${all_stripes}")"
  )
  local stripe
  for stripe in "${targets[@]}"; do
    # First line: file system id (whitespace trimmed); last line: local path
    local fs_id=$(sed -n -e '1{s/^[[:space:]]*//;s/[[:space:]]*$//;p;}' <<< "${stripe}")
    local local_path=$(tail -n1 <<< "${stripe}")
    # Corrupt data in the stripe then fix the checksum
    exec_cmd "eos-fst${fs_id}" "dd if=/dev/urandom of=${local_path} bs=1 seek=5000 count=10 conv=notrunc"
    exec_cmd "eos-fst${fs_id}" "eos-compute-blockxs ${local_path}"
  done
}
# Duplicate stripe to generate RAIN stripe error: the file is tagged on a file
# system that holds no stripe yet and the physical files of the first stripe
# are copied there.
# Exits with status 1 if no unused file system is found.
function corrupt_rain_stripe_diff_over() {
  local CMD_OUT=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_RAIN_DIFF_OVER} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\") {print \$2};}'")
  # Collect the fsids in use; word splitting is intended as it also trims the
  # whitespace around every value
  # shellcheck disable=SC2206
  local -a VECT_FSID=( ${CMD_OUT} )
  echo "Used locations: ${VECT_FSID[*]}"
  local NEW_FSID=""
  local FST_ONLINE=$(exec_cmd eos-mgm1 "eos fs ls | grep \"online\" | wc -l")
  # Loop variables are local so that they do not clobber the caller's scope
  local i
  local e
  local found
  # Pick the first fsid in 1..FST_ONLINE that is not used by the file.
  # NOTE(review): this assumes the online file systems are numbered 1..N.
  for (( i=1; i<=FST_ONLINE; i++ )); do
    found=false
    for e in "${VECT_FSID[@]}"; do
      if [[ "$i" == "$e" ]]; then
        found=true
        break
      fi
    done
    if [[ "${found}" == false ]]; then
      NEW_FSID=$i
      break
    fi
  done
  if [[ -z "${NEW_FSID}" ]]; then
    echo "error: no new FSID found for replication command"
    exit 1
  fi
  # Start by tagging the file on the new fst
  exec_cmd eos-cli1 "eos -r 0 0 file tag ${EOS_RAIN_DIR}/rain_stripe_diff_over.dat +${NEW_FSID}"
  # Get local path of source and local path of the destination directory.
  # The fsid lines are matched as a whole ("fsid=1" must not select "fsid=10");
  # the line following the match is expected to be the fullpath entry.
  local FILE_INFO=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_RAIN_DIFF_OVER} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g'")
  local SRC_LPATH=$(echo "${FILE_INFO}" | grep -E -A1 "^fsid=${VECT_FSID[0]}[[:space:]]*\$" | tail -n1 | sed -r -e 's/^fullpath=//' -e 's/[[:space:]]*$//')
  # Split the source path into its directory part and trailing file name
  local SRC_LDIR=$(echo "${SRC_LPATH}" | sed -r 's/[[:alnum:]]+$//')
  local SRC_LFILE=$(echo "${SRC_LPATH}" | grep -oE '[[:alnum:]]+$')
  local DST_LDIR=$(echo "${FILE_INFO}" | grep -E -A1 "^fsid=${NEW_FSID}[[:space:]]*\$" | tail -n1 | sed -r -e 's/^fullpath=//' -e 's/[[:alnum:]]+[[:space:]]*$//')
  cp_file_with_xattr_cmd "eos-fst${VECT_FSID[0]}" "${SRC_LDIR}" "${SRC_LFILE}" "eos-fst${NEW_FSID}" "${DST_LDIR}"
  cp_file_with_xattr_cmd "eos-fst${VECT_FSID[0]}" "${SRC_LDIR}" "${SRC_LFILE}.xsmap" "eos-fst${NEW_FSID}" "${DST_LDIR}"
  # Force resync of fmd on new fst
  exec_cmd eos-cli1 "eos -r 0 0 file verify fxid:${FXID_RAIN_DIFF_OVER} ${NEW_FSID} -resync"
}
# Duplicate stripe and corrupt another stripe to generate RAIN stripe error.
# Exits with status 1 if no unused file system is found for the duplicate.
function corrupt_rain_stripe_diff_over2() {
  # First corrupt one stripe: the last one listed by "eos fileinfo"
  local LAST_STRIPE=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_RAIN_DIFF_OVER2} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}' | tail -n2")
  # First line: file system id (whitespace trimmed); last line: local path
  local FSID=$(echo "${LAST_STRIPE}" | head -n1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
  local LPATH=$(echo "${LAST_STRIPE}" | tail -n1)
  # Overwrite 10 bytes at offset 5000 in place, then fix the checksum
  exec_cmd "eos-fst${FSID}" "dd if=/dev/urandom of=${LPATH} bs=1 seek=5000 count=10 conv=notrunc"
  exec_cmd "eos-fst${FSID}" "eos-compute-blockxs ${LPATH}"
  # Then duplicate another one
  local CMD_OUT=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_RAIN_DIFF_OVER2} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g' | awk -F '=' '{if (\$1 ==\"fsid\") {print \$2};}'")
  # Collect the fsids in use; word splitting is intended as it also trims the
  # whitespace around every value
  # shellcheck disable=SC2206
  local -a VECT_FSID=( ${CMD_OUT} )
  echo "Used locations: ${VECT_FSID[*]}"
  local NEW_FSID=""
  local FST_ONLINE=$(exec_cmd eos-mgm1 "eos fs ls | grep \"online\" | wc -l")
  # Loop variables are local so that they do not clobber the caller's scope
  local i
  local e
  local found
  # Pick the first fsid in 1..FST_ONLINE that is not used by the file.
  # NOTE(review): this assumes the online file systems are numbered 1..N.
  for (( i=1; i<=FST_ONLINE; i++ )); do
    found=false
    for e in "${VECT_FSID[@]}"; do
      if [[ "$i" == "$e" ]]; then
        found=true
        break
      fi
    done
    if [[ "${found}" == false ]]; then
      NEW_FSID=$i
      break
    fi
  done
  if [[ -z "${NEW_FSID}" ]]; then
    echo "error: no new FSID found for replication command"
    exit 1
  fi
  # Start by tagging the file on the new fst
  exec_cmd eos-cli1 "eos -r 0 0 file tag ${EOS_RAIN_DIR}/rain_stripe_diff_over2.dat +${NEW_FSID}"
  # Get local path of source and local path of the destination directory.
  # The fsid lines are matched as a whole ("fsid=1" must not select "fsid=10");
  # the line following the match is expected to be the fullpath entry.
  local FILE_INFO=$(exec_cmd eos-cli1 "eos fileinfo fxid:${FXID_RAIN_DIFF_OVER2} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g'")
  local SRC_LPATH=$(echo "${FILE_INFO}" | grep -E -A1 "^fsid=${VECT_FSID[0]}[[:space:]]*\$" | tail -n1 | sed -r -e 's/^fullpath=//' -e 's/[[:space:]]*$//')
  # Split the source path into its directory part and trailing file name
  local SRC_LDIR=$(echo "${SRC_LPATH}" | sed -r 's/[[:alnum:]]+$//')
  local SRC_LFILE=$(echo "${SRC_LPATH}" | grep -oE '[[:alnum:]]+$')
  local DST_LDIR=$(echo "${FILE_INFO}" | grep -E -A1 "^fsid=${NEW_FSID}[[:space:]]*\$" | tail -n1 | sed -r -e 's/^fullpath=//' -e 's/[[:alnum:]]+[[:space:]]*$//')
  cp_file_with_xattr_cmd "eos-fst${VECT_FSID[0]}" "${SRC_LDIR}" "${SRC_LFILE}" "eos-fst${NEW_FSID}" "${DST_LDIR}"
  cp_file_with_xattr_cmd "eos-fst${VECT_FSID[0]}" "${SRC_LDIR}" "${SRC_LFILE}.xsmap" "eos-fst${NEW_FSID}" "${DST_LDIR}"
  # Force resync of fmd on new fst
  exec_cmd eos-cli1 "eos -r 0 0 file verify fxid:${FXID_RAIN_DIFF_OVER2} ${NEW_FSID} -resync"
}
# Configure fsck to run more often and reduce the scan times
function configure_fsck() {
  # Per file system settings applied to every fsid in 1..<online count>
  local -a fs_settings=(
    "scan_disk_interval=20"
    "scan_ns_interval=20"
    "scaninterval=15"
    "scan_rain_interval=15"
    "fsck_refresh_interval=20"
  )
  local online_count=$(exec_cmd eos-mgm1 "eos fs ls | grep \"online\" | wc -l")
  local setting
  for (( i=1; i<=${online_count}; i++ )); do
    # Chain the settings into a single command joined by "&&" and a newline
    local config_cmd=""
    for setting in "${fs_settings[@]}"; do
      if [[ -n "${config_cmd}" ]]; then
        config_cmd+=" &&"$'\n'
      fi
      config_cmd+="eos -r 0 0 fs config ${i} ${setting}"
    done
    exec_cmd eos-cli1 "${config_cmd}"
  done
  # Reduce the interval when the fsck collection thread runs
  exec_cmd eos-cli1 "eos -r 0 0 fsck config toggle-collect 0.25;"
}
# Check that we collected all the errors that we expect.
# Arguments: $1 - optional max delay in seconds (default 300)
# Polls "eos fsck report -i" every 5 seconds until the value of every FXID_*
# variable shows up in the report. If the delay expires first, the detailed
# report is printed, cleanup is called and the script exits with status 1.
function check_all_errors_collected() {
  # Allow for at most MAX_DELAY seconds to collect all the errors
  local MAX_DELAY=${1:-"300"}
  local START_TIME=$(date +%s)
  # Names of the globals holding the fxid of every corrupted file
  local -a expected_vars=(
    FXID_D_MEM_SZ_DIFF
    FXID_M_MEM_SZ_DIFF1
    FXID_M_MEM_SZ_DIFF2
    FXID_D_CX_DIFF
    FXID_M_CX_DIFF
    FXID_UNREG
    FXID_REP_MISSING
    FXID_REP_DIFF_UNDER
    FXID_REP_DIFF_OVER
    FXID_RAIN_BLOCKXS
    FXID_RAIN_DIFF
    FXID_RAIN_STRIPE_SZ
    FXID_ORPHAN
    FXID_RAIN_INVALID_STRIPE
    FXID_RAIN_INVALID_STRIPE2
    FXID_RAIN_DIFF_OVER
    FXID_RAIN_DIFF_OVER2
  )
  local CURRENT_TIME
  local report
  local name
  local -a missing
  while true; do
    CURRENT_TIME=$(date +%s)
    # Fetch the report once per iteration and look up every fxid in that
    # snapshot; a failing query is treated like an empty report
    report=$(exec_cmd eos-cli1 "eos -r 0 0 fsck report -i") || true
    missing=()
    for name in "${expected_vars[@]}"; do
      # ${!name} is the value of the variable whose name is stored in "name"
      if [[ -z "${!name}" || "${report}" != *"${!name}"* ]]; then
        missing+=("${name}")
      fi
    done
    if (( ${#missing[@]} == 0 )); then
      echo "info: found all the errors we were expecting"
      return 0
    fi
    if (( CURRENT_TIME - START_TIME >= MAX_DELAY )); then
      echo "error: some errors not discovered"
      echo "error: no fsck entry found for: ${missing[*]}"
      # Diagnostic output only, its failure must not prevent the cleanup
      exec_cmd eos-cli1 "eos -r 0 0 fsck report -i -a" || true
      cleanup
      exit 1
    fi
    echo "info: sleep for 5 seconds waiting for error collection, $(secs_to_human $(( CURRENT_TIME - START_TIME ))) passed"
    sleep 5
  done
}
# Cleanup the files and directories at the MGM.
# Globals read: EOS_REPLICA_DIR, EOS_RAIN_DIR
# Returns 1 without running anything if one of the directories is not set.
function cleanup() {
  # With an empty directory variable the recursive removal below would be
  # issued for "/*", so refuse to run in that case
  if [[ -z "${EOS_REPLICA_DIR}" || -z "${EOS_RAIN_DIR}" ]]; then
    echo "error: test directories are not set, refusing to run cleanup" >&2
    return 1
  fi
  exec_cmd eos-cli1 "eos -r 0 0 rm -rF \"${EOS_REPLICA_DIR}/*\" &&
eos -r 0 0 rmdir ${EOS_REPLICA_DIR}/ &&
eos -r 0 0 rm -rF \"${EOS_RAIN_DIR}/*\" &&
eos -r 0 0 rmdir ${EOS_RAIN_DIR}/"
  # NOTE(review): presumably switches off the collection that configure_fsck
  # toggled with a 0.25 interval - confirm
  exec_cmd eos-cli1 "eos -r 0 0 fsck config toggle-collect"
}
# @todo the whole args parsing story should be rewritten, possibly decoupling docker/kubernetes execution from the test itself
# Expected command line: [--max-delay <value>] --type docker|local|k8s [namespace]
# Globals set: MAX_DELAY, IS_DOCKER, IS_LOCAL, K8S_NAMESPACE
MAX_DELAY=300
# Late addition of optional argument "--max-delay", it must be the first arg
if [[ "$1" == "--max-delay" ]]; then
  # The value is used in arithmetic comparisons later on, so reject a missing
  # or non-numeric value right away
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "error: --max-delay requires a non-negative integer value in seconds"
    usage
    exit 1
  fi
  # Force base 10 so that a value with leading zeros is not read as octal
  MAX_DELAY=$(( 10#$2 ))
  shift 2 # past argument and value
fi
# Set up global variables
IS_DOCKER=false
IS_LOCAL=false
K8S_NAMESPACE=""
if [[ $# -lt 2 ]]; then
  echo "error: invalid number of arguments"
  usage
  exit 1
fi
if [[ "$1" != "--type" ]]; then
  echo "error: unknown argument \"$1\""
  usage
  exit 1
fi
case "$2" in
  docker)
    IS_DOCKER=true
    ;;
  local)
    IS_LOCAL=true
    ;;
  k8s)
    # Kubernetes is the setup that is neither docker nor local
    ;;
  *)
    echo "error: unknown type of executor \"$2\""
    usage
    exit 1
    ;;
esac
if [[ "${IS_LOCAL}" == true && -z "${EOS_MGM_URL}" ]]; then
  echo "error: EOS_MGM_URL env needs to be set for local!"
  exit 1
fi
if [[ "${IS_DOCKER}" == false && "${IS_LOCAL}" == false ]]; then
  # For the Kubernetes setup we also need a namespace argument
  if [[ $# -lt 3 ]]; then
    echo "error: missing Kubernetes namespace argument"
    usage
    exit 1
  fi
  K8S_NAMESPACE="$3"
fi
echo "eos-fsck-test configuration:"
echo "MAX_DELAY=$MAX_DELAY"
echo "IS_DOCKER=$IS_DOCKER"
echo "IS_LOCAL=$IS_LOCAL"
echo "K8S_NAMESPACE=$K8S_NAMESPACE (if IS_DOCKER=false)"
echo
# Create test file
create_test_files
# Create different type of corruptions for different files
corrupt_d_mem_sz_diff
corrupt_m_mem_sz_diff1
corrupt_m_mem_sz_diff2
corrupt_d_cx_diff
corrupt_m_cx_diff
corrupt_rep_missing_n
corrupt_rep_diff_under
corrupt_rep_diff_over
corrupt_unreg
corrupt_rain_blocxs_err
corrupt_rain_stripe_diff
corrupt_rain_stripe_size
corrupt_orphan
corrupt_rain_invalid_stripe
corrupt_rain_invalid_stripe2
corrupt_rain_stripe_diff_over
corrupt_rain_stripe_diff_over2
# Configure fsck to run more often and reduce scan times
configure_fsck
# Check that we are collecting all the expected errors
check_all_errors_collected "${MAX_DELAY}"
# Enable the repair thread and allow a delay of at most MAX_DELAY seconds to
# correct all the discovered errors
exec_cmd eos-cli1 "eos -r 0 0 fsck config toggle-repair"
# Cleanup the orphan entries
exec_cmd eos-cli1 "eos -r 0 0 fsck clean_orphans"
# Wait for all the errors to be repaired: poll the report every 5 seconds
# until it comes back empty
START_TIME=$(date +%s)
while true; do
  CURRENT_TIME=$(date +%s)
  # A failing query is treated like an empty report and must not abort the run
  HAS_ERRORS=$(exec_cmd eos-cli1 "eos -r 0 0 fsck report -i") || true
  if [[ -z "${HAS_ERRORS}" ]]; then
    echo "info: all errors were repaired"
    break
  fi
  if (( CURRENT_TIME - START_TIME >= MAX_DELAY )); then
    echo "error: some errors were not repaired"
    # Diagnostic output only, its failure must not prevent the cleanup
    exec_cmd eos-cli1 "eos -r 0 0 fsck report -i -a" || true
    cleanup
    exit 1
  fi
  echo "info: sleep for 5 seconds waiting for error repair, $(secs_to_human $(( CURRENT_TIME - START_TIME ))) passed"
  sleep 5
done
cleanup