#!/bin/bash
# ******************************************************************************
# EOS - the CERN Disk Storage System
# Copyright (C) 2019 CERN/Switzerland
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ******************************************************************************

# Abort on the first failing command and trace every command. The options are
# set here rather than on the shebang line so that they are also honoured when
# the script is started as "bash <script>".
set -ex

# Absolute, symlink-free directory holding this script; the helper library
# (exec_cmd, cp_file_with_xattr_cmd, secs_to_human, ...) is loaded from there.
SCRIPTPATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source "${SCRIPTPATH}/eos-test-utils"

# Print the command line synopsis of this script to stdout
function usage() {
  echo "usage: $(basename "$0") [--max-delay <sec_delay>] --type docker/local/k8s [<k8s_namespace>]"
  echo " --max-delay : optional max delay in seconds"
  echo " docker : script runs in a Docker based setup"
  echo " k8s : script runs in a Kubernetes setup and requires a namespace argument"
  echo " local : script runs locally, needs EOS_MGM_URL to be set"
}

# Create and upload test files to the eos instance. We create a random file and
# upload it multiple times to EOS one file per type of corruption.
function create_test_files() {
  # Globals written: EOS_ROOT, EOS_RAIN_DIR, EOS_REPLICA_DIR and every FXID_*
  # variable (hex file id of the file dedicated to one type of corruption).
  # Exits with status 1 (after calling cleanup) if any fxid is missing.
  #
  # NOTE: the trailing slash of EOS_ROOT yields a double slash in the derived
  # paths; this is kept unchanged on purpose.
  EOS_ROOT=/eos/dockertest/
  EOS_RAIN_DIR=${EOS_ROOT}/fsck/rain
  EOS_REPLICA_DIR=${EOS_ROOT}/fsck/replica

  local replica_files=(
    file_d_mem_sz_diff.dat
    file_m_mem_sz_diff1.dat
    file_m_mem_sz_diff2.dat
    file_d_cx_diff.dat
    file_m_cx_diff.dat
    file_unreg.dat
    file_rep_missing.dat
    file_rep_diff_under.dat
    file_rep_diff_over.dat
    file_orphan.dat
  )
  local rain_files=(
    rain_blockxs_err.dat
    rain_stripe_diff.dat
    rain_stripe_sz_err.dat
    rain_invalid_stripe_err.dat
    rain_invalid_stripe_err2.dat
    rain_stripe_diff_over.dat
    rain_stripe_diff_over2.dat
  )

  # Build one single remote command: generate a 10MB random file, create and
  # configure the two directories, then upload one copy per corruption type.
  # NOTE(review): ${PATH} is expanded by the local shell while EOS_MGM_URL is
  # expanded remotely - \${PATH} was possibly intended; behaviour kept as is.
  local upload_cmd="export PATH=/opt/eos/xrootd/bin/:${PATH}"
  upload_cmd+=" && dd if=/dev/urandom of=/tmp/test_file.dat bs=1M count=10"
  upload_cmd+=" && eos -r 0 0 mkdir -p ${EOS_RAIN_DIR}"
  upload_cmd+=" && eos -r 0 0 mkdir -p ${EOS_REPLICA_DIR}"
  upload_cmd+=" && eos -r 0 0 attr set default=replica ${EOS_REPLICA_DIR}"
  upload_cmd+=" && eos -r 0 0 attr set default=raid6 ${EOS_RAIN_DIR}"

  local name
  for name in "${replica_files[@]}"; do
    upload_cmd+=" && xrdcp -f /tmp/test_file.dat \${EOS_MGM_URL}/${EOS_REPLICA_DIR}/${name}"
  done
  for name in "${rain_files[@]}"; do
    upload_cmd+=" && xrdcp -f /tmp/test_file.dat \${EOS_MGM_URL}/${EOS_RAIN_DIR}/${name}"
  done

  exec_cmd eos-cli1 "${upload_cmd}"

  # Retrieve the fxid of each of the uploaded files
  FXID_D_MEM_SZ_DIFF=$(get_fxid "${EOS_REPLICA_DIR}/file_d_mem_sz_diff.dat")
  FXID_M_MEM_SZ_DIFF1=$(get_fxid "${EOS_REPLICA_DIR}/file_m_mem_sz_diff1.dat")
  FXID_M_MEM_SZ_DIFF2=$(get_fxid "${EOS_REPLICA_DIR}/file_m_mem_sz_diff2.dat")
  FXID_D_CX_DIFF=$(get_fxid "${EOS_REPLICA_DIR}/file_d_cx_diff.dat")
  FXID_M_CX_DIFF=$(get_fxid "${EOS_REPLICA_DIR}/file_m_cx_diff.dat")
  FXID_UNREG=$(get_fxid "${EOS_REPLICA_DIR}/file_unreg.dat")
  FXID_REP_MISSING=$(get_fxid "${EOS_REPLICA_DIR}/file_rep_missing.dat")
  FXID_REP_DIFF_UNDER=$(get_fxid "${EOS_REPLICA_DIR}/file_rep_diff_under.dat")
  FXID_REP_DIFF_OVER=$(get_fxid "${EOS_REPLICA_DIR}/file_rep_diff_over.dat")
  FXID_ORPHAN=$(get_fxid "${EOS_REPLICA_DIR}/file_orphan.dat")
  FXID_RAIN_BLOCKXS=$(get_fxid "${EOS_RAIN_DIR}/rain_blockxs_err.dat")
  FXID_RAIN_DIFF=$(get_fxid "${EOS_RAIN_DIR}/rain_stripe_diff.dat")
  FXID_RAIN_STRIPE_SZ=$(get_fxid "${EOS_RAIN_DIR}/rain_stripe_sz_err.dat")
  FXID_RAIN_INVALID_STRIPE=$(get_fxid "${EOS_RAIN_DIR}/rain_invalid_stripe_err.dat")
  FXID_RAIN_INVALID_STRIPE2=$(get_fxid "${EOS_RAIN_DIR}/rain_invalid_stripe_err2.dat")
  FXID_RAIN_DIFF_OVER=$(get_fxid "${EOS_RAIN_DIR}/rain_stripe_diff_over.dat")
  FXID_RAIN_DIFF_OVER2=$(get_fxid "${EOS_RAIN_DIR}/rain_stripe_diff_over2.dat")

  # If any of the FXID_* variables are empty then we have a problem
  local var
  for var in FXID_D_MEM_SZ_DIFF FXID_M_MEM_SZ_DIFF1 FXID_M_MEM_SZ_DIFF2 \
             FXID_D_CX_DIFF FXID_M_CX_DIFF FXID_UNREG FXID_REP_MISSING \
             FXID_REP_DIFF_UNDER FXID_REP_DIFF_OVER FXID_ORPHAN \
             FXID_RAIN_BLOCKXS FXID_RAIN_DIFF FXID_RAIN_STRIPE_SZ \
             FXID_RAIN_INVALID_STRIPE FXID_RAIN_INVALID_STRIPE2 \
             FXID_RAIN_DIFF_OVER FXID_RAIN_DIFF_OVER2; do
    if [[ -z "${!var}" ]]; then
      echo "error: some of the fxids could not be retrieved"
      cleanup
      exit 1
    fi
  done

  # Cleanup generated test file
  exec_cmd eos-cli1 "rm -rf /tmp/test_file.dat"
}

#-------------------------------------------------------------------------------
# Helpers shared by the functions of this script. They are defined after
# create_test_files, which is fine since nothing is called before the bottom
# of the file.
#-------------------------------------------------------------------------------

# Strip leading and trailing whitespace from every line read on stdin
function trim_ws() {
  sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

# Print the remote command which dumps the monitoring fileinfo of a file with
# one "key=value" pair per line.
# Arguments: $1 - hex file id (fxid)
function fileinfo_split_cmd() {
  printf '%s' "eos fileinfo fxid:${1} -m --fullpath | sed -r 's/[[:alnum:]]+=/\n&/g'"
}

# Print the trimmed fxid of a file of the EOS namespace.
# Arguments: $1 - EOS path of the file
function get_fxid() {
  local cmd="eos fileinfo ${1} -m"
  cmd+=" | sed -r 's/[[:alnum:]]+=/\n&/g'"
  cmd+=" | awk -F '=' '\$1==\"fxid\" {print \$2};'"
  cmd+=" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*\$//'"
  exec_cmd eos-cli1 "${cmd}"
}

# Print, for every location of a file, its fsid followed on the next line by
# the full local path of the replica/stripe (values are not trimmed).
# Arguments: $1 - hex file id (fxid)
function list_locations() {
  exec_cmd eos-cli1 "$(fileinfo_split_cmd "${1}") | awk -F '=' '{if (\$1 ==\"fsid\" || \$1 ==\"fullpath\") {print \$2};}'"
}

# Print the (untrimmed) fsid of every location of a file, one per line.
# Arguments: $1 - hex file id (fxid)
function list_fsids() {
  exec_cmd eos-cli1 "$(fileinfo_split_cmd "${1}") | awk -F '=' '{if (\$1 ==\"fsid\") {print \$2};}'"
}

# Print the fsid/fullpath pair (two lines) of the last location of a file.
# Arguments: $1 - hex file id (fxid)
function get_last_location() {
  list_locations "${1}" | tail -n2
}

# Print the trimmed fsid of a two line location record.
# Arguments: $1 - location record as printed by get_last_location
function location_fsid() {
  head -n1 <<< "${1}" | trim_ws
}

# Print the local path of a two line location record (not trimmed).
# Arguments: $1 - location record as printed by get_last_location
function location_lpath() {
  tail -n1 <<< "${1}"
}

# Print the trimmed local path registered for a given fsid.
# Arguments: $1 - fileinfo dump with one "key=value" pair per line
#            $2 - file system id
function location_fullpath() {
  # Anchor the match so that e.g. fsid=1 does not also select fsid=10
  grep -E -A1 "^fsid=${2}[[:space:]]*\$" <<< "${1}" \
    | tail -n1 \
    | sed -r -e 's/^fullpath=//' -e 's/[[:space:]]*$//'
}

# Print the number of file systems reported as online by the MGM
function count_online_fs() {
  exec_cmd eos-mgm1 "eos fs ls | grep \"online\" | wc -l"
}

# Print the smallest fsid in [1, number of online file systems] which is not
# part of the given list. Assumes that the online file systems have the ids
# 1..N - TODO confirm.
# Arguments: the fsids which are already in use
# Returns:   0 if such an fsid was found, 1 otherwise
function find_unused_fsid() {
  local fst_online candidate used in_use
  fst_online=$(count_online_fs)

  for (( candidate = 1; candidate <= ${fst_online}; candidate++ )); do
    in_use=false

    for used in "$@"; do
      if [[ "${candidate}" == "${used}" ]]; then
        in_use=true
        break
      fi
    done

    if [[ "${in_use}" == false ]]; then
      echo "${candidate}"
      return 0
    fi
  done

  return 1
}

# Print the second field of the first line(s) of the MGM configuration file
# starting with the given key.
# Arguments: $1 - configuration key e.g. mgmofs.qdbcluster
function get_mgm_config_value() {
  exec_cmd eos-mgm1 "cat /etc/xrd.cf.mgm | grep \"^${1}\" | awk '{print \$2;}'"
}

# Overwrite 10 bytes at offset 5000 of a stripe with random data and then
# recompute its block checksums so that only the RAIN parity check fails.
# Arguments: $1 - fsid of the FST holding the stripe
#            $2 - local path of the stripe on that FST
function corrupt_stripe_data() {
  exec_cmd "eos-fst${1}" "dd if=/dev/urandom of=${2} bs=1 seek=5000 count=10 conv=notrunc"
  exec_cmd "eos-fst${1}" "eos-compute-blockxs ${2}"
}

# Copy the stripe of the first location of a RAIN file (together with its
# .xsmap file) to a file system which holds no stripe yet, after registering
# that file system as a new location, then force a resync of the new fmd.
# Arguments: $1 - hex file id (fxid)
#            $2 - EOS path of the file
function duplicate_rain_stripe() {
  local fxid=$1
  local eos_path=$2
  local used_fsids new_fsid fileinfo src_lpath dst_lpath

  # Word splitting is wanted here: one array element per fsid
  # shellcheck disable=SC2207
  used_fsids=( $(list_fsids "${fxid}" | trim_ws) )
  echo "Used locations: ${used_fsids[*]}"

  if ! new_fsid=$(find_unused_fsid "${used_fsids[@]}"); then
    echo "error: no new FSID found for replication command"
    exit 1
  fi

  # Start by tagging the file on the new fst
  exec_cmd eos-cli1 "eos -r 0 0 file tag ${eos_path} +${new_fsid}"

  # Get local path of source and local path of the destination directory
  fileinfo=$(exec_cmd eos-cli1 "$(fileinfo_split_cmd "${fxid}")")
  src_lpath=$(location_fullpath "${fileinfo}" "${used_fsids[0]}")
  dst_lpath=$(location_fullpath "${fileinfo}" "${new_fsid}")

  if [[ -z "${src_lpath}" || -z "${dst_lpath}" ]]; then
    echo "error: could not get the local paths for fxid:${fxid}"
    exit 1
  fi

  # Directories keep their trailing slash
  local src_ldir="${src_lpath%/*}/"
  local src_lfile="${src_lpath##*/}"
  local dst_ldir="${dst_lpath%/*}/"

  cp_file_with_xattr_cmd "eos-fst${used_fsids[0]}" "${src_ldir}" "${src_lfile}" \
                         "eos-fst${new_fsid}" "${dst_ldir}"
  cp_file_with_xattr_cmd "eos-fst${used_fsids[0]}" "${src_ldir}" "${src_lfile}.xsmap" \
                         "eos-fst${new_fsid}" "${dst_ldir}"

  # Force resync of fmd on new fst
  exec_cmd eos-cli1 "eos -r 0 0 file verify fxid:${fxid} ${new_fsid} -resync"
}

#-------------------------------------------------------------------------------
# Corruption functions - one per type of fsck error
#-------------------------------------------------------------------------------

# Corrupt file to generate d_mem_sz_diff error
function corrupt_d_mem_sz_diff() {
  local location fsid lpath
  location=$(get_last_location "${FXID_D_MEM_SZ_DIFF}")
  fsid=$(location_fsid "${location}")
  lpath=$(location_lpath "${location}")
  # Append data to the physical file without the FST noticing
  exec_cmd "eos-fst${fsid}" "echo \"dummy\" >> ${lpath}"
}

# Corrupt file to generate m_mem_sz_diff - where repair just needs to update the ns size
function corrupt_m_mem_sz_diff1() {
  # Use the eos-ns-inspect tool to corrupt the MGM file size
  local qdb_cluster qdb_pwdfile
  qdb_cluster=$(get_mgm_config_value "mgmofs.qdbcluster")
  qdb_pwdfile=$(get_mgm_config_value "mgmofs.qdbpassword_file")
  # eos-ns-inspect expects the decimal file id
  local fid=$(( 16#${FXID_M_MEM_SZ_DIFF1} ))
  exec_cmd eos-cli1 "eos-ns-inspect change-fid --no-dry-run --members ${qdb_cluster} --password-file ${qdb_pwdfile} --fid ${fid} --new-size 1234568"
  exec_cmd eos-cli1 "eos -r 0 0 ns cache drop-single-file ${fid} || true"
}

# Corrupt file to generate m_mem_sz_diff - where repair involves dropping the
# broken replica and triggering a replication of the correct one
function corrupt_m_mem_sz_diff2() {
  local location fsid lpath
  location=$(get_last_location "${FXID_M_MEM_SZ_DIFF2}")
  fsid=$(location_fsid "${location}")
  lpath=$(location_lpath "${location}")
  # Corrupt the size and disksize values registered in the db by appending
  # some data to the physical file and then triggering a file verify
  exec_cmd "eos-fst${fsid}" "echo \"dummy content\" >> ${lpath}"
  exec_cmd "eos-mgm1" "eos file verify fxid:${FXID_M_MEM_SZ_DIFF2}"
}

# Corrupt file to generate d_cx_diff error
function corrupt_d_cx_diff() {
  local location fsid lpath
  location=$(get_last_location "${FXID_D_CX_DIFF}")
  fsid=$(location_fsid "${location}")
  lpath=$(location_lpath "${location}")
  # Corrupt the checksum of the file by writing random bytes to it
  exec_cmd "eos-fst${fsid}" "dd if=/dev/urandom of=${lpath} bs=1M count=10"
}

# Corrupt file to generate m_cx_diff
function corrupt_m_cx_diff() {
  # Use the eos-ns-inspect tool to corrupt the MGM checksum value
  local qdb_cluster qdb_pwdfile
  qdb_cluster=$(get_mgm_config_value "mgmofs.qdbcluster")
  qdb_pwdfile=$(get_mgm_config_value "mgmofs.qdbpassword_file")
  local fid=$(( 16#${FXID_M_CX_DIFF} ))
  exec_cmd eos-cli1 "eos-ns-inspect change-fid --no-dry-run --members ${qdb_cluster} --password-file ${qdb_pwdfile} --fid ${fid} --new-checksum 12345678"
}

# Corrupt file to generate rep_missing_n error
function corrupt_rep_missing_n() {
  local location fsid lpath
  location=$(get_last_location "${FXID_REP_MISSING}")
  fsid=$(location_fsid "${location}")
  lpath=$(location_lpath "${location}")
  # Remove the physical replica behind the back of the FST
  exec_cmd "eos-fst${fsid}" "rm -rf ${lpath}"
}

# Corrupt file to generate rep_diff_under error
function corrupt_rep_diff_under() {
  local location fsid
  location=$(get_last_location "${FXID_REP_DIFF_UNDER}")
  fsid=$(location_fsid "${location}")
  exec_cmd eos-cli1 "eos -r 0 0 file drop fxid:${FXID_REP_DIFF_UNDER} ${fsid}"
}

# Corrupt file to generate rep_diff_over error
function corrupt_rep_diff_over() {
  local used_fsids new_fsid

  # Only the last two locations are taken into account. Word splitting is
  # wanted here: one array element per fsid
  # shellcheck disable=SC2207
  used_fsids=( $(list_fsids "${FXID_REP_DIFF_OVER}" | tail -n2 | trim_ws) )
  echo "Used locations: ${used_fsids[*]}"

  if ! new_fsid=$(find_unused_fsid "${used_fsids[@]}"); then
    echo "error: no new FSID found for replication command"
    exit 1
  fi

  exec_cmd eos-cli1 "eos -r 0 0 file replicate fxid:${FXID_REP_DIFF_OVER} ${used_fsids[0]} ${new_fsid}"
}

# Corrupt file to generate file_unreg error
function corrupt_unreg() {
  local location fsid
  location=$(get_last_location "${FXID_UNREG}")
  fsid=$(location_fsid "${location}")
  exec_cmd eos-cli1 "eos -r 0 0 file drop fxid:${FXID_UNREG} ${fsid} -f"
}

# Corrupt file to generate RAIN block checksum error
function corrupt_rain_blocxs_err() {
  local location fsid lpath
  location=$(get_last_location "${FXID_RAIN_BLOCKXS}")
  fsid=$(location_fsid "${location}")
  lpath=$(location_lpath "${location}")
  # Corrupt the checksum of the file by writing random bytes at the beginning of the file
  exec_cmd "eos-fst${fsid}" "dd if=/dev/urandom of=${lpath} bs=1 seek=4 count=3 conv=notrunc"
}

# Corrupt file to generate rain stripe diff error
function corrupt_rain_stripe_diff() {
  local location fsid
  location=$(get_last_location "${FXID_RAIN_DIFF}")
  fsid=$(location_fsid "${location}")
  exec_cmd eos-cli1 "eos -r 0 0 file drop fxid:${FXID_RAIN_DIFF} ${fsid} -f"
}

# Corrupt file to generate orphan_n error
function corrupt_orphan() {
  local location fsid
  location=$(get_last_location "${FXID_ORPHAN}")
  fsid=$(location_fsid "${location}")
  # Do it twice to force delete the filemd object and set the file system off
  # so that the deletion message is not sent before we drop the list of
  # deletions.
  exec_cmd eos-mgm1 "eos fs config ${fsid} configstatus=off"
  exec_cmd eos-mgm1 "eos rm -F fxid:${FXID_ORPHAN}"
  exec_cmd eos-mgm1 "eos rm -F fxid:${FXID_ORPHAN}"
  exec_cmd eos-mgm1 "eos fs dropdeletion ${fsid}"
  exec_cmd eos-mgm1 "eos fs config ${fsid} configstatus=rw"
}

# Corrupt file to generate RAIN stripe size error
function corrupt_rain_stripe_size() {
  local location fsid lpath
  location=$(get_last_location "${FXID_RAIN_STRIPE_SZ}")
  fsid=$(location_fsid "${location}")
  lpath=$(location_lpath "${location}")
  # Corrupt the size of the stripe by truncating to 0
  exec_cmd "eos-fst${fsid}" "truncate --size 0 ${lpath}"
}

# Corrupt stripe to generate RAIN stripe error
function corrupt_rain_invalid_stripe() {
  local location fsid lpath
  location=$(get_last_location "${FXID_RAIN_INVALID_STRIPE}")
  fsid=$(location_fsid "${location}")
  lpath=$(location_lpath "${location}")
  # Corrupt data in the stripe then fix the checksum
  corrupt_stripe_data "${fsid}" "${lpath}"
}

# Corrupt 2 stripes to generate RAIN stripe error
function corrupt_rain_invalid_stripe2() {
  local locations second_location last_location
  locations=$(list_locations "${FXID_RAIN_INVALID_STRIPE2}")
  # Lines 3-4 hold the second location, the last two lines the last one
  second_location=$(head -n 4 <<< "${locations}" | tail -n 2)
  last_location=$(tail -n 2 <<< "${locations}")

  # Corrupt data in both stripes then fix their checksums
  corrupt_stripe_data "$(location_fsid "${second_location}")" \
                      "$(location_lpath "${second_location}")"
  corrupt_stripe_data "$(location_fsid "${last_location}")" \
                      "$(location_lpath "${last_location}")"
}

# Duplicate stripe to generate RAIN stripe error
function corrupt_rain_stripe_diff_over() {
  duplicate_rain_stripe "${FXID_RAIN_DIFF_OVER}" \
                        "${EOS_RAIN_DIR}/rain_stripe_diff_over.dat"
}

# Duplicate stripe and corrupt another stripe to generate RAIN stripe error
function corrupt_rain_stripe_diff_over2() {
  # First corrupt one stripe
  local location
  location=$(get_last_location "${FXID_RAIN_DIFF_OVER2}")
  corrupt_stripe_data "$(location_fsid "${location}")" \
                      "$(location_lpath "${location}")"

  # Then duplicate another one
  duplicate_rain_stripe "${FXID_RAIN_DIFF_OVER2}" \
                        "${EOS_RAIN_DIR}/rain_stripe_diff_over2.dat"
}

# Configure fsck to run more often and reduce the scan times
function configure_fsck() {
  # First reduce the scan interval on the FSTs
  local fst_online i
  fst_online=$(count_online_fs)

  for (( i = 1; i <= ${fst_online}; i++ )); do
    exec_cmd eos-cli1 "eos -r 0 0 fs config ${i} scan_disk_interval=20 &&
                       eos -r 0 0 fs config ${i} scan_ns_interval=20 &&
                       eos -r 0 0 fs config ${i} scaninterval=15 &&
                       eos -r 0 0 fs config ${i} scan_rain_interval=15 &&
                       eos -r 0 0 fs config ${i} fsck_refresh_interval=20"
  done

  # Reduce the interval when the fsck collection thread runs
  exec_cmd eos-cli1 "eos -r 0 0 fsck config toggle-collect 0.25;"
}

# Check that we collected all the errors that we expect. The fsck report is
# polled every 5 seconds; if some fxid is still not reported once the delay
# expired, the full report is printed, cleanup is called and the script exits
# with status 1.
# Arguments: $1 - optional max delay in seconds, default 300
function check_all_errors_collected() {
  # Allow for at most max_delay seconds to collect all the errors
  local max_delay=${1:-"300"}
  # NOTE(review): FXID_M_CX_DIFF is deliberately absent from this list since
  # the original check did not require it either - confirm whether the
  # m_cx_diff error is expected to be collected.
  local required=(
    FXID_D_MEM_SZ_DIFF FXID_M_MEM_SZ_DIFF1 FXID_M_MEM_SZ_DIFF2
    FXID_D_CX_DIFF FXID_UNREG FXID_REP_MISSING FXID_REP_DIFF_UNDER
    FXID_REP_DIFF_OVER FXID_RAIN_BLOCKXS FXID_RAIN_DIFF FXID_RAIN_STRIPE_SZ
    FXID_ORPHAN FXID_RAIN_INVALID_STRIPE FXID_RAIN_INVALID_STRIPE2
    FXID_RAIN_DIFF_OVER FXID_RAIN_DIFF_OVER2
  )
  local start_time now elapsed report var
  local missing=()
  start_time=$(date +%s)

  while true; do
    now=$(date +%s)
    elapsed=$(( now - start_time ))
    # Fetch the report once per iteration. A failing query is treated like
    # an empty report, hence the "|| true" guarding against "set -e".
    report=$(exec_cmd eos-cli1 "eos -r 0 0 fsck report -i") || true
    missing=()

    for var in "${required[@]}"; do
      if ! grep -q -- "${!var}" <<< "${report}"; then
        missing+=("${var}")
      fi
    done

    if (( ${#missing[@]} == 0 )); then
      echo "info: found all the errors we were expecting"
      return 0
    fi

    if (( elapsed >= max_delay )); then
      echo "error: some errors not discovered"
      echo "error: not yet reported: ${missing[*]}"
      # Best effort dump of the full report, cleanup must still run
      exec_cmd eos-cli1 "eos -r 0 0 fsck report -i -a" || true
      cleanup
      exit 1
    fi

    echo "info: sleep for 5 seconds waiting for error collection, $(secs_to_human "${elapsed}") passed"
    sleep 5
  done
}

# Cleanup the files and directories at the MGM
function cleanup() {
  exec_cmd eos-cli1 "eos -r 0 0 rm -rF \"${EOS_REPLICA_DIR}/*\" &&
                     eos -r 0 0 rmdir ${EOS_REPLICA_DIR}/ &&
                     eos -r 0 0 rm -rF \"${EOS_RAIN_DIR}/*\" &&
                     eos -r 0 0 rmdir ${EOS_RAIN_DIR}/"
  exec_cmd eos-cli1 "eos -r 0 0 fsck config toggle-collect"
}

#-------------------------------------------------------------------------------
# Main
#-------------------------------------------------------------------------------

# @todo the whole args parsing story should be rewritten, possibly decoupling
# docker/kubernetes execution from the test itself
MAX_DELAY=300

# Late addition of the optional argument "--max-delay", it must be the first arg
if [[ "$1" == "--max-delay" ]]; then
  # The value ends up in arithmetic comparisons so it must be a number
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "error: --max-delay requires a numeric value in seconds"
    usage
    exit 1
  fi

  MAX_DELAY=$2
  shift 2 # past argument and value
fi

# Set up global variables - these are presumably read by exec_cmd to select
# the way commands are executed; verify against eos-test-utils
IS_DOCKER=false
IS_LOCAL=false
K8S_NAMESPACE=""

if [[ $# -lt 2 ]]; then
  echo "error: invalid number of arguments"
  usage
  exit 1
fi

if [[ "$1" != "--type" ]]; then
  echo "error: unknown argument \"$1\""
  usage
  exit 1
fi

case "$2" in
  docker)
    IS_DOCKER=true
    ;;
  local)
    IS_LOCAL=true
    ;;
  k8s)
    # Kubernetes is selected by IS_DOCKER=false and IS_LOCAL=false
    ;;
  *)
    echo "error: unknown type of executor \"$2\""
    usage
    exit 1
    ;;
esac

if [[ "${IS_LOCAL}" == true && -z "${EOS_MGM_URL}" ]]; then
  echo "error: EOS_MGM_URL env needs to be set for local!"
  exit 1
fi

if [[ "${IS_DOCKER}" == false && "${IS_LOCAL}" == false ]]; then
  # For the Kubernetes setup we also need a namespace argument
  if [[ $# -lt 3 ]]; then
    echo "error: missing Kubernetes namespace argument"
    usage
    exit 1
  fi

  K8S_NAMESPACE="$3"
fi

echo "eos-fsck-test configuration:"
echo "MAX_DELAY=$MAX_DELAY"
echo "IS_DOCKER=$IS_DOCKER"
echo "IS_LOCAL=$IS_LOCAL"
echo "K8S_NAMESPACE=$K8S_NAMESPACE (if IS_DOCKER=false)"
echo

# Create test file
create_test_files

# Create different type of corruptions for different files
corrupt_d_mem_sz_diff
corrupt_m_mem_sz_diff1
corrupt_m_mem_sz_diff2
corrupt_d_cx_diff
corrupt_m_cx_diff
corrupt_rep_missing_n
corrupt_rep_diff_under
corrupt_rep_diff_over
corrupt_unreg
corrupt_rain_blocxs_err
corrupt_rain_stripe_diff
corrupt_rain_stripe_size
corrupt_orphan
corrupt_rain_invalid_stripe
corrupt_rain_invalid_stripe2
corrupt_rain_stripe_diff_over
corrupt_rain_stripe_diff_over2

# Configure fsck to run more often and reduce scan times
configure_fsck

# Check that we are collecting all the expected errors
check_all_errors_collected "${MAX_DELAY}"

# Enable the repair thread and allow a delay of at most MAX_DELAY seconds to
# correct all the discovered errors
exec_cmd eos-cli1 "eos -r 0 0 fsck config toggle-repair"

# Cleanup the orphan entries
exec_cmd eos-cli1 "eos -r 0 0 fsck clean_orphans"

# Wait for all the errors to be repaired
START_TIME=$(date +%s)

while true; do
  CURRENT_TIME=$(date +%s)
  ELAPSED=$(( CURRENT_TIME - START_TIME ))
  # A failing query is treated like an empty report, hence the "|| true"
  # guarding against "set -e"
  HAS_ERRORS=$(exec_cmd eos-cli1 "eos -r 0 0 fsck report -i") || true

  if [[ -z "${HAS_ERRORS}" ]]; then
    echo "info: all errors were repaired"
    break
  fi

  if (( ELAPSED >= MAX_DELAY )); then
    echo "error: some errors were not repaired"
    # Best effort dump of the full report, cleanup must still run
    exec_cmd eos-cli1 "eos -r 0 0 fsck report -i -a" || true
    cleanup
    exit 1
  fi

  echo "info: sleep for 5 seconds waiting for error repair, $(secs_to_human "${ELAPSED}") passed"
  sleep 5
done

cleanup