#!/usr/bin/env expect
############################################################################
# Purpose: Test removing clusters from federation
#
# Reqs:    1. Using slurmdbd accounting storage type and is up
#          2. fed_slurm_base is defined in globals.local - set to directory that
#          has access to each federation configure (fedc1, fedc2, fedc3).
#          Eg.
#          fedr/slurm/ (src)
#          fedr/fed1/bin
#          fedr/fed1/sbin
#          fedr/fed1/etc
#          fedr/fed1/...
#          fedr/fed2/...
#          fedr/fed3/...
#          3. controllers are up and running.
############################################################################
# Copyright (C) 2017 SchedMD LLC.
# Written by Isaac Hartung
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation

set exit_code     0
set fed_name      "feda"
set user_name     ""
set srun_job_cnt  0
set my_sbatch     "${fed_slurm_base}/$fedc1/bin/sbatch"
set my_squeue     "${fed_slurm_base}/$fedc1/bin/squeue"
set my_sacctmgr   "${fed_slurm_base}/$fedc1/bin/sacctmgr"
set my_srun       "${fed_slurm_base}/$fedc2/bin/srun"
set my_sacct      "${fed_slurm_base}/$fedc1/bin/sacct"
set my_scontrol   "${fed_slurm_base}/$fedc1/bin/scontrol"
set eol           "\r\n"

#
# Check accounting config and bail if not found.
#
if { [test_account_storage] == 0 } {
	skip "This test can't be run without a usable AccountStorageType"
}

if { [string compare [check_accounting_admin_level] "Administrator"] } {
	skip "This test can't be run without being an Accounting administrator. Use: sacctmgr mod user \$USER set admin=admin"
}

#
# Assign one cluster feature to each federation cluster
# (fedc1 -> fa, fedc2 -> fb, fedc3 -> fc) with "sacctmgr modify cluster".
#
# Each sacctmgr run must produce the four expected output fragments;
# otherwise the test is ended through end_it 1.
#
proc setup_cluster_features {} {
	global sacctmgr fedc1 fedc2 fedc3 eol

	set exit_code 0

	foreach {cluster feature} [list $fedc1 fa $fedc2 fb $fedc3 fc] {
		set matches 0
		set my_pid [spawn $sacctmgr -i modify cluster $cluster set features=$feature]
		expect {
			-re "Setting$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+Feature\\s+=\\s+$feature" {
				incr matches
				exp_continue
			}
			-re "Modified cluster...$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+$cluster$eol" {
				incr matches
				exp_continue
			}
			timeout {
				log_error "sacctmgr mod not responding"
				slow_kill $my_pid
				set exit_code 1
			}
			eof {
				wait
			}
		}
		if {$exit_code || $matches != 4} {
			log_error "Unexpected error. Got $matches"
			end_it 1
		}
	}
}

#
# Send "I'm Alive" to a job with "scontrol notify" and expect the srun
# that was spawned under the given index to print text matching regex.
#
# fed    - cluster name passed to scontrol -M
# job_id - job to notify
# index  - key into srun_spawn_id() selecting the srun to watch
# regex  - pattern the srun output must match exactly once
#
proc scontrol { fed job_id index regex } {
	global my_scontrol srun_spawn_id

	set matches 0

	log_debug "Executing $my_scontrol -M$fed notify $job_id I'm Alive"
	log_debug "Output: [exec $my_scontrol -M$fed notify $job_id I'm Alive]"

	# Switch expect over to the previously spawned srun.
	set spawn_id $srun_spawn_id($index)
	expect {
		-re "$regex" {
			incr matches
		}
		timeout {
			log_error "srun not responding"
			end_it 1
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		log_error "srun failure: expected $regex"
		end_it 1
	}
}

#
# Submit a 10 node exclusive "sleep 300" batch job with $my_sbatch.
#
# options - extra sbatch options (may be an empty string)
#
# Returns the job id parsed from the "Submitted batch job" line; ends the
# test if that line is not seen.
#
proc sbatch { options } {
	global number my_sbatch

	set matches 0
	set job_id 0
	set command "$my_sbatch -N10 --exclusive -o/dev/null "
	append command $options
	append command " --wrap \"sleep 300\""
	set regex "Submitted batch job ($number)"

	spawn {*}$command
	expect {
		-re "$regex" {
			incr matches
			set job_id $expect_out(1,string)
		}
		timeout {
			log_error "sbatch not responding"
			end_it 1
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		log_error "batch submit failure: expected $regex"
		end_it 1
	}

	return $job_id
}

#
# Start a 10 node exclusive "sleep 300" srun with $my_srun and remember
# its spawn id in srun_spawn_id($index) so it can be inspected later.
#
# options - extra srun options (may be an empty string)
# index   - key under which the spawn id is stored
#
# Returns the first number matched in the srun output, used as the job id.
#
proc srun { options index } {
	global number srun_spawn_id my_srun

	set matches 0
	set job_id 0
	set command "$my_srun --exclusive -N10 "
	append command $options
	append command " sleep 300"
	set regex "($number)"

	spawn {*}$command
	set srun_spawn_id($index) $spawn_id
	expect {
		-re "$regex" {
			incr matches
			set job_id $expect_out(1,string)
		}
		timeout {
			log_error "srun not responding"
			end_it 1
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		log_error "srun failure: expected $regex"
		end_it 1
	}

	return $job_id
}

#
# Run squeue against one cluster and count how often regex matches.
#
# fed   - cluster name passed to squeue -M
# m     - number of matches required
# regex - pattern (normally an alternation, one branch per expected job)
#
# Ends the test when the match count differs from m.
#
proc squeue { fed m regex } {
	global my_squeue

	set matches 0
	set command "$my_squeue -Ostatecompact:.4,name:.10,jobid:.10,siblingsviable:.20,siblingsactive:.20 -a -M$fed"

	spawn {*}$command
	expect {
		-re "$regex" {
			incr matches
			exp_continue
		}
		eof {
			# Reap squeue, as the other procs do for their commands.
			wait
		}
	}
	if {$matches != $m} {
		log_error "unexpected error in squeue. expected $regex. Matched $matches/$m times."
		end_it 1
	}
}

#
# Cancel every job of $user_name on all three federation clusters, then
# pause for five seconds.
#
proc cancel_federation_jobs { } {
	global scancel user_name fedc1 fedc2 fedc3

	spawn $scancel -M$fedc1,$fedc2,$fedc3 --user $user_name
	expect {
		eof {
			wait
		}
	}
	sleep 5
}

#
# Cancel a single job.
#
# job_id  - job to cancel
# options - one extra scancel argument (e.g. "-M<cluster>")
#
proc cancel_job { job_id options } {
	global scancel

	spawn $scancel $options $job_id
	expect {
		eof {
			wait
		}
	}
}

#
# Cancel the federation jobs, remove this test's *.out files and delete
# the test federation. Returns the result of delete_federations.
#
proc cleanup { } {
	global fed_name bin_rm test_id bin_bash

	cancel_federation_jobs
	exec $bin_bash -c "$bin_rm -f test$test_id*.out"

	return [delete_federations $fed_name]
}

#
# Clean up and finish the test: fail when exit_code is non-zero,
# pass otherwise.
#
proc end_it { exit_code } {
	cleanup
	if {$exit_code != 0} {
		fail "Test failed due to previous errors (\$exit_code = $exit_code)"
	}
	pass
}

#start test

if {[test_federation_setup]} {
	skip "This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local"
}

# NOTE(review): a non-zero test_all_up result ends the test through pass;
# confirm whether skip is what is wanted here.
if {[test_all_up]} {
	pass
}

set user_name [get_my_user_name]

# Remove existing setup
if {[cleanup] != 0} {
	log_error "failed to cleanup"
	end_it 1
}

# add clusters to federation
if {[setup_federation $fed_name]} {
	log_error "failed to setup federation"
	end_it 1
}

setup_cluster_features

log_info "################################################################"
log_info "Test sacctmgr remove federation"
log_info "################################################################"

cancel_federation_jobs

sleep 5
set srun_spawn_id(0) 0

# One running job per cluster.
set j(0) [sbatch -M$fedc1]
wait_for_fed_job $j(0) RUNNING $fedc1
set j(1) [sbatch -M$fedc2]
wait_for_fed_job $j(1) RUNNING $fedc2
set j(2) [sbatch ""]
wait_for_fed_job $j(2) RUNNING $fedc3

# Pending jobs restricted by cluster feature.
set j(6)  [sbatch "--cluster-constraint=fb,fc"]
set j(7)  [sbatch "--cluster-constraint=fb,fc"]
set j(8)  [sbatch "--cluster-constraint=fb,fc"]
set j(9)  [sbatch "--cluster-constraint=fb"]
set j(10) [sbatch "--cluster-constraint=fa"]

# Pending jobs submitted with each cluster's own sbatch in turn.
set j(3) [sbatch ""]
set my_sbatch "${fed_slurm_base}/$fedc2/bin/sbatch"
set j(4) [sbatch ""]
set my_sbatch "${fed_slurm_base}/$fedc3/bin/sbatch"
set j(5) [sbatch ""]

srun "" 0

sleep 5

# Verify jobs are where they should be
# O:  origin
# R:  running
# PD: Pending
# RV: Revoked
#
# fed1         fed2         fed3
# j(0)O:R
#              j(1)O:R
# j(2)O:RV                  j(2)R
# j(3)O:PD     j(3)PD       j(3)PD
# j(4)PD       j(4)O:PD     j(4)PD
# j(5)PD       j(5)PD       j(5)O:PD
#
# j(6)O:RV     j(6)PD       j(6)PD
# j(7)O:RV     j(7)PD       j(7)PD
# j(8)O:RV     j(8)PD       j(8)PD
# j(9)O:RV     j(9)PD
# j(10)O:PD
#
# srun:PD      srun:O:PD    srun:PD

set f1 "\\s+$fedc1"
set f2 "\\s+$fedc2"
set f3 "\\s+$fedc3"
set n  "\\s+NA"
set srun_job "sleep\\s+\\d+"

set sib  "\\s+$fedc1,$fedc2,$fedc3"
set sib2 "\\s+$fedc2,$fedc3"

# Each alternative is: job id, viable siblings, active siblings.
set r1 "$j(0)$f1$f1|$j(2)$sib$f3|$j(3)$sib$sib|$j(4)$sib$n|$j(5)$sib$n|$srun_job$sib$n|$j(6)$sib2$sib2|$j(7)$sib2$sib2|$j(8)$sib2$sib2|$j(9)$f2$f2|$j(10)$f1$f1"
set r2 "$j(1)$f2$f2|$j(3)$sib$n|$j(4)$sib$sib|$j(5)$sib$n|$srun_job$sib$sib|$j(6)$sib2$n|$j(7)$sib2$n|$j(8)$sib2$n|$j(9)$f2$n"
set r3 "$j(2)$sib$f3|$j(3)$sib$n|$j(4)$sib$n|$j(5)$sib$sib|$srun_job$sib$n"

squeue "$fedc1" "11" $r1
squeue "$fedc2" "9" $r2
squeue "$fedc3" "5" $r3

log_info "################################################################"
log_info "Remove $fedc1"
log_info "################################################################"

# sacctmgr is spawned without being waited on; the sleep presumably gives
# the change time to take effect before the checks below.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc1
sleep 5

# Verify:
# 1. If the origin is removed, pending jobs are left on the origin cluster
#    and removed from the non-origin clusters.
# 2. If there is a sibling job on a non-origin sibling and that sibling is
#    the only viable sibling, then that job becomes non-federated and
#    remains pending, and the origin tracking job is removed.
# 3. If there is a job that has multiple viable siblings and the origin is
#    not a viable sibling, then the non-origin jobs stay federated jobs and
#    the origin tracking job is removed. These jobs will schedule amongst
#    themselves to start.
# 4. Non-origin running jobs are still running and don't have federation
#    information (e.g. viable and active siblings).
# 5. Non-origin jobs have the removed cluster removed from the viable and
#    active siblings.
# 6. Pending sruns don't get a terminated message when siblings are
#    revoked.
# 7. Jobs schedule amongst the siblings when the origin is removed.
# 8. Jobs run when they have only one viable sibling and the origin is
#    removed.
#
# fed1         fed2         fed3
# j(0)O:R
#              j(1)O:R
#                           j(2)R
# j(3)O:PD
#              j(4)O:PD     j(4)PD
#              j(5)PD       j(5)O:PD
#              srun:O:PD    srun:PD
#
#              j(6)PD       j(6)PD
#              j(7)PD       j(7)PD
#              j(8)PD       j(8)PD
#              j(9)PD
# j(10)O:PD

set sib "\\s+$fedc2,$fedc3"

set r1 "$j(0)$n$n|$j(3)$n$n|$j(10)$n$n"
set r2 "$j(1)$f2$f2|$j(4)$sib$sib|$j(5)$sib$n|$srun_job$sib$sib|$j(6)$sib$n|$j(7)$sib$n|$j(8)$sib$n|$j(9)$n$n"
set r3 "$j(2)$n$n|$j(4)$sib$n|$j(5)$sib$sib|$srun_job$sib$n|$j(6)$sib$n|$j(7)$sib$n|$j(8)$sib$n"

squeue "$fedc1" "3" $r1
squeue "$fedc2" "8" $r2
squeue "$fedc3" "7" $r3

#check db

# Test that job4 is marked REVOKED on the removed cluster (fed1) and still
# pending on the other clusters.
set matches 0
set r1 "REVOKED$f1"
set r2 "PENDING$f2"
set r3 "PENDING$f3"
spawn $my_sacct -o state,cluster -j $j(4) -M$fedc1,$fedc2,$fedc3 -D
expect {
	-re "$r1|$r2|$r3" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if {$matches != 3} {
	log_error "unexpected error in sacct. $matches/3"
	end_it 1
}

# Test that job3 is marked REVOKED on the siblings clusters (fed2,fed3) and
# pending on the origin cluster.
# sacct must report one record per cluster for job3, each in the expected
# state: PENDING on fedc1, REVOKED on fedc2 and fedc3.
set matches 0
set r1 "PENDING$f1"
set r2 "REVOKED$f2"
set r3 "REVOKED$f3"
spawn $my_sacct -o state,cluster -j $j(3) -M$fedc1,$fedc2,$fedc3 -D
expect {
	-re "$r1|$r2|$r3" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if {$matches != 3} {
	log_error "unexpected error in sacct. $matches/3"
	end_it 1
}

# Cancel running job on fed2 so that j(6) will schedule amongst fed2 and fed3
# to start the job.
cancel_job $j(1) "-M$fedc2"
set run_cluster [wait_for_fed_job $j(6) RUNNING $fedc2]
if {[string compare $run_cluster ""] == 0} {
	log_error "Didn't find running job on cluster"
	end_it 1
}

log_info "################################################################"
log_info "Remove $fedc2"
log_info "################################################################"

# sacctmgr is spawned without being waited on; the sleep presumably gives
# the change time to take effect before the checks below.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc2
sleep 5

# Verify:
# fed1         fed2         fed3
# j(0)O:R
#              j(6)R
#                           j(2)R
# j(3)O:PD
#              j(4)O:PD
#                           j(5)O:PD
#              srun:O:PD
#
#                           j(7)PD
#                           j(8)PD
#              j(9)PD
# j(10)O:PD

# The pending srun (index 0) must not have printed a "Force Terminated"
# message. Use a one second timeout so the check returns quickly when
# srun stays silent.
set old_timeout $timeout
set timeout 1
set matches 0
set spawn_id $srun_spawn_id(0)
expect {
	-re "srun: Force Terminated job $number" {
		incr matches
		exp_continue
	}
	eof {}
}
if {$matches != 0} {
	log_error "srun got signaled when it shouldn't have."
	end_it 1
}
set timeout $old_timeout

set r1 "$j(0)$n$n|$j(3)$n$n|$j(10)$n$n"
set r2 "$j(6)$n$n|$j(4)$n$n|$srun_job$n$n|$j(9)$n$n"
set r3 "$j(2)$n$n|$j(5)$f3$f3|$j(7)$f3$n|$j(8)$f3$n"

squeue "$fedc1" "3" $r1
squeue "$fedc2" "4" $r2
squeue "$fedc3" "4" $r3

# Cancel running job on fed3 so that j(7) will be scheduled on fed3.
# j(7) is still considered a federated job but only has one sibling.
# Free fedc3 by cancelling j(2), then wait for j(7) to start there.
cancel_job $j(2) "-M$fedc3"
set run_cluster [wait_for_fed_job $j(7) RUNNING $fedc3]
if {$run_cluster eq ""} {
	log_error "Didn't find running job on cluster"
	end_it 1
}

log_info "################################################################"
log_info "Remove $fedc3"
log_info "################################################################"

# Drop the last cluster from the federation; sacctmgr is spawned and not
# waited on, followed by a fixed pause.
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc3
sleep 5

# Verify:
# fed1         fed2         fed3
# j(0)O:R
#              j(6)R
# j(3)O:PD
#              j(4)O:PD
#                           j(5)O:PD
#              srun:O:PD
#
#                           j(7)R
#                           j(8)PD
#              j(9)PD
# j(10)O:PD

# Every remaining job is expected to show NA for both sibling columns.
set r1 "$j(0)$n$n|$j(3)$n$n|$j(10)$n$n"
set r2 "$j(6)$n$n|$j(4)$n$n|$srun_job$n$n|$j(9)$n$n"
set r3 "$j(5)$n$n|$j(7)$n$n|$j(8)$n$n"

foreach {cluster expected pattern} [list $fedc1 "3" $r1 $fedc2 "4" $r2 $fedc3 "3" $r3] {
	squeue "$cluster" $expected $pattern
}

log_info "################################################################"
log_info "Test revoked origin jobs are removed."
log_info "################################################################"

# Start from scratch:
# - start jobs on all clusters,
# - remove the non-origin clusters.
# Running jobs on the non-origin clusters should remain running.
# Revoked tracking jobs on origin should go away.

cancel_federation_jobs

# Put all three clusters back into the federation.
exec $my_sacctmgr mod fed $fed_name set -i clusters=$fedc1,$fedc2,$fedc3

# Test whether origin tracking jobs are removed when the remote cluster is
# removed.
# All sruns of this phase are submitted with fedc1's srun binary.
set my_srun "${fed_slurm_base}/$fedc1/bin/srun"

set j(0) [srun -M$fedc1 1]
wait_for_fed_job $j(0) RUNNING $fedc1

# j(1) and j(2) may start on either remaining cluster; remember where each
# one ended up. An empty cluster name means the running job was not found,
# which would make every later regex and -M argument meaningless.
set j(1) [srun "" 2]
set clus1 [wait_for_fed_job $j(1) RUNNING $fedc2,$fedc3]
if {$clus1 eq ""} {
	log_error "Didn't find running job on cluster"
	end_it 1
}

set j(2) [srun "" 3]
set clus2 [wait_for_fed_job $j(2) RUNNING $fedc3,$fedc2]
if {$clus2 eq ""} {
	log_error "Didn't find running job on cluster"
	end_it 1
}

# fedc1        clus1        clus2
# j(0)O:R
# j(1)O:RV     j(1)R
# j(2)O:RV                  j(2)R

set f1 "\\s*$fedc1"
set f2 "\\s*$clus1"
set f3 "\\s*$clus2"
set n  "\\s*NA"

# Viable siblings: the three cluster names in any order.
set sib "\\s*($fedc1,*|$clus1,*|$clus2,*){3}"

set r1 "$j(0)$f1$f1|$j(1)$sib$f2|$j(2)$sib$f3"

squeue "$fedc1" "3" $r1

# Remove the cluster running j(1); fedc1 should then list only j(0), j(2).
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$clus1
sleep 5

set sib "\\s*$fedc1,$clus2"
set r1 "$j(0)$f1$f1|$j(2)$sib$f3"

squeue "$fedc1" "2" $r1

# Remove the cluster running j(2); fedc1 should then list only j(0).
spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$clus2
sleep 5

set r1 "$j(0)$f1$f1"

squeue "$fedc1" "1" $r1

spawn $my_sacctmgr mod fed $fed_name set -i clusters-=$fedc1

#Make sure all of the jobs(sruns) have not terminated
scontrol $fedc1 $j(0) 1 "srun: I'm Alive"
scontrol $clus1 $j(1) 2 "srun: I'm Alive"
scontrol $clus2 $j(2) 3 "srun: I'm Alive"

# All Done
end_it 0