#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test association plus partition/job QoS unique node limits enforced
############################################################################
# Copyright (C) 2019 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals
source ./globals_accounting

set exit_code   0
set file_in     "test$test_id.bash"
set test_acct   "test$test_id\_acct"
set test_part   "test$test_id\_part"
set job_qos     "test$test_id\_job_qos"
set part_qos    "test$test_id\_part_qos"

#
# Remove everything this test creates: both test QOS, the test account and
# the test partition. It is also called once before the test starts so that
# leftovers from a previous run do not interfere.
# A timeout of any spawned command is recorded in the global exit_code.
#
proc cleanup { } {
	global test_acct job_qos part_qos scontrol sacctmgr test_part
	global exit_code

	# Delete the test QOS
	spawn $sacctmgr -i delete qos $job_qos,$part_qos
	expect {
		timeout {
			log_error "sacctmgr is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	# Delete account
	spawn $sacctmgr -i delete account $test_acct
	expect {
		timeout {
			log_error "sacctmgr is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	# Delete partition
	spawn $scontrol delete partitionname=$test_part
	expect {
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
}

#
# Set the GrpNodes limit of the test account.
#
# count - value passed as grpnodes=; the caller uses -1 to undo a limit
#         it previously set
#
proc set_assoc {count} {
	global sacctmgr test_acct exit_code

	spawn $sacctmgr -i modify account $test_acct set grpnodes=$count
	expect {
		timeout {
			log_error "sacctmgr is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
}

#
# Set the GrpNodes limit of a QOS.
#
# qos_name - name of the QOS to modify
# count    - value passed as grpnodes=; the caller uses -1 to undo a limit
#            it previously set
#
proc set_qos {qos_name count} {
	global sacctmgr exit_code

	spawn $sacctmgr -i modify qos $qos_name set grpnodes=$count
	expect {
		timeout {
			log_error "sacctmgr is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
}

#
# Report whether "scontrol show job" lists a group node limit as the
# job's Reason.
#
# job_id - id of the job to inspect
#
# Returns 1 if Reason=AssocGrpNodeLimit or Reason=QOSGrpNodeLimit was seen
# in the output, otherwise 0.
#
proc reason_node_limit {job_id} {
	global scontrol exit_code

	set node_limit 0
	spawn $scontrol show job $job_id
	expect {
		-re "Reason=AssocGrpNodeLimit" {
			set node_limit 1
			exp_continue
		}
		-re "Reason=QOSGrpNodeLimit" {
			set node_limit 1
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	return $node_limit
}

#
# Run one pass of the node limit checks. The caller is expected to have put
# a one node GrpNodes limit in place (association, job QOS or partition QOS)
# before calling.
#
# Sequence:
#   1. Submit a one node job and wait for it to run.
#   2. Submit a one node job that excludes the first job's node; it is
#      expected to be held with a GrpNodeLimit reason.
#   3. Submit a one node job restricted to the first job's node; it must
#      not be held with a GrpNodeLimit reason.
#   4. If the partition has at least two nodes, submit a two node job; if
#      the submission is accepted the job is expected to be held with a
#      GrpNodeLimit reason.
#
# Failures are recorded in the global exit_code; nothing is returned.
#
proc run_job_test { } {
	global file_in sbatch scontrol test_acct
	global test_part job_qos number re_word_str
	global nb_nodes exit_code

	# Time to wait for scheduling logic to set job's Reason to GrpNodeLimit.
	# For some configurations this might need to be set higher.
	# Starting at 10 seconds.
	set sleep_for_reason_set 10

	# Submit 1 node job
	set job_id1 0
	spawn $sbatch -p $test_part --account=$test_acct --qos=$job_qos -t1 --mem=10 -o /dev/null -N1 $file_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id1 $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sbatch is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$job_id1 == 0} {
		log_error "Job submit failed"
		set exit_code 1
		return
	}

	# Wait for job to start and get its info
	if {[wait_for_job $job_id1 "RUNNING"] != 0} {
		log_error "Error waiting for job $job_id1 to run"
		cancel_job $job_id1
		set exit_code 1
		return
	}

	# Every match overwrites hostname, so the last "NodeList=" occurrence
	# in the scontrol output is the one that is kept.
	set hostname 0
	spawn $scontrol show job $job_id1
	expect {
		-re "NodeList=($re_word_str)" {
			set hostname $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$hostname == 0} {
		log_error "BatchHost for job $job_id1 not found"
		cancel_job $job_id1
		set exit_code 1
		return
	}

	# Submit 1 node job EXCLUDING first job's node
	set job_id2 0
	spawn $sbatch -p $test_part --account=$test_acct --qos=$job_qos -t1 --exclude=$hostname --mem=10 -o /dev/null -N1 $file_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id2 $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sbatch is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$job_id2 == 0} {
		log_error "Job submit failed"
		cancel_job $job_id1
		set exit_code 1
		return
	}

	# Submit 1 node job INCLUDING first job's node
	set job_id3 0
	spawn $sbatch -p $test_part --account=$test_acct --qos=$job_qos -t1 --nodelist=$hostname --mem=10 -o /dev/null -N1 $file_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id3 $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sbatch is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$job_id3 == 0} {
		log_error "Job submit failed"
		cancel_job $job_id1
		cancel_job $job_id2
		set exit_code 1
		return
	}

	# Check if job's are waiting on GrpNodeLimit
	sleep $sleep_for_reason_set
	if {[reason_node_limit $job_id2] != 1} {
		log_error "Job $job_id2 should be waiting on GrpNodeLimit, but is not"
		set exit_code 1
	}
	if {[reason_node_limit $job_id3] != 0} {
		log_error "Job $job_id3 should not be waiting on GrpNodeLimit, but is"
		set exit_code 1
	}
	cancel_job $job_id1
	cancel_job $job_id2
	cancel_job $job_id3

	# Try to submit 2-node job while there is 1-node limit
	if {$nb_nodes < 2} {
		return
	}
	set job_id4 0
	spawn $sbatch -p $test_part --account=$test_acct --qos=$job_qos -t1 --mem=10 -o /dev/null -N2 $file_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id4 $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sbatch is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	# A missing job id is not treated as an error here; only an accepted
	# job is checked for the expected pending reason.
	if {$job_id4 != 0} {
		sleep $sleep_for_reason_set
		if {[reason_node_limit $job_id4] != 1} {
			log_error "Job $job_id4 should be waiting on GrpNodeLimit, but is not"
			set exit_code 1
		}
		cancel_job $job_id4
	}
}

if { [test_account_storage] == 0 } {
	skip "This test can't be run without a usable AccountStorageType"
} elseif { [test_enforce_limits] == 0 } {
	skip "This test can't be run without a usable AccountingStorageEnforce"
}
if { [test_limits_enforced] == 0 } {
	skip "This test can't be run without enforcing limits"
}
if {[test_super_user] == 0} {
	skip "Test can only be ran as SlurmUser"
}

# Start with clean configuration
cleanup

# Create test QOS
if {[add_qos $job_qos ""] != 0} {
	fail "Unable to add qos ($job_qos)"
}
if {[add_qos $part_qos ""] != 0} {
	fail "Unable to add qos ($part_qos)"
}

# Add account with QOS
set match 0
spawn $sacctmgr -i add account $test_acct qos=$job_qos
expect {
	-re "Adding Account" {
		incr match
		exp_continue
	}
	timeout {
		log_error "sacctmgr is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 1} {
	fail "sacctmgr had a problem adding the account"
}

# Get user name
set user_name [get_my_user_name]

# Add user to account
spawn $sacctmgr -i create user name=$user_name account=$test_acct
expect {
	timeout {
		log_error "sacctmgr not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

# Get default partition name
set def_part [default_partition]
set nb_nodes [get_node_cnt_in_part $def_part]

# Create a partition to use for testing
spawn $scontrol create partitionname=$test_part qos=$part_qos nodes=[available_nodes_hostnames $def_part]
expect {
	timeout {
		log_error "scontrol is not responding"
		# "set exit_code" without a value only reads the variable;
		# the value 1 is required to actually record the failure.
		set exit_code 1
	}
	eof {
		wait
	}
}

make_bash_script $file_in "$bin_sleep 60"

# Each test sets a one node limit at one level, runs the job checks and
# then resets that limit with -1 before the next level is tested.
log_info "TEST 1: Association GrpNodes limit"
set_assoc 1
run_job_test
set_assoc -1

log_info "TEST 2: Job QOS GrpNodes limit"
set_qos $job_qos 1
run_job_test
set_qos $job_qos -1

log_info "TEST 3: Partition QOS GrpNodes limit"
set_qos $part_qos 1
run_job_test
set_qos $part_qos -1

# Clean up and exit
if {$exit_code == 0} {
	exec $bin_rm -f $file_in
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}