#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          sacctmgr create qos/account job and then delete account/qos
############################################################################
# Copyright (C) 2019 SchedMD LLC.
# Written by Nathan Rini
# All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals
source ./globals_accounting

set part_name   "test$test_id\_part"
set ta1         "test$test_id-account.1"
set ta2         "test$test_id-account.2"
set tu1         [get_my_user_name]
set tq1         "test$test_id-qos.1"
set tq2         "test$test_id-qos.2"
set exit_code   0

# account options
array set acct_1 {}
array set acct_2 {}

# user options
array set user_req_1 {}
set user_req_1(Account) $ta1
set user_req_1(Qos) "$tq1,$tq2"
array set user_req_2 {}
set user_req_2(Account) $ta2
set user_req_2(Qos) "$tq1,$tq2"

# qos options
array set qos_1 {}
array set qos_2 {}

set access_err 0

#
# Verify preconditions
#
if { [test_account_storage] == 0 } {
	skip "This test can't be run without a usable AccountStorageType"
}

if { [test_assoc_enforced] == 0 } {
	skip "This test can't be run without AccountingStorageEnforce=associations"
}

if { [test_enforce_qos_set] == 0 } {
	skip "This test can't be run without AccountingStorageEnforce=qos"
}

if { [string compare [check_accounting_admin_level] "Administrator"] } {
	skip "This test can't be run without being an Accounting administrator.\nUse: sacctmgr mod user \$USER set admin=admin"
}

#
# Fail the test when a helper reported a non-zero exit code.
#
# exit_code - return value of the helper that was just run
#
proc check_rc { exit_code } {
	if {$exit_code != 0} {
		fail "Subcommand failed with exit code $exit_code"
	}
}

#
# Create the two test QOS, the two test accounts and the associations of
# the current user with both accounts. Any failing step fails the test.
#
proc create_accounts {} {
	global ta1 ta2 tq1 tq2 tu1 user_req_1 user_req_2
	# acct_1/acct_2 must be declared global as well, otherwise array get
	# silently reads empty proc-local arrays instead of the options
	# defined at the top of the script.
	global qos_1 qos_2 acct_1 acct_2

	log_debug "Create account and QOS"

	# Create test assoc and accounts
	check_rc [add_qos $tq1 [array get qos_1]]
	check_rc [add_qos $tq2 [array get qos_2]]
	check_rc [add_acct $ta1 [array get acct_1]]
	check_rc [add_acct $ta2 [array get acct_2]]
	check_rc [add_user $tu1 [array get user_req_1]]
	check_rc [add_user $tu1 [array get user_req_2]]
}

#
# Remove the test QOS and test accounts, after waiting for work under the
# test accounts to be done. Return values of the removals are ignored.
#
proc cleanup_accounts {} {
	global ta1 ta2 tq1 tq2

	wait_for_account_done $ta1,$ta2

	log_debug "Remove QOS: $tq1,$tq2"
	remove_qos $tq1,$tq2

	log_debug "Remove account: $ta1,$ta2"
	remove_acct "" $ta1,$ta2
}

#
# Test cleanup: drop the accounting entities and restore slurm.conf.
#
proc cleanup { } {
	global config_file

	cleanup_accounts

	# config_file is only set once the script reaches the slurm.conf
	# handling further down, so there is nothing to restore before that.
	# NOTE(review): presumably the framework can call cleanup after an
	# early failure - confirm against ./globals.
	if {[info exists config_file]} {
		restore_conf $config_file
		reconfigure
	}
}

#
# Start an exclusive salloc running a shell in the test partition.
#
# qos       - QOS to request
# account   - account to request
# num_nodes - number of nodes to request
#
# Returns the job id parsed from the salloc output; fails the test if no
# job id could be obtained.
#
proc test_salloc { qos account num_nodes } {
	global salloc number part_name bin_bash

	set job_id 0

	spawn $salloc -p$part_name --exclusive -q$qos -A$account -N$num_nodes $bin_bash
	expect {
		-re "allocation ($number)" {
			set job_id $expect_out(1,string)
		}
		-re "error" {
			fail "salloc job was not submitted"
		}
		timeout {
			fail "salloc not responding"
		}
	}

	if { $job_id == 0 } {
		fail "Submit failure"
	}

	return $job_id
}

#
# Verify the state and pending reason of a job as shown by scontrol.
#
# job_id - job to inspect
# state  - expected job state
# why    - list of acceptable Reason values
#
# Returns 0 on success; fails the test on any mismatch.
#
proc check_job_reason { job_id state why } {
	global scontrol re_word_str

	set found_why ""
	set state_found ""

	log_user 0
	spawn $scontrol show job $job_id
	expect {
		-re "State=($re_word_str)" {
			set state_found $expect_out(1,string)
			exp_continue
		}
		-re "Reason=($re_word_str)" {
			set found_why $expect_out(1,string)
			exp_continue
		}
		timeout {
			fail "scontrol not responding"
		}
		eof {
			# Reap the scontrol process; its status is not used
			wait
		}
	}
	log_user 1

	if { $state_found ne $state } {
		fail "Job $job_id state found was $state_found, expected $state"
	}

	if { [lsearch -exact $why $found_why] < 0 } {
		fail "Job $job_id scontrol returned Reason=$found_why instead of Reason='$why'"
	}

	log_debug "Found jobid $job_id with correct state '$state' and reason '$found_why'"
	return 0
}

#
# Submit two jobs, remove the QOS or the account they use, and verify that
# the running job keeps running while the pending one is held with the
# matching reason. Then point both jobs at the surviving QOS or account.
#
# run_qos - true to remove the QOS, false to remove the account
#
# Returns 0 when the scenario was verified, 1 when the removal itself
# failed and the caller should retry.
#
proc run_test { run_qos } {
	global scontrol tq1 tq2 ta1 ta2 part_name

	if { $run_qos } {
		set qos $tq1
		set acct $ta2
		set update_line "qos=$tq2"
		set reason "InvalidQOS"
	} else {
		set qos $tq1
		set acct $ta1
		set update_line "account=$ta2"
		set reason "InvalidAccount"
	}

	set job_id_1 [test_salloc $qos $acct 1]
	set job_id_2 [test_salloc $qos $acct 1]

	check_rc [wait_for_job $job_id_1 "RUNNING"]

	if { $run_qos } {
		# remove test qos
		if { [remove_qos $qos] } {
			log_debug "We hit the race trying to get the job running before it hits the database before we removed the qos. This can be expected, trying again."
			wait_for_part_done $part_name
			return 1
		}
	} else {
		# remove test acct
		if { [remove_acct "" $acct] } {
			log_debug "We hit the race trying to get the job running before it hits the database before we removed the account. This can be expected, trying again."
			wait_for_part_done $part_name
			return 1
		}
	}

	# Verify jobs state and reason
	check_job_reason $job_id_1 "RUNNING" [list "None"]
	check_job_reason $job_id_2 "PENDING" [list "$reason"]

	# Update pending job to make it runnable, updating the running job isn't
	# possible but it tests other code if you are watching the log file
	spawn $scontrol update jobid=$job_id_1 $update_line
	spawn $scontrol update jobid=$job_id_2 $update_line

	sleep 5

	# Check reasons after account alter
	check_job_reason $job_id_1 "RUNNING" [list "None"]
	check_job_reason $job_id_2 "PENDING" [list "Resources" "None"]

	# cleanup jobs
	wait_for_part_done $part_name

	return 0
}

cleanup_accounts
create_accounts

# Get the location of the slurm.conf file
set config_dir [get_conf_path]
fail_on_error "Unable to determine configuration path"
set config_file $config_dir/slurm.conf

#
# Copy slurm.conf file
#
save_conf $config_file

# Comment out PrologFlags in the slurm.conf
exec $bin_sed -i {s/^\(PrologFlags=\)/#\1/gI} $config_file
reconfigure

delete_part $part_name

if { [create_part $part_name 1] } {
	fail "Unable to create partition $part_name"
}

for {set i 0} {$i < 2} {incr i} {
	set cnt 0
	set rc 1
	# First let's test against removing an account and then qos
	# from a running and pending job since partition only has 1 so
	# one of these should run and the second should pend
	while { $rc && ($cnt < 10) } {
		set rc [run_test $i]
		incr cnt
	}
	if { $rc } {
		# The expression must be braced: unbraced, the Tcl parser strips
		# the quotes and expr rejects the resulting barewords.
		set credential [expr {$i == 0 ? "account" : "qos"}]
		fail "Too many ($cnt) failures trying to remove $credential from job"
	}
}

log_info "Testing that PD jobs that lost their QOS will not launch until restored"

# Return to clean slate
cleanup_accounts

# Add test qos
run_command -fail "$sacctmgr -vi add qos $tq1"

# Add test account
run_command -fail "$sacctmgr -vi add account name=$ta1 set qos=normal,$tq1"

# Add test user to account
run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta1"

# Check what we just added to make sure it is there
if {![regexp -line "^\\S+\\|$ta1\\|$tu1\\|normal,$tq1" [run_command_output "$sacctmgr -n -P show assoc format=cluster,account,user,qos"]]} {
	fail "Association with the right account, user and qos was not found"
}

# Submit a delayed job that wants to run in the test qos
if {![regexp {Submitted batch job (\d+)} [run_command_output -fail "$sbatch --begin=now+2 --qos=$tq1 -A $ta1 --wrap \"$srun sleep 10\""] {} job_id_1]} {
	fail "Unable to get the job id of the job submitted with qos $tq1"
}

# The first job should get queued
if {[wait_for_job $job_id_1 "PENDING"] != 0} {
	fail "Job $job_id_1 did not enter the pending state"
}

# Remove the test qos out from under the job
run_command "$sacctmgr -vi modify account name=$ta1 user=$tu1 set qos=normal"

# Submit a second delayed job requesting the test qos. It should be rejected
if {![run_command_status -none "$sbatch --begin=now+2 --qos=$tq1 -A $ta1 --wrap \"$srun sleep 10\""]} {
	fail "Job submitted with unassociated qos should have failed"
} else {
	log_debug "The preceding job failure was expected"
}

# Wait for the first job to pend with Reason=InvalidQOS
set condition_matched false
wait_for {$condition_matched} {
	set scontrol_out [run_command_output -fail "$scontrol -o show job $job_id_1"]
	# Only look at job_state/reason when the pattern actually matched
	if {[regexp {JobState=([^ ]+) Reason=([^ ]+)} $scontrol_out {} job_state reason] &&
	    $job_state eq "PENDING" && $reason eq "InvalidQOS"} {
		set condition_matched true
	}
}
assert_or_fail {$condition_matched} "Job $job_id_1 did not pend on InvalidQOS as expected"

# Add back the test qos
run_command -fail "$sacctmgr -vi modify account name=$ta1 user=$tu1 set qos=normal,$tq1"

# Wait for the first job to begin running
if {[wait_for_job $job_id_1 "RUNNING"] != 0} {
	fail "Job $job_id_1 did not run after adding back the qos"
}

log_info "Testing that PD jobs that lost their Account will not resume until restored"

# Return to clean slate
cleanup_accounts

# Add test qos
run_command -fail "$sacctmgr -vi add qos $tq1"

# Add test accounts
run_command -fail "$sacctmgr -vi add account name=$ta1 set qos=normal,$tq1"
run_command -fail "$sacctmgr -vi add account name=$ta2 set qos=normal,$tq1"

# Add test user to accounts
run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta1"
run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta2"

# Check what we just added to make sure it is there
set assoc_out [run_command_output "$sacctmgr -n -P show assoc format=cluster,account,user,qos"]
if {![regexp -line "^\\S+\\|$ta1\\|$tu1\\|normal,$tq1" $assoc_out]} {
	fail "Test qos $tq1 has not been added to test account $ta1"
}
if {![regexp -line "^\\S+\\|$ta2\\|$tu1\\|normal,$tq1" $assoc_out]} {
	fail "Test qos $tq1 has not been added to test account $ta2"
}

# Submit a delayed job that wants to run with the test qos and test account 2
if {![regexp {Submitted batch job (\d+)} [run_command_output -fail "$sbatch --begin=now+2 --qos=$tq1 -A $ta2 --wrap \"$srun sleep 10\""] {} job_id_1]} {
	fail "Unable to get the job id of the job submitted with account $ta2"
}

# The first job should get queued
if {[wait_for_job $job_id_1 "PENDING"] != 0} {
	fail "Job $job_id_1 did not enter the pending state"
}

# Remove the test account out from under the job
run_command -fail "$sacctmgr -vi delete account name=$ta2"

# Submit a second delayed job requesting test account 2. It should be rejected
if {![run_command_status -none "$sbatch --begin=now+2 --qos=$tq1 -A $ta2 --wrap \"$srun sleep 10\""]} {
	fail "Job submitted with unassociated account should have failed"
} else {
	log_debug "The preceding job failure was expected"
}

# Wait for the first job to pend with Reason=InvalidAccount
set condition_matched false
wait_for {$condition_matched} {
	set scontrol_out [run_command_output -fail "$scontrol -o show job $job_id_1"]
	# Only look at job_state/reason when the pattern actually matched
	if {[regexp {JobState=([^ ]+) Reason=([^ ]+)} $scontrol_out {} job_state reason] &&
	    $job_state eq "PENDING" && $reason eq "InvalidAccount"} {
		set condition_matched true
	}
}
assert_or_fail {$condition_matched} "Job $job_id_1 did not pend on InvalidAccount as expected"

# Add back the test account
run_command -fail "$sacctmgr -vi add account name=$ta2 set qos=normal,$tq1"
run_command -fail "$sacctmgr -vi add user name=$tu1 account=$ta2"

# Wait for the first job to begin running
if {[wait_for_job $job_id_1 "RUNNING"] != 0} {
	fail "Job $job_id_1 did not run after adding back the account"
}