#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test that partition and job qos limits are enforced when using
#          the OverPartQos flag for the job's qos
############################################################################
# Copyright (C) 2015 SchedMD LLC
# Written by Nathan Yee
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./inc21.34_tests

set exit_code        0
set test_node        ""
# Total cpus in test node
set totcpus          0
set nthreads         0
set acct             test_acct
set user_name        ""
set test_part        "test$test_id\_part"
set part_qos         "test$test_id\_part_qos"
set job_qos          "test$test_id\_job_qos"
set qostest          ""
set grn              GrpNodes
set grn_num          0
set grcpu            GrpCpus
set grcpu_num        0
set grpcpumin        GrpCPUMins
set grpcpumin_num    0
# Set grpcpurunmin_num to multiple of CPUs per core to work with most configurations
# Also make sure that it is at least 4 so we can add and subtract from it
set grpcpurunmin     GrpCPURunMins
set grpcpurunmin_num 40
set grjobs           GrpJobs
set grjobs_num       2
set grpmem           GrpMem
set grpmem_num       100
set grsub            GrpSubmit
set grsub_num        2
set grpwall          GrpWall
set grpwall_num      1
set maxcpu           MaxCpus
set maxcpu_num       0
# Set maxcpumin_num to multiple of CPUs per core to work with most configurations
set maxcpumin        MaxCPUMins
set maxcpumin_num    2
set maxwall          MaxWall
set maxwall_num      2
set maxcpuspu        MaxCPUSPerUser
set maxcpuspu_num    2
set maxnodes         MaxNodes
set maxnode_num      0
set maxnodespu       MaxNodesPerUser
set maxnodespu_num   0
set maxjobs          MaxJobs
set maxjobs_num      2
set maxjobsub        MaxSubmitJobs
set maxjobsub_num    2
set time_spacing     1
set tres_cpu_mult    2

# cr_core = 1 / cr_cpu = 0
set selectparam      0
set def_part         [default_partition]

# mod qos
array set mod_job_qos {
    GrpNodes        -1
    GrpCpus         -1
    GrpJob          -1
    GrpSubmit       -1
    GrpCpuMin       -1
    GrpCpuRunMin    -1
    GrpMem          -1
    GrpWall         -1
    MaxCpus         -1
    MaxNode         -1
    MaxJobs         -1
    MaxSubmitJobs   -1
    MaxCpuMin       -1
    MaxWall         -1
    MaxCpusPerUser  -1
    MaxNode         -1
    MaxNodesPerUser -1

    GrpTRES=billing           -1
    GrpTRESMins=billing       -1
    GrpTRESRunMins=billing    -1
    MaxTRESPerJob=billing     -1
    MaxTRESMinsPerJob=billing -1
    MaxTRESPerUser=billing    -1
}

array set mod_part_qos {
    GrpNodes        -1
    GrpCpus         -1
    GrpJob          -1
    GrpSubmit       -1
    GrpCpuMin       -1
    GrpCpuRunMin    -1
    GrpMem          -1
    GrpWall         -1
    MaxCpus         -1
    MaxNode         -1
    MaxJobs         -1
    MaxSubmitJobs   -1
    MaxCpuMin       -1
    MaxWall         -1
    MaxCpusPerUser  -1
    MaxNode         -1
    MaxNodesPerUser -1

    GrpTRES=billing           -1
    GrpTRESMins=billing       -1
    GrpTRESRunMins=billing    -1
    MaxTRESPerJob=billing     -1
    MaxTRESMinsPerJob=billing -1
    MaxTRESPerUser=billing    -1
}

#
# Cannot run the test if OverTimeLimit is set, since we test time limits.
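# (OverTimeLimit lets running jobs exceed their time limit before being
# canceled, which would mask the MaxWall/GrpWall violations this test
# expects to trigger.)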
#
set overtimelim [get_over_time_limit]
if {$overtimelim != 0} {
    skip "Cannot run this test when OverTimeLimit is set. Exiting now"
}

proc cleanup { } {
    global acct job_qos part_qos scontrol sacctmgr test_part def_part
    global exit_code

    # Delete the test qos
    set match 0
    spawn $sacctmgr -i delete qos $job_qos,$part_qos
    expect {
        -re "Deleting" {
            set match 1
            exp_continue
        }
        timeout {
            log_error "sacctmgr is not responding"
            set exit_code 1
        }
        eof {
            wait
        }
    }

    # Delete the account
    spawn $sacctmgr -i delete account $acct
    expect {
        -re "Deleting accounts" {
            exp_continue
        }
        -re "Error" {
            log_error "Account was not deleted"
            set exit_code 1
        }
        timeout {
            log_error "sacctmgr is not responding"
            set exit_code 1
        }
        eof {
            wait
        }
    }

    spawn $scontrol delete partitionname=$test_part
    expect {
        timeout {
            log_error "scontrol is not responding"
            set exit_code 1
        }
        eof {
            wait
        }
    }

    if {[string length $def_part]} {
        spawn $scontrol update partitionname=$def_part default=yes
        expect {
            timeout {
                log_error "scontrol is not responding"
                set exit_code 1
            }
            eof {
                wait
            }
        }
    }
}

if { [test_account_storage] == 0 } {
    skip "This test can't be run without a usable AccountingStorageType"
} elseif { [test_enforce_limits] == 0 } {
    skip "This test can't be run without a usable AccountingStorageEnforce"
}
if { [test_limits_enforced] == 0 } {
    skip "This test can't be run without enforcing limits"
}
if {[test_super_user] == 0} {
    skip "Test can only be run as SlurmUser"
}
if {[test_select_type_params "CR_SOCKET"]} {
    skip "This test is incompatible with CR_SOCKET allocations"
}
if {[test_select_type_params "CR_ONE_TASK_PER_CORE"]} {
    skip "This test is incompatible with CR_ONE_TASK_PER_CORE allocations"
}

if { ![string compare [priority_type] "multifactor"] } {
    set prio_multifactor 1
} else {
    set prio_multifactor 0
}

# Remove any vestigial data
cleanup

# Check to see that there are enough resources in the default partition
set tmpc 0
set tmpn 0
spawn $scontrol show part [default_partition]
expect {
    -re "TotalCPUs=($number)" {
        set tmpc [expr $expect_out(1,string) - 1]
        exp_continue
    }
    -re "TotalNodes=($number)" {
        set tmpn [expr $expect_out(1,string) - 1]
        exp_continue
    }
    timeout {
        log_error "scontrol is not responding"
        set exit_code 1
    }
    eof {
        wait
    }
}

if {$tmpc == 0 || $tmpn == 0} {
    skip "Not enough Nodes and/or CPUs"
}

# Determine what the selecttype param is
if {[test_select_type_params "CR_CORE"]} {
    set selectparam 1
}

# Get the number of nodes in the default partition
set num_nodes [available_nodes "idle"]

if {$num_nodes == 0} {
    fail "No nodes were found"
} else {
    # Set QoS node values
    set grn_num        $num_nodes
    set maxnode_num    $num_nodes
    set maxnodespu_num $num_nodes
}

# Create 2 test qos
add_qos $part_qos ""
add_qos $job_qos ""

# Create a tmp partition to use for testing.
# TRESBillingWeights=cpu=$tres_cpu_mult makes each job's billing TRES equal
# to its allocated CPU count multiplied by $tres_cpu_mult.
spawn $scontrol create partitionname=$test_part qos=$part_qos tresbillingweights=cpu=$tres_cpu_mult default=yes \
    nodes=ALL
expect {
    timeout {
        log_error "scontrol is not responding"
        set exit_code 1
    }
    eof {
        wait
    }
}

set got_node 0
spawn $srun -N1 printenv SLURM_NODELIST
expect {
    -re "($re_word_str)" {
        set test_node $expect_out(1,string)
        set got_node 1
        exp_continue
    }
    timeout {
        log_error "srun is not responding"
        set exit_code 1
    }
    eof {
        wait
    }
}

if {$got_node != 1} {
    fail "Did not get node for testing"
}

# Get the number of cpus on a node
lassign [get_node_cpus $test_node] totcpus nthreads

if {$totcpus == 0} {
    fail "No cpus were found"
} else {
    # Set QoS CPU values
    set grcpu_num     [expr $totcpus - $nthreads]
    set grpcpumin_num $totcpus
    set maxcpu_num    [expr $totcpus - $nthreads]
    set maxcpumin_num $totcpus
}
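# A job can only request $job_qos if that QOS appears in the submitting
# user's association, so the test account below is created with
# qos=$job_qos and the current user is added to that account.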
# Gets user
set user_name [get_my_user_name]

# Add account with qos
set acctmatch 0
spawn $sacctmgr -i add account $acct qos=$job_qos
expect {
    -re "Adding Account" {
        incr acctmatch
        exp_continue
    }
    timeout {
        log_error "sacctmgr is not responding"
        set exit_code 1
    }
    eof {
        wait
    }
}
if {$acctmatch != 1} {
    fail "sacctmgr had a problem adding the account"
}

# Add user to account
spawn $sacctmgr -i create user name=$user_name account=$acct
expect {
    timeout {
        log_error "sacctmgr is not responding"
        set exit_code 1
    }
    eof {
        wait
    }
}

log_info "========== Run limit test on partition's qos limits =========="
part_test

#
# Set overpartqos flag on job's qos
#
set changed 0
spawn $sacctmgr -i mod qos $job_qos set flag=overpartqos
expect {
    -re "Modified qos" {
        set changed 1
        exp_continue
    }
    timeout {
        log_error "sacctmgr is not responding"
        set exit_code 1
    }
    eof {
        wait
    }
}
if {$changed != 1} {
    fail "Unable to set the OverPartQOS flag on $job_qos"
}

log_info "========== Run limit test on job's qos limits =========="
qos_test

cleanup

if {$exit_code != 0} {
    fail "Test failed due to previous errors"
}