#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test that checks if the QOS limits are enforced.
############################################################################
# Copyright (C) 2012 SchedMD LLC
# Written by Nathan Yee
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals
source ./globals_accounting
source ./inc21.30.1
source ./inc21.30.2
source ./inc21.30.3
source ./inc21.30.4
source ./inc21.30.5
source ./inc21.30.6
source ./inc21.30.7
source ./inc21.30.8
source ./inc21.30.9
source ./inc21.30.10
source ./inc21.30.11
source ./inc21.30.12
source ./inc21.30.13
source ./inc21.30.14
source ./inc21.30.15
source ./inc21.30.16
source ./inc21.30.17

#
# Test-wide state. These are plain globals; the sourced inc21.30.* files
# are presumably reading them by name, so none of them may be renamed or
# dropped here -- TODO confirm against the include files.
#
set exit_code            0
set skips                0
set test_node            ""
# Total cpus in test node
set totcpus              0
set nthreads             0
set acct                 "$test_id\_test_acct"
set user_name            ""
set qosname              name
set qostest              "$test_id\_qostest"
set grn                  GrpNodes
set grn_num              0
set grcpu                GrpCpus
set grcpu_num            0
set grpcpumin            GrpCPUMins
set grpcpumin_num        0
# Set grpcpurunmin_num to multiple of CPUs per core to work with most configurations
# Also make sure that it is at least 4 so we can add and subtract from it
set grpcpurunmin         GrpCPURunMins
set grpcpurunmin_num     40
set grjobs               GrpJobs
set grjobs_num           2
set grpmem               GrpMem
set grpmem_num           100
set grsub                GrpSubmit
set grsub_num            2
set grpwall              GrpWall
set grpwall_num          1
set maxcpu               MaxCpus
set maxcpu_num           0
# Set maxcpumin_num to multiple of CPUs per core to work with most configurations
set maxcpumin            MaxCPUMins
set maxcpumin_num        0
set maxwall              MaxWall
set maxwall_num          2
set maxcpuspu            MaxCPUSPerUser
set maxcpuspu_num        2
set maxnodes             MaxNodes
set maxnode_num          0
set maxnodespu           MaxNodesPerUser
set maxnodespu_num       0
set maxjobs              MaxJobs
set maxjobs_num          2
set maxjobsub            MaxSubmitJobs
set maxjobsub_num        2
set save_billing_weights ""
set time_spacing         1
set tres_cpu_mult        2

# cr_core = 1 / cr_cpu = 0
set selectparam          0
set one_task_pc          0

# QOS limits handed to mod_qos; -1 clears a limit.
# The keys must be spelled exactly as the tests below set them
# (GrpJobs, MaxNodes, MaxNodesPerUser). Otherwise the array ends up holding
# both a stale placeholder (e.g. GrpJob -1) and the live value (GrpJobs 2).
# NOTE(review): assumes mod_qos forwards every entry to sacctmgr -- confirm
# in globals_accounting.
array set mod_qos_vals {
	GrpNodes                -1
	GrpCpus                 -1
	GrpJobs                 -1
	GrpSubmit               -1
	GrpCpuMin               -1
	GrpCpuRunMin            -1
	GrpMem                  -1
	GrpWall                 -1
	MaxCpus                 -1
	MaxNodes                -1
	MaxJobs                 -1
	MaxSubmitJobs           -1
	MaxCpuMin               -1
	MaxWall                 -1
	MaxCpusPerUser          -1
	MaxNodesPerUser         -1
	GrpTRES=billing         -1
	GrpTRESMins=billing     -1
	GrpTRESRunMins=billing  -1
	MaxTRESPerJob=billing   -1
	MaxTRESMinsPerJob=billing -1
	MaxTRESPerUser=billing  -1
}

# Check to see that there are enough resources in the default partition.
# One is subtracted from each total, so a partition with a single CPU or a
# single node yields 0 and the test is skipped.
set tmpc 0
set tmpn 0
set partition [default_partition]
spawn $scontrol show part $partition
expect {
	-re "TotalCPUs=($number)" {
		set tmpc [expr {$expect_out(1,string) - 1}]
		exp_continue
	}
	-re "TotalNodes=($number)" {
		set tmpn [expr {$expect_out(1,string) - 1}]
		exp_continue
	}
	-re "TRESBillingWeights=(\\S+)" {
		# Remember the current weights so endit can restore them
		set save_billing_weights $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "scontrol is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

if {$tmpc == 0 || $tmpn == 0} {
	skip "Not enough Nodes and/or CPUs"
}

# GrpCPURunMins / GrpWall tests only run with priority/multifactor
set prio_multifactor [string equal [priority_type] "multifactor"]

# Determine whether resources are allocated by core (CR_CORE)
if {[test_select_type_params "CR_CORE"]} {
	set selectparam 1
}

# Determine whether CR_ONE_TASK_PER_CORE is configured
if {[test_select_type_params "CR_ONE_TASK_PER_CORE"]} {
	set one_task_pc 1
}

# Pick the node the tests will run on
set got_node 0
spawn $srun -N1 printenv SLURM_NODELIST
expect {
	-re "($re_word_str)" {
		set test_node $expect_out(1,string)
		set got_node 1
		exp_continue
	}
	timeout {
		log_error "srun is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

if {$got_node != 1} {
	fail "Did not get node for testing"
}

# Get the number of cpus on a node
lassign [get_node_cpus $test_node] totcpus nthreads

if {$totcpus == 0} {
	fail "No cpus were found"
} else {
	# Set QoS CPU values
	set grcpu_num     [expr {$totcpus - $nthreads}]
	set grpcpumin_num $totcpus
	set maxcpu_num    [expr {$totcpus - $nthreads}]
	set maxcpumin_num $totcpus
}

# Get the number of nodes in the default partition (minus one)
set num_nodes 0
spawn $scontrol show partition $partition
expect {
	-re "TotalNodes=($number)" {
		set num_nodes [expr {$expect_out(1,string) - 1}]
		exp_continue
	}
	timeout {
		log_error "scontrol is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

if {$num_nodes == 0} {
	fail "No nodes were found"
} else {
	# Set QoS node values
	set grn_num        $num_nodes
	set maxnode_num    $num_nodes
	set maxnodespu_num $num_nodes
}

#
# endit - clean up and finish the test.
#
# Deletes the test QOS and the test account, restores the partition's
# TRESBillingWeights if a value was saved at start-up, and then ends the
# test: fail if exit_code is non-zero, skip if skips is non-zero,
# otherwise pass.
#
# A failure to delete the QOS is only logged; a failure to delete the
# account or to restore the billing weights also sets exit_code.
#
proc endit { } {
	global sacctmgr qostest acct test_id exit_code skips
	global scontrol save_billing_weights partition

	# delete qos
	spawn $sacctmgr -i delete qos $qostest
	expect {
		-re "Deleting QOS\\(s\\)" {
			# Parentheses are escaped so the literal "QOS(s)" is matched
			exp_continue
		}
		-re "Error" {
			log_error "QOS was not deleted"
		}
		timeout {
			log_error "sacctmgr is not responding"
		}
		eof {
			wait
		}
	}

	# delete account
	spawn $sacctmgr -i delete account $acct
	expect {
		-re "Deleting accounts" {
			exp_continue
		}
		-re "Error" {
			log_error "account was not deleted"
			set exit_code 1
		}
		timeout {
			log_error "sacctmgr is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	# Put back the billing weights that were in place before the test
	if {$save_billing_weights ne ""} {
		spawn $scontrol update partitionname=$partition TRESBillingWeights=$save_billing_weights
		expect {
			-re "error" {
				log_error "Failed to reset TRESBillingWeights"
				set exit_code 1
			}
			timeout {
				log_error "scontrol is not responding"
				set exit_code 1
			}
			eof {
				wait
			}
		}
	}

	if {$exit_code != 0} {
		fail "Test failed due to previous errors (\$exit_code = $exit_code)"
	}
	if {$skips != 0} {
		skip "Test was partially skipped (skips = $skips)"
	}
	pass
}

#
# Check accounting configuration and terminate if limits not enforced.
#
if { [test_account_storage] == 0 } {
	skip "This test can't be run without a usable AccountStorageType"
} elseif { [test_enforce_limits] == 0 } {
	skip "This test can't be run without a usable AccountingStorageEnforce"
}
if { [test_limits_enforced] == 0 } {
	skip "This test can't be run without enforcing limits"
}
if {[test_super_user] == 0} {
	skip "Test can only be ran as SlurmUser"
}

#
# Some tests will not work properly when allocating sockets or whole nodes to jobs
#
if {[test_linear] || [default_part_exclusive]} {
	skip "This test is incompatible with exclusive node allocations"
}
if {[test_select_type_params "CR_SOCKET"]} {
	skip "This test is incompatible with CR_SOCKET allocations"
}

# Remove any vestigial accounts or qos
spawn $sacctmgr -i delete qos $qostest
expect {
	-re "Deleting QOS\\(s\\)" {
		exp_continue
	}
	-re "Error" {
		log_error "QOS was not deleted"
	}
	timeout {
		log_error "sacctmgr is not responding"
	}
	eof {
		wait
	}
}

# Delete account
spawn $sacctmgr -i delete account $acct
expect {
	-re "Deleting accounts" {
		exp_continue
	}
	-re "Error" {
		log_error "Account was not deleted"
		set exit_code 1
	}
	timeout {
		log_error "sacctmgr is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

# Gets user
set user_name [get_my_user_name]

# add qos
set qosmatch 0
spawn $sacctmgr -i add qos $qosname=$qostest
expect {
	-re "Adding QOS" {
		incr qosmatch
		exp_continue
	}
	timeout {
		log_error "sacctmgr did not add QOS"
		set exit_code 1
	}
	eof {
		wait
	}
}
# Every limit test below depends on this QOS, so stop now if it is missing
if {$qosmatch != 1} {
	fail "sacctmgr had a problem adding the QOS"
}

# Add account with qos
set acctmatch 0
spawn $sacctmgr -i add account $acct qos=$qostest
expect {
	-re "Adding Account" {
		incr acctmatch
		exp_continue
	}
	timeout {
		log_error "sacctmgr is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$acctmatch != 1} {
	fail "sacctmgr had a problem adding the account"
}

# Add user to account
spawn $sacctmgr -i create user name=$user_name account=$acct
expect {
	timeout {
		log_error "sacctmgr not responding"
	}
	eof {
		wait
	}
}

#
# Each test below follows the same pattern: set one limit in mod_qos_vals,
# push the array to the QOS with mod_qos, wait time_spacing seconds, run
# the matching inc21_30_N procedure, bail out through endit on error, and
# finally clear the limit again (-1).
#

#
# Test GrpNode limit
#
set mod_qos_vals(GrpNodes) $grn_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_1 "QOSGrpNodeLimit"
if {$exit_code != 0} {
	endit
}
# Clear the limit (-1)
set mod_qos_vals(GrpNodes) "-1"

#
# Test GrpCpus
#
set mod_qos_vals(GrpCpus) $grcpu_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set grcpu_num [expr {$grcpu_num / $nthreads}]
}
inc21_30_2 "QOSGrpCpuLimit"
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(GrpCpus) "-1"

#
# test GrpJob limits
#
set mod_qos_vals(GrpJobs) $grjobs_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_3 "QOSGrpJobsLimit"
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(GrpJobs) "-1"

#
# test GrpSubmit
#
set mod_qos_vals(GrpSubmit) $grsub_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_4
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(GrpSubmit) "-1"

#
# Test MaxCpus limits
#
set mod_qos_vals(MaxCpus) $maxcpu_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set maxcpu_num [expr {$maxcpu_num / $nthreads}]
}
inc21_30_5 "QOSMaxCpuPerJobLimit"
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(MaxCpus) "-1"

#
# Test MaxNode limit
#
set mod_qos_vals(MaxNodes) $maxnode_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_6 "QOSMaxNodePerJobLimit|PartitionConfig"
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(MaxNodes) "-1"

#
# Test MaxJobs limit
#
set mod_qos_vals(MaxJobs) $maxjobs_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_7 "QOSMaxJobsPerUserLimit"
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(MaxJobs) "-1"

#
# Test MaxJobsSubmits limit
#
set mod_qos_vals(MaxSubmitJobs) $maxjobsub_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_8
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(MaxSubmitJobs) "-1"

#
# Test GroupCPUMins
#
set mod_qos_vals(GrpCpuMin) $grpcpumin_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set grpcpumin_num [expr {$grpcpumin_num / $nthreads}]
}
inc21_30_9 "QOSGrpCPUMinutesLimit"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(GrpCpuMin) "-1"

#
# Test GroupCPURunMins
# Requires priority/multifactor to properly handle decay well
#
if { $prio_multifactor != 0 } {
	set mod_qos_vals(GrpCpuRunMin) $grpcpurunmin_num
	mod_qos $qostest [array get mod_qos_vals]
	sleep $time_spacing
	if { $one_task_pc } {
		set grpcpurunmin_num [expr {$grpcpurunmin_num / $nthreads}]
	}
	inc21_30_10 "QOSGrpCPURunMinutesLimit"
	if {$exit_code != 0 } {
		endit
	}
	set mod_qos_vals(GrpCpuRunMin) "-1"
}

#
# Test Group Memory
#
set mod_qos_vals(GrpMem) $grpmem_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_11 "QOSGrpMemLimit"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(GrpMem) "-1"

#
# Test Group wall
# Requires priority/multifactor to properly handle decay well
#
if { $prio_multifactor != 0 } {
	set mod_qos_vals(GrpWall) $grpwall_num
	mod_qos $qostest [array get mod_qos_vals]
	sleep $time_spacing
	inc21_30_12
	if {$exit_code != 0 } {
		endit
	}
	set mod_qos_vals(GrpWall) "-1"
}

#
# Test Max Cpu Mins
#
set mod_qos_vals(MaxCpuMin) $maxcpumin_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set maxcpumin_num [expr {$maxcpumin_num / $nthreads}]
}
inc21_30_13 "QOSMaxCpuMinutesPerJobLimit"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(MaxCpuMin) "-1"

#
# Test Max Wall
#
set mod_qos_vals(MaxWall) $maxwall_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_14 "QOSMaxWallDurationPerJobLimit"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(MaxWall) "-1"

#
# Test Max CPUs Per User
#
# If CR_CORE set maxcpuspu a multiple number of threads
if {$selectparam} {
	set maxcpuspu_num [expr {$maxcpuspu_num * $nthreads}]
}
set mod_qos_vals(MaxCpusPerUser) $maxcpuspu_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set maxcpuspu_num [expr {$maxcpuspu_num / $nthreads}]
}
inc21_30_15 "QOSMaxCpuPerUserLimit"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(MaxCpusPerUser) "-1"

#
# Test MaxNodesPerUser
#
set mod_qos_vals(MaxNodesPerUser) $maxnodespu_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_16 "QOSMaxNodePerUserLimit|PartitionConfig"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(MaxNodesPerUser) "-1"

#
# Test MaxWall is used as job's timelimit if job was requested
# without --time option
#
set mod_qos_vals(MaxWall) $maxwall_num
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
inc21_30_17
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(MaxWall) "-1"

#
# The remaining tests re-run some of the CPU-based checks through the
# "billing" TRES. The partition is given TRESBillingWeights of
# cpu=$tres_cpu_mult, and each billing limit below is set to the matching
# CPU value multiplied by tres_cpu_mult.
#
spawn $scontrol update partitionname=$partition tresbillingweights=cpu=$tres_cpu_mult
expect {
	-re "error" {
		log_error "Failed to set TRESBillingWeights"
		set exit_code 1
	}
	timeout {
		log_error "scontrol is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$exit_code != 0} {
	endit
}

#
# Test GrpTRES=billing
#
set mod_qos_vals(GrpTRES=billing) [expr {$grcpu_num * $tres_cpu_mult}]
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set grcpu_num [expr {$grcpu_num / $nthreads}]
}
inc21_30_2 "QOSGrpBilling"
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(GrpTRES=billing) "-1"

#
# Test GrpTRESMins=billing
#
set mod_qos_vals(GrpTRESMins=billing) [expr {$grpcpumin_num * $tres_cpu_mult}]
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set grpcpumin_num [expr {$grpcpumin_num / $nthreads}]
}
inc21_30_9 "QOSGrpBillingMinutes"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(GrpTRESMins=billing) "-1"

#
# Test GroupTRESRunMins=billing
# Requires priority/multifactor to properly handle decay well
#
if { $prio_multifactor != 0 } {
	set mod_qos_vals(GrpTRESRunMins=billing) [expr {$grpcpurunmin_num * $tres_cpu_mult}]
	mod_qos $qostest [array get mod_qos_vals]
	sleep $time_spacing
	if { $one_task_pc } {
		set grpcpurunmin_num [expr {$grpcpurunmin_num / $nthreads}]
	}
	inc21_30_10 "QOSGrpBillingRunMinutes"
	if {$exit_code != 0 } {
		endit
	}
	set mod_qos_vals(GrpTRESRunMins=billing) "-1"
}

#
# Test MaxTRESPerJob=billing
#
set mod_qos_vals(MaxTRESPerJob=billing) [expr {$maxcpu_num * $tres_cpu_mult}]
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set maxcpu_num [expr {$maxcpu_num / $nthreads}]
}
inc21_30_5 "QOSMaxBillingPerJob"
if {$exit_code != 0} {
	endit
}
set mod_qos_vals(MaxTRESPerJob=billing) "-1"

#
# Test MaxTRESMinsPerJob=billing
#
set mod_qos_vals(MaxTRESMinsPerJob=billing) [expr {$maxcpumin_num * $tres_cpu_mult}]
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set maxcpumin_num [expr {$maxcpumin_num / $nthreads}]
}
inc21_30_13 "QOSMaxBillingMinutesPerJob"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(MaxTRESMinsPerJob=billing) "-1"

#
# Test MaxTRESPerUser=billing
#
# If CR_CORE set maxcpuspu a multiple number of threads
if {$selectparam} {
	set maxcpuspu_num [expr {$maxcpuspu_num * $nthreads}]
}
set mod_qos_vals(MaxTRESPerUser=billing) [expr {$maxcpuspu_num * $tres_cpu_mult}]
mod_qos $qostest [array get mod_qos_vals]
sleep $time_spacing
if { $one_task_pc } {
	set maxcpuspu_num [expr {$maxcpuspu_num / $nthreads}]
}
inc21_30_15 "QOSMaxBillingPerUser"
if {$exit_code != 0 } {
	endit
}
set mod_qos_vals(MaxTRESPerUser=billing) "-1"

endit