#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test options --ntasks-per-node and -c are enforced
############################################################################
# Copyright (C) 2014 SchedMD LLC
# Written by Nathan Yee
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Number of nodes every job in this test is run on. All per-node loops and
# sanity checks below are driven by this value.
set num_nodes   3
set nodelist    ""
set cputot      0
set ntasks      0
set ntaskpn     0
set exit_code   0
set sys_homo    0
set file_in     "test$test_id\_sc"

array set nodes {}
array set tasks {}

# some systems take a while to run the | sort -V | uniq -c stuff
set timeout 60

if {![test_select_type_params "CR_PACK_NODES"]} {
	skip "This test requires SelectTypeParameters=CR_PACK_NODES"
}
if {[test_front_end] != 0} {
	skip "This test is not compatible with front-end configurations"
}

#
# Read the CPUTot value that "scontrol show nodes" reports for each node
# in the global nodelist.
#
# Side effects: sets the global sys_homo to 1 when every node reported the
#               same CPUTot; sets exit_code to 1 if scontrol times out.
# Returns:      the lowest CPUTot value seen.
# Fails the test unless exactly num_nodes CPUTot values were reported.
#
proc check_node_config { } {
	global scontrol nodelist sys_homo number exit_code num_nodes

	set match 0
	set low   0
	set same  0

	log_user 0
	spawn $scontrol show nodes $nodelist
	expect {
		-re "CPUTot=($number)" {
			set cpus $expect_out(1,string)
			if {$match == 0} {
				# First node seen: it is the minimum so far
				set low $cpus
			} elseif {$cpus < $low} {
				set low $cpus
			} elseif {$cpus == $low} {
				incr same
			}
			incr match
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	if {$match != $num_nodes} {
		fail "Could not determine node config"
	} elseif {$same == ($num_nodes - 1)} {
		# Every node after the first matched the running minimum,
		# which can only happen when all values are equal.
		set sys_homo 1
	}
	return $low
}

#
# Verify that every node ran exactly ntaskspn tasks.
#
# ntaskspn - expected task count on each node
# tasks    - "array get" list mapping node index -> task count
#
# Logs an error and sets exit_code to 1 on mismatch.
#
proc check_tasks_all {ntaskspn tasks} {
	global exit_code num_nodes

	array set ntasks $tasks
	set match 0
	for {set i 0} {$i < $num_nodes} {incr i} {
		if {$ntasks($i) == $ntaskspn} {
			incr match
		}
	}

	if {$match != $num_nodes} {
		log_error "Incorrect number of tasks were set"
		set exit_code 1
	}
}

#
# Verify that every node except the last ran ntaskspn tasks and that the
# last node ran (ntaskspn - offset) tasks.
#
# ntaskspn - expected task count on the fully loaded nodes
# tasks    - "array get" list mapping node index -> task count
# offset   - how many tasks fewer the last node is expected to have
#
# Logs an error and sets exit_code to 1 on mismatch.
#
proc check_tasks_off {ntaskspn tasks offset} {
	global exit_code num_nodes

	array set ntasks $tasks
	set last  [expr {$num_nodes - 1}]
	set match 0
	for {set i 0} {$i < $last} {incr i} {
		if {$ntasks($i) == $ntaskspn} {
			incr match
		}
	}
	if {$ntasks($last) == [expr {$ntaskspn - $offset}]} {
		incr match
	}

	if {$match != $num_nodes} {
		log_error "Incorrect number of tasks were set $match != $num_nodes"
		set exit_code 1
	}
}

#
# Helper: return 1 if "scontrol show nodes <node>" reports CPUTot equal to
# expected, 0 otherwise. Sets exit_code to 1 if scontrol times out.
#
proc _node_cputot_is {node expected} {
	global scontrol exit_code

	set found 0
	spawn $scontrol show nodes $node
	expect {
		# Require a non-digit after the value so that an expected
		# count of e.g. 4 does not match "CPUTot=48".
		-re "CPUTot=$expected\[^0-9\]" {
			set found 1
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	return $found
}

#
# Verify that on every node CPUTot == (tasks run on that node) * ncpus.
#
# nodes    - "array get" list mapping node index -> node name
# job_cpus - "array get" list mapping node index -> task count
# ncpus    - cpus per task the job was submitted with
#
# Logs an error and sets exit_code to 1 on mismatch.
#
proc check_cpu_all {nodes job_cpus ncpus} {
	global exit_code num_nodes

	array set nnodes $nodes
	array set jcpus  $job_cpus
	set match 0
	for {set i 0} {$i < $num_nodes} {incr i} {
		set want [expr {$jcpus($i) * $ncpus}]
		incr match [_node_cputot_is $nnodes($i) $want]
	}

	if {$match != $num_nodes} {
		log_error "Incorrect number of cpus were set"
		set exit_code 1
	}
}

#
# Same as check_cpu_all, except that the last node is expected to have
# room for one more task: CPUTot == (tasks * ncpus) + ncpus.
#
# nodes    - "array get" list mapping node index -> node name
# job_cpus - "array get" list mapping node index -> task count
# ncpus    - cpus per task the job was submitted with
#
# Logs an error and sets exit_code to 1 on mismatch.
#
proc check_cpu_off {nodes job_cpus ncpus} {
	global exit_code num_nodes

	array set nnodes $nodes
	array set jcpus  $job_cpus
	set last  [expr {$num_nodes - 1}]
	set match 0
	for {set i 0} {$i < $last} {incr i} {
		set want [expr {$jcpus($i) * $ncpus}]
		incr match [_node_cputot_is $nnodes($i) $want]
	}
	set want [expr {($jcpus($last) * $ncpus) + $ncpus}]
	incr match [_node_cputot_is $nnodes($last) $want]

	if {$match != $num_nodes} {
		log_error "Incorrect number of cpus were set $match != $num_nodes"
		set exit_code 1
	}
}

#
# Helper: run "srun <srun_opts> printenv SLURMD_NODENAME | sort -V | uniq -c"
# and record, for each output line, the task count in the global array
# tasks(i) and the node name in the global array nodes(i).
#
# Sets exit_code to 1 if srun times out. Fails the test unless exactly
# num_nodes "<count> <node>" lines were seen.
#
proc _srun_count_tasks {srun_opts} {
	global srun bin_printenv exit_code num_nodes tasks nodes
	global bin_bash number re_word_str bin_sort bin_uniq

	# Wait awhile for the jobs to clean up
	sleep 2

	set x 0
	spawn $bin_bash -c "$srun $srun_opts $bin_printenv SLURMD_NODENAME | $bin_sort -V | $bin_uniq -c"
	expect {
		-re "job ($number)" {
			exp_continue
		}
		-re " ($number) ($re_word_str)" {
			set tasks($x) $expect_out(1,string)
			set nodes($x) $expect_out(2,string)
			incr x
			exp_continue
		}
		timeout {
			log_error "srun is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$x != $num_nodes} {
		fail "srun did not submit the jobs correctly $x != $num_nodes"
	}
}

#
# Run ntasks tasks with ncpus cpus per task (-c) on the nodes in nodelist
# and record the per-node task counts in the global tasks/nodes arrays.
#
proc submit_cpu {ntasks ncpus} {
	global nodelist num_nodes

	_srun_count_tasks "-N$num_nodes -n$ntasks -w$nodelist -c$ncpus --exclusive"
}

#
# Run ntasks tasks with --ntasks-per-node=ntaskpn on the nodes in nodelist
# and record the per-node task counts in the global tasks/nodes arrays.
#
proc submit_tasks {ntasks ntaskpn} {
	global nodelist num_nodes

	_srun_count_tasks "-N$num_nodes -n$ntasks --ntasks-per-node=$ntaskpn -w$nodelist --exclusive"
}

######################## Test Starts Here ########################

set nb_nodes [get_node_cnt_in_part]
if {$nb_nodes < $num_nodes} {
	skip "This test requires at least $num_nodes nodes in the cluster"
}

make_bash_script $file_in "true"

# Submit an exclusive job to get a nodelist
set tmp_id 0
spawn $sbatch -N$num_nodes -o/dev/null --exclusive $file_in
expect {
	-re "Submitted batch job ($number)" {
		set tmp_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$tmp_id == 0} {
	# Nothing below can work without a job to inspect
	fail "sbatch did not submit the job"
}

if {[wait_for_job $tmp_id "DONE"] != 0} {
	log_error "Error waiting for job $tmp_id to complete"
	cancel_job $tmp_id
	set exit_code 1
}

spawn $scontrol show job $tmp_id
expect {
	-re "NodeList=($re_word_str)" {
		set nodelist $expect_out(1,string)
		exp_continue
	}
	-re "NumCPUs=($number)" {
		set cputot $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "scontrol is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$nodelist eq "" || $cputot == 0} {
	# Without these every srun below would be submitted with bogus options
	fail "Could not determine NodeList/NumCPUs of job $tmp_id"
}

############# Test by CPU #############
log_info "====================Testing CPUs per Task===================="
set ncpuspt [check_node_config]

# Submit job with just one cpu per task
submit_cpu $cputot 1
check_cpu_all [array get nodes] [array get tasks] 1

# Submit job with the lowest cpu count of the nodes
submit_cpu [expr {$cputot / $ncpuspt}] $ncpuspt
check_cpu_all [array get nodes] [array get tasks] $ncpuspt

if {!$sys_homo} {
	# Submit job with lowest cpu count of the nodes and set tasks to 1
	# less the number of cpus (This test only works on heterogeneous systems)
	submit_cpu [expr {($cputot / $ncpuspt) - 1}] $ncpuspt
	check_cpu_off [array get nodes] [array get tasks] $ncpuspt
}

############# Test by node task #############
log_info "====================Testing Tasks per Node===================="
# One query is enough: both values derive from the same lowest CPUTot
set ntaskpn [check_node_config]
set ntask   [expr {$num_nodes * $ntaskpn}]

# Submit job with ntasks-per-node set to the lowest cpu count
submit_tasks $ntask $ntaskpn
check_tasks_all $ntaskpn [array get tasks]

# Submit job with one less number of ntasks to see that tasks are spread
# across all nodes
submit_tasks [expr {$ntask - 1}] $ntaskpn
check_tasks_off $ntaskpn [array get tasks] 1

# Submit job with two less number of ntasks to see that tasks are spread
# across all nodes
submit_tasks [expr {$ntask - 2}] $ntaskpn
check_tasks_off $ntaskpn [array get tasks] 2

if {$exit_code == 0} {
	exec $bin_rm $file_in
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}