#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test that the core spec option in sbatch allocates the correct
#          number of cores and that tasks spread over multiple nodes
#          when there are not enough resources on one node.
############################################################################
# Copyright (C) 2014 SchedMD LLC
# Written by Nathan Yee <nyee32@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

set file_in   "test$test_id\.in"
set file_out  "test$test_id\.out"
set spec_in   "spec_core_script\.in"
set exit_code 0

#############################################################################
#
# core_spec_job: submit one batch job with core specialization and verify
# how it was allocated.
#
# Arguments:
#   task      - signed offset added to the number of CPUs left on the node
#               after specialization; the absolute value of the sum (at
#               least 1) is used as the task count passed to sbatch -n
#   node      - node name passed to sbatch -w
#   core_spec - specialized core count passed to sbatch -S
#   exp_nodes - expected outcome:
#                 0: job must only use the specified node
#                 1: job must use more than the specified node
#                -1: job must fail because it exceeds the number of cores
#
# Reads the globals cpu_tot and thread_cnt (filled in by the main script)
# and sets the global exit_code to 1 on any verification failure.
#
#############################################################################
proc core_spec_job {task node core_spec exp_nodes} {
	global sbatch scontrol spec_in file_out number thread_cnt exit_code
	global cpu_tot

	set job_id    0
	set num_nodes 0

	# Determine the number of tasks that can be run once the specialized
	# cores (and all of their threads) are taken away from the node.
	set cpu_used_by_spec [expr {$thread_cnt * $core_spec}]
	if {$cpu_tot > $cpu_used_by_spec} {
		set task_limit [expr {$cpu_tot - $cpu_used_by_spec}]
	} else {
		set task_limit 1
	}
	set ntasks [expr {abs($task_limit + $task)}]
	if {$ntasks == 0} {
		set ntasks 1
	}

	# Submit the job; remember whether sbatch printed an error.
	set error_chk 0
	spawn $sbatch -t1 -w$node -S$core_spec -n$ntasks -o$file_out $spec_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		-re "error" {
			if {$exp_nodes != -1} {
				log_error "sbatch should not have produced an error"
				set exit_code 1
			}
			set error_chk 1
			exp_continue
		}
		timeout {
			log_error "sbatch is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$job_id == 0 && $error_chk == 0} {
		fail "Job was not submitted"
	} elseif {$exp_nodes == -1 && $job_id != 0} {
		fail "This job should have failed but did not"
	} elseif {$exp_nodes == -1 && $error_chk != 0} {
		log_debug "This error is expected do not worry"
	} elseif {$job_id == 0} {
		# sbatch reported an unexpected error and no job exists.  The
		# failure is already recorded in exit_code above; there is no
		# job to monitor, so do not query job 0.
		return
	} else {
		set core_chk 0
		if {[wait_for_job $job_id "RUNNING"] != 0} {
			log_error "Waiting for job to start"
			set exit_code 1
		}

		spawn $scontrol show job $job_id
		expect {
			-re "NumNodes=($number)" {
				set num_nodes $expect_out(1,string)
				exp_continue
			}
			-re "CoreSpec=$core_spec\[^0-9\]" {
				# The trailing non-digit keeps e.g. CoreSpec=2
				# from matching CoreSpec=20.
				set core_chk 1
				exp_continue
			}
			timeout {
				log_error "scontrol is not responding"
				set exit_code 1
			}
			eof {
				wait
			}
		}

		if {$core_chk == 0} {
			log_error "Job $job_id does not have the correct number of specialized cores"
			set exit_code 1
		}

		if {[wait_for_job $job_id "DONE"] != 0} {
			log_error "Waiting for job to complete"
			set exit_code 1
		}
	}

	# Verify the node count against the expectation (skipped for -1).
	if {$exp_nodes == 1} {
		if {$num_nodes <= 1} {
			log_error "Job $job_id should use more then 1 node"
			set exit_code 1
		}
	}

	if {$exp_nodes == 0} {
		if {$num_nodes != 1} {
			log_error "Job $job_id should use only $node"
			set exit_code 1
		}
	}
}

#############################################################################
#
# Tests begin here
#
#############################################################################

if {[test_linear]} {
	skip "This test is incompatible with select/linear"
}
if {[test_select_type_params "CR_SOCKET"]} {
	skip "This test is incompatible with CR_SOCKET allocations"
}

set allow_spec [test_allow_spec_resc]
if {$allow_spec == 0} {
	skip "AllowSpecResourcesUsage not configured to permit core specialization"
}

# Remove any vestigial files
exec $bin_rm -f $file_in $file_out $spec_in

# Probe script: print the configuration of the first allocated node.
make_bash_script $file_in "
first=\$($scontrol show hostnames \$SLURM_JOB_NODELIST | head -n1)
$scontrol show node \$first
"
# Payload script used by every core_spec_job submission.
make_bash_script $spec_in "sleep 5"

# Run the probe on two exclusive nodes so that the later "spread" tests
# have a second node available.
set job_id 0
spawn $sbatch --exclusive -t1 -N2 -o$file_out $file_in
expect {
	-re "Batch job submission failed" {
		skip "Can't test srun task distribution"
	}
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "sbatch did not submit job"
}

if {[wait_for_file $file_out] != 0} {
	fail "Output file was not created"
}

# Parse the node description written by the probe job.
set first_node ""
set core_cnt   0
set cpu_tot    1
set socket_cnt 1
set thread_cnt 1
spawn $bin_cat $file_out
expect {
	-re "NodeName=($re_word_str)" {
		set first_node $expect_out(1,string)
		exp_continue
	}
	-re "CoresPerSocket=($number)" {
		set core_cnt $expect_out(1,string)
		exp_continue
	}
	-re "CPUTot=($number)" {
		set cpu_tot $expect_out(1,string)
		exp_continue
	}
	-re "Sockets=($number)" {
		set socket_cnt $expect_out(1,string)
		exp_continue
	}
	-re "ThreadsPerCore=($number)" {
		set thread_cnt $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "cat is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

# Total cores on the node = cores per socket * sockets.
set core_cnt [expr {$core_cnt * $socket_cnt}]
if {$core_cnt == 0} {
	fail "sbatch did not find the number of cores"
}
if {$first_node eq ""} {
	# Without a node name the -w option below would be malformed.
	fail "sbatch did not find the name of the first node"
}
if {$core_cnt < 4} {
	skip "Core count too low for testing ($core_cnt < 4)"
}

#
# Using the core spec within the node limits
#
log_info "Run within the specified node"
core_spec_job  0 $first_node [expr {$core_cnt - 2}] 0
core_spec_job -2 $first_node [expr {$core_cnt - 2}] 0

#
# Using core spec with more tasks than the node can handle. This should
# cause the tasks to spread across multiple nodes as needed
#
log_info "Spread job across multiple nodes"
core_spec_job 1 $first_node [expr {$core_cnt - 2}] 1
core_spec_job 1 $first_node [expr {$core_cnt - 1}] 1

#
# Using core spec with more cores than the specified node has
#
log_info "Fail by trying to use more cores than exist"
core_spec_job 1 $first_node [expr {$core_cnt + 5}] -1
core_spec_job 1 $first_node [expr {$core_cnt + 7}] -1

if {$exit_code == 0} {
	exec $bin_rm -f $file_in $file_out $spec_in
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}