#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test --gpus-per-task with implicit task count.
############################################################################
# Copyright (C) 2019 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
############################################################################
source ./globals

# Overall result: any sub-test that fails sets this to 1 and the script
# reports the failure once at the very end.
set exit_code     0
set file_in1      "test$test_id.input1"
set file_in2      "test$test_id.input2"
# Matches a CUDA_VISIBLE_DEVICES style value (digits, underscores, commas).
set number_commas "\[0-9_,\]+"

#
# Configuration checks: skip the test when the system cannot run it
#
if {[test_cons_tres]} {
	log_debug "Valid configuration, using select/cons_tres"
} else {
	skip "This test is only compatible with select/cons_tres"
}
if {[test_front_end]} {
	skip "This test is incompatible with front-end systems"
}

# Use at most 3 nodes of the default partition
set nb_nodes [get_node_cnt_in_part]
log_debug "Default partition node count $nb_nodes"
if {$nb_nodes > 3} {
	set nb_nodes 3
}
log_debug "Using node count $nb_nodes"

set gpu_cnt [get_highest_gres_count $nb_nodes "gpu"]
if {$gpu_cnt < 1} {
	skip "This test requires 1 or more GPUs on $nb_nodes nodes of the default partition"
}
log_debug "GPU count is $gpu_cnt"

#
# Build input script files
#
# file_in1: the task with local ID 0 on each node reports the GPUs it sees,
# and global task 0 additionally prints the job's GRES lines.
# file_in2: every task reports the GPUs it sees, and global task 0
# additionally prints the job's NumCPUs and GRES lines.
#
make_bash_script $file_in1 "
if \[ \$SLURM_LOCALID -eq 0 \]; then
	echo HOST:\$SLURMD_NODENAME NODE_CNT:\$SLURM_NNODES CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES
fi
if \[ \$SLURM_PROCID -eq 0 \]; then
	$scontrol -dd show job \$SLURM_JOB_ID | grep \"GRES=\"
fi
exit 0"

make_bash_script $file_in2 "echo HOST:\$SLURMD_NODENAME NODE_CNT:\$SLURM_NNODES CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES
if \[ \$SLURM_PROCID -eq 0 \]; then
	$scontrol -dd show job \$SLURM_JOB_ID | grep \"NumCPUs=\"
	$scontrol -dd show job \$SLURM_JOB_ID | grep \"GRES=\"
fi
exit 0"

#
# One GPU per task with node count (without range)
#
# The total number of GPUs reported (one report per node) must equal the
# node count reported by the job. node_cnt starts at -1 so that a run
# producing no output at all is also flagged as a failure.
#
log_info "TEST: One GPU per task with node count (without range)"
set timeout $max_job_delay
set match 0
set node_cnt -1
set srun_pid [spawn $srun --nodes=$nb_nodes --gpus-per-task=1 -t1 -J "test$test_id" -l ./$file_in1]
expect {
	-re "NODE_CNT:($number_commas) CUDA_VISIBLE_DEVICES:($number_commas)" {
		set node_cnt $expect_out(1,string)
		incr match [cuda_count $expect_out(2,string)]
		exp_continue
	}
	timeout {
		log_error "srun not responding"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != $node_cnt} {
	log_error "srun --gpus-per-task failure ($match != $node_cnt)"
	set exit_code 1
}

#
# Two GPUs per task with node count (without range)
#
# Same as above, but every node is expected to report two GPUs.
#
if {$gpu_cnt > 1} {
	log_info "TEST: Two GPUs per task with node count (without range)"
	set match 0
	set node_cnt -1
	set srun_pid [spawn $srun --nodes=$nb_nodes --gpus-per-task=2 -t1 -J "test$test_id" -l ./$file_in1]
	expect {
		-re "NODE_CNT:($number_commas) CUDA_VISIBLE_DEVICES:($number_commas)" {
			set node_cnt $expect_out(1,string)
			incr match [cuda_count $expect_out(2,string)]
			exp_continue
		}
		timeout {
			log_error "srun not responding"
			slow_kill $srun_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	set exp_gpu_cnt [expr {$node_cnt * 2}]
	if {$match != $exp_gpu_cnt} {
		log_error "srun --gpus-per-task failure ($match != $exp_gpu_cnt)"
		set exit_code 1
	}
}

#
# One GPU per task with node count range and task count resulting in uneven task distribution
#
# One more task than nodes is requested, so at least one node runs two
# tasks. The summed GPU count over the per-node reports must equal the
# task count.
#
if {$gpu_cnt > 1 && $nb_nodes > 1} {
	log_info "TEST: Node count range and uneven task distribution"
	set match 0
	set node_cnt -1
	set task_cnt [expr {$nb_nodes + 1}]
	set srun_pid [spawn $srun --nodes=2-$nb_nodes --ntasks=$task_cnt --gpus-per-task=1 -t1 -J "test$test_id" -l ./$file_in1]
	expect {
		-re "NODE_CNT:($number_commas) CUDA_VISIBLE_DEVICES:($number_commas)" {
			set node_cnt $expect_out(1,string)
			incr match [cuda_count $expect_out(2,string)]
			exp_continue
		}
		timeout {
			log_error "srun not responding"
			slow_kill $srun_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$match != $task_cnt} {
		log_error "srun --gpus-per-task failure ($match != $task_cnt)"
		set exit_code 1
	}
}

#
# One task and GPU for each GPU available at step level
#
# Every task prints its own CUDA_VISIBLE_DEVICES. The expected total is
# task_cnt * task_cnt, i.e. each task is expected to list as many GPUs as
# there are tasks (presumably because all tasks on the node see every GPU
# of the step - confirm against the Slurm version under test).
#
if {$gpu_cnt > 1} {
	log_info "TEST: One task and GPU for each GPU available at step level"
	set match 0
	set task_cnt 0
	# FIXME: RANGE CHECK CPU/GPU COUNT
	set salloc_pid [spawn $salloc -N1 --gpus=$gpu_cnt -t1 -J "test$test_id" $srun -n $gpu_cnt -O --gpus-per-task=1 ./$file_in2]
	expect {
		-re "CUDA_VISIBLE_DEVICES:($number_commas)" {
			incr task_cnt
			incr match [cuda_count $expect_out(1,string)]
			exp_continue
		}
		timeout {
			# The process spawned here is salloc, so that is the pid to kill
			log_error "srun not responding"
			slow_kill $salloc_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	set exp_gpu_cnt [expr {$task_cnt * $task_cnt}]
	if {$task_cnt == 0} {
		log_error "srun --gpus-per-task test failed to run"
		set exit_code 1
	} elseif {$match != $exp_gpu_cnt} {
		log_error "srun --gpus-per-task failure ($match != $exp_gpu_cnt)"
		set exit_code 1
	}
}

#
# One task and two GPUs as resources available at step level
#
# Also records the job's CPU count and node name from the scontrol output;
# both are reused by the final test below.
#
if {$gpu_cnt > 1} {
	log_info "TEST: One task and two GPUs as resources available at step level"
	set hostname "unknown"
	set match 0
	set num_cpus 0
	set task_cnt 0
	set tasks_spawned [expr {$gpu_cnt / 2}]
	set salloc_pid [spawn $salloc -N1 --exclusive --gpus=$gpu_cnt -t1 -J "test$test_id" $srun -n $tasks_spawned -O --gpus-per-task=2 ./$file_in2]
	expect {
		-re "CUDA_VISIBLE_DEVICES:($number_commas)" {
			incr task_cnt
			incr match [cuda_count $expect_out(1,string)]
			exp_continue
		}
		-re "NumCPUs=($number)" {
			set num_cpus $expect_out(1,string)
			exp_continue
		}
		-re " Nodes=($re_word_str)" {
			set hostname $expect_out(1,string)
			exp_continue
		}
		timeout {
			# The process spawned here is salloc, so that is the pid to kill
			log_error "srun not responding"
			slow_kill $salloc_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	set exp_gpu_cnt [expr {$task_cnt * $task_cnt * 2}]
	if {$task_cnt == 0} {
		log_error "srun --gpus-per-task test failed to run"
		set exit_code 1
	} elseif {$match != $exp_gpu_cnt} {
		log_error "srun --gpus-per-task failure ($match != $exp_gpu_cnt)"
		set exit_code 1
	}
}

#
# Step allocation of GPUs based upon CPUs per task
#
# num_cpus and hostname are only defined when the previous test ran, which
# requires gpu_cnt > 1. The && below short-circuits, so num_cpus is never
# read when it is undefined.
#
if {$gpu_cnt > 1 && $num_cpus > 0} {
	log_info "TEST: Step allocation of GPUs based upon CPUs per task"
	set match 0
	set task_cnt 0
	if {$gpu_cnt > $num_cpus} {
		set cpus_per_task 1
		set gpus_per_task [expr {$gpu_cnt / $num_cpus}]
	} else {
		set cpus_per_task $num_cpus
		set gpus_per_task 1
	}
	set salloc_pid [spawn $salloc -N1 --exclusive -w $hostname --gpus=$gpu_cnt -t1 -J "test$test_id" $srun -c $cpus_per_task --gpus-per-task=$gpus_per_task ./$file_in2]
	expect {
		-re "CUDA_VISIBLE_DEVICES:($number_commas)" {
			incr task_cnt
			incr match [cuda_count $expect_out(1,string)]
			exp_continue
		}
		timeout {
			# The process spawned here is salloc, so that is the pid to kill
			log_error "srun not responding"
			slow_kill $salloc_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	set exp_gpu_cnt [expr {$task_cnt * $task_cnt * $gpus_per_task}]
	if {$task_cnt == 0} {
		fail "srun --gpus-per-task test failed to run"
	} elseif {$match != $exp_gpu_cnt} {
		fail "srun --gpus-per-task failure ($match != $exp_gpu_cnt)"
	}
}

# Remove the generated scripts only on success so they remain available
# for debugging after a failure.
if {$exit_code == 0} {
	exec $bin_rm -f $file_in1 $file_in2
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}