#!/usr/bin/env expect
############################################################################
# Purpose: Validate heterogeneous gpu job options.
############################################################################
# Copyright (C) 2020 SchedMD LLC.
# Written by Brian Christiansen
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
# NOTE(review): helpers used below that are not defined in this file (skip,
# fail, slow_kill, assert_or_fail, log_info, log_warn, cancel_job,
# test_cons_tres, get_highest_gres_count) and the variables $number,
# $sbatch, $srun, $salloc, $scontrol, $bin_bash and $bin_grep are presumably
# provided by ./globals -- confirm.
source ./globals

set exit_code 0
set is_skip false

if {[get_highest_gres_count 1 "gpu"] < 2} {
    skip "This test requires 2 or more GPUs per node in the default partition"
}

# Run a submission command that is expected to be rejected.
#
# cmd            - full command line as a string; it is expanded into
#                  separate words when spawned
# expected_error - regular expression that must appear in the command output
#
# The pattern must be seen exactly once before the command reaches EOF;
# otherwise assert_or_fail is invoked with a diagnostic message.
proc submit_job_error {cmd expected_error} {
    set err_match 0

    set cmd_pid [spawn {*}$cmd]
    expect {
        -re $expected_error {
            incr err_match
            exp_continue
        }
        timeout {
            slow_kill $cmd_pid
            fail "command not responding"
        }
        eof {
            wait
        }
    }

    assert_or_fail {$err_match == 1} "Didn't get expected error ($expected_error)"
}

# Run a submission command and extract the job id from its output.
#
# cmd - full command line as a string; it is expanded into separate words
#       when spawned
#
# Returns the first value matching "job <number>" in the output. Calls fail
# if the command times out or ends without printing a job id.
proc submit_job {cmd} {
    global number

    set job_id 0

    # The first match ends the expect; the command is deliberately not
    # followed to EOF here.
    set cmd_pid [spawn {*}$cmd]
    expect {
        -re "job ($number)" {
            set job_id $expect_out(1,string)
        }
        timeout {
            slow_kill $cmd_pid
            fail "command not responding"
        }
        eof {
            wait
        }
    }

    if {!$job_id} {
        fail "didn't get a job_id"
    }

    return $job_id
}

# Verify fields of a job as reported by "scontrol show job".
#
# job_id - job to inspect
# grep   - pattern handed to grep -i to filter the scontrol output
# re     - regular expression that must match the filtered output exactly
#          once
proc check_job {job_id grep re} {
    global bin_bash bin_grep scontrol

    set matches 0

    spawn $bin_bash -c "exec $scontrol show job $job_id | $bin_grep -i '$grep'"
    expect {
        -re $re {
            incr matches
            exp_continue
        }
        timeout {
            fail "scontrol not responding"
        }
        eof {
            wait
        }
    }

    assert_or_fail {$matches == 1} "Didn't match $re ($matches != 1)"
}

# Build the command line for one heterogeneous-job submission.
#
# cmd_name - "sbatch" or "srun"; any other value selects salloc
# het_args - option string whose components are separated by " : "
#
# Returns the command as a single string.
proc het_gpu_build_cmd {cmd_name het_args} {
    global sbatch srun salloc

    if {$cmd_name eq "sbatch"} {
        return "$sbatch -o/dev/null $het_args --wrap=hostname"
    } elseif {$cmd_name eq "srun"} {
        return "$srun $het_args hostname"
    }
    return "$salloc $het_args hostname"
}

# Submit every option string with sbatch, srun and salloc and require the
# paired error pattern in the output of each submission.
#
# tests             - list of option strings
# submission_errors - list of error patterns, paired with tests by position
proc het_gpu_run_error_cases {tests submission_errors} {
    foreach cmd {sbatch srun salloc} {
        foreach a $tests b $submission_errors {
            submit_job_error [het_gpu_build_cmd $cmd $a] $b
        }
    }
}

# Submit every option string with sbatch, srun and salloc, check the
# resulting job against the paired pattern and then cancel the job.
#
# tests   - list of option strings
# regexes - list of patterns, paired with tests by position
# field   - scontrol field name; only lines starting with JobId or
#           containing this name are kept for matching
proc het_gpu_run_match_cases {tests regexes field} {
    foreach cmd {sbatch srun salloc} {
        foreach a $tests re $regexes {
            set job_id [submit_job [het_gpu_build_cmd $cmd $a]]
            check_job $job_id "^JobId\\|$field" $re
            cancel_job $job_id
        }
    }
}

# Check --gpu-bind on heterogeneous jobs: invalid values must be rejected in
# either component, and valid values must appear as TresBind.
proc test_gpu_bind {} {
    log_info "Testing --gpu-bind"

    set tests [list \
        "--gpu-bind=blah : --gpu-bind=closest" \
        "--gpu-bind=closest : --gpu-bind=blah" \
    ]
    set submission_errors {
        "error: Invalid --gpu-bind argument: gpu:blah"
        "error: Invalid --gpu-bind argument: gpu:blah"
    }
    het_gpu_run_error_cases $tests $submission_errors

    set tests [list \
        "--gpu-bind=closest : --gpu-bind=closest" \
        "--gpu-bind=closest : --gpu-bind=map_gpu:1" \
        "--gpu-bind=map_gpu:1 : --gpu-bind=closest" \
        "--gpu-bind=closest : -n1" \
        "-n1 : --gpu-bind=closest" \
    ]
    set regexes [list \
        "^JobId=.*TresBind=gpu:closest.*JobId=.*TresBind=gpu:closest" \
        "^JobId=.*TresBind=gpu:closest.*JobId=.*TresBind=gpu:map_gpu:1" \
        "^JobId=.*TresBind=gpu:map_gpu:1.*JobId=.*TresBind=gpu:closest" \
        "^JobId=.*TresBind=gpu:closest.*JobId=" \
        "^JobId=.*JobId=.*TresBind=gpu:closest" \
    ]
    het_gpu_run_match_cases $tests $regexes "TresBind"
}

# Check --gpu-freq on heterogeneous jobs: invalid values must be rejected in
# either component, and valid values must appear as TresFreq.
proc test_gpu_freq {} {
    log_info "Testing --gpu-freq"

    set tests [list \
        "--gpu-freq=blah : --gpu-freq=low" \
        "--gpu-freq=low : --gpu-freq=blah" \
    ]
    set submission_errors {
        "error: Invalid --gpu-freq argument: gpu:blah"
        "error: Invalid --gpu-freq argument: gpu:blah"
    }
    het_gpu_run_error_cases $tests $submission_errors

    set tests [list \
        "--gpu-freq=low : --gpu-freq=low" \
        "--gpu-freq=low : --gpu-freq=medium" \
        "--gpu-freq=medium : --gpu-freq=low" \
        "--gpu-freq=low : -n1" \
        "-n1 : --gpu-freq=low" \
    ]
    set regexes [list \
        "^JobId=.*TresFreq=gpu:low.*JobId=.*TresFreq=gpu:low" \
        "^JobId=.*TresFreq=gpu:low.*JobId=.*TresFreq=gpu:medium" \
        "^JobId=.*TresFreq=gpu:medium.*JobId=.*TresFreq=gpu:low" \
        "^JobId=.*TresFreq=gpu:low.*JobId=" \
        "^JobId=.*JobId=.*TresFreq=gpu:low" \
    ]
    het_gpu_run_match_cases $tests $regexes "TresFreq"
}

# Check that --cpus-per-gpu is reported per component as CpusPerTres.
proc test_cpus_per_gpu {} {
    log_info "Testing --cpus-per-gpu"

    set tests [list \
        "--cpus-per-gpu=1 : --cpus-per-gpu=2" \
        "--cpus-per-gpu=2 : --cpus-per-gpu=1" \
        "--cpus-per-gpu=2 : -n1" \
        "-n1 : --cpus-per-gpu=2" \
    ]
    set regexes [list \
        "^JobId=.*CpusPerTres=gpu:1.*JobId=.*CpusPerTres=gpu:2" \
        "^JobId=.*CpusPerTres=gpu:2.*JobId=.*CpusPerTres=gpu:1" \
        "^JobId=.*CpusPerTres=gpu:2.*JobId=.*" \
        "^JobId=.*JobId=.*CpusPerTres=gpu:2" \
    ]
    het_gpu_run_match_cases $tests $regexes "CpusPerTres"
}

# Check that --gpus is reported per component as TresPerJob.
proc test_gpus_per_job {} {
    # The option exercised here is --gpus; the previous message named a
    # non-existent --cpus-per-job option.
    log_info "Testing --gpus"

    set tests [list \
        "-n1 --gpus=1 : -n1 --gpus=2" \
        "-n1 --gpus=2 : -n1 --gpus=1" \
        "-n1 --gpus=2 : -n1" \
        "-n1 : -n1 --gpus=2" \
    ]
    set regexes [list \
        "^JobId=.*TresPerJob=gpu:1.*JobId=.*TresPerJob=gpu:2" \
        "^JobId=.*TresPerJob=gpu:2.*JobId=.*TresPerJob=gpu:1" \
        "^JobId=.*TresPerJob=gpu:2.*JobId=.*" \
        "^JobId=.*JobId=.*TresPerJob=gpu:2" \
    ]
    het_gpu_run_match_cases $tests $regexes "TresPerJob"
}

# Check that --gpus-per-node and --gres=gpu:N, alone and combined, are
# reported per component as TresPerNode.
proc test_gpus_per_node {} {
    log_info "Testing --gpus-per-node"

    set tests [list \
        "-n1 --gpus-per-node=1 : -n1 --gpus-per-node=2" \
        "-n1 --gpus-per-node=2 : -n1 --gpus-per-node=1" \
        "-n1 --gpus-per-node=2 : -n1" \
        "-n1 : -n1 --gpus-per-node=2" \
        "-n1 --gres=gpu:1 : -n1 --gres=gpu:2" \
        "-n1 --gres=gpu:2 : -n1 --gres=gpu:1" \
        "-n1 --gres=gpu:2 : -n1" \
        "-n1 : -n1 --gres=gpu:2" \
        "-n1 --gpus-per-node=1 --gres=gpu:2 : -n1 --gpus-per-node=2 --gres=gpu:1" \
    ]
    set regexes [list \
        "^JobId=.*TresPerNode=gpu:1.*JobId=.*TresPerNode=gpu:2" \
        "^JobId=.*TresPerNode=gpu:2.*JobId=.*TresPerNode=gpu:1" \
        "^JobId=.*TresPerNode=gpu:2.*JobId=.*" \
        "^JobId=.*JobId=.*TresPerNode=gpu:2" \
        "^JobId=.*TresPerNode=gpu:1.*JobId=.*TresPerNode=gpu:2" \
        "^JobId=.*TresPerNode=gpu:2.*JobId=.*TresPerNode=gpu:1" \
        "^JobId=.*TresPerNode=gpu:2.*JobId=.*" \
        "^JobId=.*JobId=.*TresPerNode=gpu:2" \
        "^JobId=.*TresPerNode=gpu:1,gpu:2.*JobId=.*TresPerNode=gpu:2,gpu:1" \
    ]
    het_gpu_run_match_cases $tests $regexes "TresPerNode"
}

# Check --gpus-per-socket: it must be rejected without --sockets-per-node,
# and otherwise be reported per component as TresPerSocket.
proc test_gpus_per_socket {} {
    log_info "Testing --gpus-per-socket"

    set tests [list \
        "-n1 --gpus-per-socket=1 : -n1 " \
        "-n1 : -n1 --gpus-per-socket=1" \
    ]
    set submission_errors {
        "--gpus-per-socket option requires --sockets-per-node specification"
        "--gpus-per-socket option requires --sockets-per-node specification"
    }
    het_gpu_run_error_cases $tests $submission_errors

    set tests [list \
        "-n1 --sockets-per-node=1 --gpus-per-socket=1 : -n1 --sockets-per-node=1 --gpus-per-socket=2" \
        "-n1 --sockets-per-node=1 --gpus-per-socket=2 : -n1 --sockets-per-node=1 --gpus-per-socket=1" \
        "-n1 --sockets-per-node=1 --gpus-per-socket=2 : -n1" \
        "-n1 : -n1 --sockets-per-node=1 --gpus-per-socket=2" \
    ]
    set regexes [list \
        "^JobId=.*TresPerSocket=gpu:1.*JobId=.*TresPerSocket=gpu:2" \
        "^JobId=.*TresPerSocket=gpu:2.*JobId=.*TresPerSocket=gpu:1" \
        "^JobId=.*TresPerSocket=gpu:2.*JobId=.*" \
        "^JobId=.*JobId=.*TresPerSocket=gpu:2" \
    ]
    het_gpu_run_match_cases $tests $regexes "TresPerSocket"
}

# Check that --gpus-per-task is reported per component as TresPerTask.
proc test_gpus_per_task {} {
    log_info "Testing --gpus-per-task"

    set tests [list \
        "-n1 --gpus-per-task=1 : -n1 --gpus-per-task=2" \
        "-n1 --gpus-per-task=2 : -n1 --gpus-per-task=1" \
        "-n1 --gpus-per-task=2 : -n1" \
        "-n1 : -n1 --gpus-per-task=2" \
    ]
    set regexes [list \
        "^JobId=.*TresPerTask=gpu:1.*JobId=.*TresPerTask=gpu:2" \
        "^JobId=.*TresPerTask=gpu:2.*JobId=.*TresPerTask=gpu:1" \
        "^JobId=.*TresPerTask=gpu:2.*JobId=.*" \
        "^JobId=.*JobId=.*TresPerTask=gpu:2" \
    ]
    het_gpu_run_match_cases $tests $regexes "TresPerTask"
}

# Check that --mem-per-gpu is reported per component as MemPerTres.
proc test_mem_per_gpu {} {
    log_info "Testing --mem-per-gpu"

    set tests [list \
        "-n1 --mem-per-gpu=1 : -n1 --mem-per-gpu=2" \
        "-n1 --mem-per-gpu=2 : -n1 --mem-per-gpu=1" \
        "-n1 --mem-per-gpu=2 : -n1" \
        "-n1 : -n1 --mem-per-gpu=2" \
    ]
    set regexes [list \
        "^JobId=.*MemPerTres=gpu:1.*JobId=.*MemPerTres=gpu:2" \
        "^JobId=.*MemPerTres=gpu:2.*JobId=.*MemPerTres=gpu:1" \
        "^JobId=.*MemPerTres=gpu:2.*JobId=.*" \
        "^JobId=.*JobId=.*MemPerTres=gpu:2" \
    ]
    het_gpu_run_match_cases $tests $regexes "MemPerTres"
}

# These three groups run unconditionally.
test_gpu_bind
test_gpu_freq
test_gpus_per_node

# The remaining groups only run when test_cons_tres reports true.
if {[test_cons_tres]} {
    test_cpus_per_gpu
    test_gpus_per_job
    test_gpus_per_socket
    test_gpus_per_task
    test_mem_per_gpu
} else {
    log_warn "Some tests are skipped because they require SelectType=cons_tres."
    set is_skip true
}

if {$exit_code} {
    fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}
if {$is_skip} {
    skip "Some tests were skipped"
}