#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test exclusive resource allocation for a step (--exclusive option).
############################################################################
# Copyright (C) 2007 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

# Overall test status; any non-fatal problem sets this to 1 and the test
# fails at the next checkpoint.
set exit_code    0

# Scratch files: two generated bash scripts and the output of the two
# batch jobs that run them.
set file_in      "test$test_id.input"
set file_in2     "test$test_id.input2"
set file_out     "test$test_id.output"
set file_out2    "test$test_id.output2"

set job_id       0
set gpu_tot      0
set job_tres_cnt 0

if {[test_front_end]} {
	skip "This test is incompatible with front-end systems"
}

#
# The final part of the test compares a job's GPU count against the node's
# GPU count, so exactly one node able to satisfy --gres=gpu:2 is required.
#
set node_name [get_nodes_by_request "--gres=gpu:2 -n1 -t1"]
if { [llength $node_name] != 1 } {
	skip "This test need to be able to submit jobs with at least --gres=gpu:2"
}
set gpu_tot [dict get [get_gres_count "gpu" $node_name] $node_name]

#
# Delete left-over input script
# Build input script file
# Run one more step than allocated CPUs and make sure it waits
# The "sleep 4" is meant to ensure the earlier job steps start first
#
exec $bin_rm -f $file_in $file_in2 $file_out $file_out2
make_bash_script $file_in "
  echo tasks_per_node=\$SLURM_TASKS_PER_NODE
  if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
    sleep_secs=45
  else
    sleep_secs=10
  fi
  inx=0
  while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
  do
    $srun --exclusive -n1 $bin_sleep \$sleep_secs &
    inx=\$((inx+1))
  done
  $bin_sleep 4
  $srun -v --exclusive -n1 ./$file_in2 &
  wait
"
make_bash_script $file_in2 "
  $scontrol show steps
"

#
# Spawn a job via sbatch
# On timeout the error is recorded but expect keeps waiting for EOF, so a
# late "Submitted batch job" line is still captured.
#
spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out ./$file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch not responding"
		set exit_code 1
		exp_continue
	}
	eof {
		wait
	}
}
if { $job_id == 0 } {
	fail "Failed to submit job"
}

#
# Wait for job to complete
#
if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "Waiting for job to complete"
	cancel_job $job_id
	set exit_code 1
}

#
# Check for desired output
# The extra step runs "scontrol show steps"; it must never see more steps
# of this job than the number of tasks on the node.
# Every pattern action ends with exp_continue so the whole file is scanned
# and the cat process is reaped at EOF.
#
if {[wait_for_file $file_out] != 0} {
	fail "Output file $file_out is missing"
}
set matches 0
set tasks_per_node 0
spawn $bin_cat $file_out
expect {
	-re "tasks_per_node=($number)" {
		set tasks_per_node $expect_out(1,string)
		exp_continue
	}
	-re "StepId=$job_id" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if { $matches > $tasks_per_node } {
	log_error "Problem with exclusive resource allocation for step ($matches > $tasks_per_node)"
	set exit_code 1
}

if {$exit_code == 0} {
	log_info "So far, so good. Trying with --immediate option"
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}

#
# Delete left-over input script
# Build another input script file
# Run one more step than allocated CPUs with immediate option and make sure
# it aborts
# The "sleep" is meant to ensure the earlier job steps start first
#
exec $bin_rm -f $file_in $file_out
make_bash_script $file_in "
  inx=0
  if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
    sleep_secs=45
  else
    sleep_secs=10
  fi
  while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
  do
    $srun --exclusive -n1 --mem=0 $bin_sleep \$sleep_secs &
    inx=\$((inx+1))
  done
  $bin_sleep 4
  $srun -v --exclusive -n1 --mem=0 --immediate ./$file_in2 &
  wait
"

#
# Spawn a job via sbatch
# Reset job_id first: otherwise a failed submission would go unnoticed and
# the checks below would silently reuse the id of the first job.
#
set job_id 0
spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out2 ./$file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch not responding"
		set exit_code 1
		exp_continue
	}
	eof {
		wait
	}
}
if { $job_id == 0 } {
	fail "Failed to submit job"
}

#
# Wait for job to complete
#
if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "Waiting for job to complete"
	cancel_job $job_id
	set exit_code 1
}

#
# Check for desired output
# The extra --immediate step must be rejected exactly once ("Unable to
# create ") and therefore must never get far enough to list any step.
#
if {[wait_for_file $file_out2] != 0} {
	fail "Output file $file_out2 is missing"
}
set matches 0
spawn $bin_cat $file_out2
expect {
	-re "StepId=$job_id" {
		log_error "Problem --exclusive and --immediate option for step"
		set exit_code 1
		exp_continue
	}
	-re "Unable to create " {
		log_debug "This error was expected, no worries"
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if { $matches != 1 } {
	log_error "Problem --exclusive and --immediate option for step ($matches != 1)"
	set exit_code 1
}

#
# Verify that all GPUs are allocated with the --exclusive flag
#
set job_id 0
spawn $srun -N1 -w $node_name --gres=gpu --exclusive $bin_printenv SLURM_JOBID
expect {
	-re "($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "srun not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
# Without a job id there is nothing to inspect or wait for.
if { $job_id == 0 } {
	fail "Failed to get a job id from srun"
}

set job_gpu_cnt [get_job_gpu_cnt $job_id]
if {$gpu_tot != $job_gpu_cnt} {
	log_error "Job didn't allocate all GPUs ($gpu_tot != $job_gpu_cnt)"
	set exit_code 1
} else {
	log_info "Node GPU count matches Job GPU count ($gpu_tot == $job_gpu_cnt)"
}

if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "job wasn't able to complete\n"
	cancel_job $job_id
	set exit_code 1
}

# Scratch files are removed only on success so that a failure can be
# diagnosed from the generated scripts and job output.
if {$exit_code == 0} {
	exec $bin_rm -f $file_in $file_in2 $file_out $file_out2
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}