#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Increase size of job with allocated GPUs
############################################################################
# Copyright (C) 2010 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

set exit_code   0
set file_in1    "test$test_id.input1"
set file_in2    "test$test_id.input2"
set file_in3    "test$test_id.input3"
set file_out1   "test$test_id.output1"
set file_out2   "test$test_id.output2"

if {![test_scheduler_params "permit_job_expansion"]} {
	skip "This test is only compatible with SchedulerParameters=permit_job_expansion"
}

if {[test_cons_tres]} {
	log_debug "Valid configuration, using select/cons_tres"
} else {
	skip "This test is only compatible with select/cons_tres"
}
if {[test_front_end]} {
	skip "This test is incompatible with front-end systems"
}

set nb_nodes [available_nodes]
if {$nb_nodes < 2} {
	skip "Not enough nodes currently available ($nb_nodes avail, 2 needed)"
}

set gpu_cnt [get_highest_gres_count 2 "gpu"]
if {$gpu_cnt < 0} {
	fail "Error getting GPU count"
}
if {$gpu_cnt < 1} {
	skip "This test requires 1 or more GPUs per node in the default partition"
}

#
# Build input scripts
# file_in1: Determine GPUs allocated, wait for dependent job to exit,
#           expand allocation and run another GPU job
# file_in2: Determine GPUs allocated, shrink to size 0 and exit
# file_in3: Print the hostname and GPU IDs
#
exec $bin_rm -f $file_out1 $file_out2
make_bash_script $file_in1 "
	$scontrol -dd show job \${SLURM_JOBID}
	$srun ./$file_in3
	sleep 20	# Wait for job 2 submission
	while true; do
		# Loop until the child job no longer appears in the queue
		# (wc reports '0 0 0' on empty squeue output)
		$squeue -h -n test_child_$test_id | wc | grep -e ' *0 *0 *0'
		if \[ \${?} -eq 0 \]; then
			break
		fi
		sleep 5
	done
	$scontrol update JobId=\${SLURM_JOBID} NumNodes=ALL
	. slurm_job_\${SLURM_JOBID}_resize.sh
	$scontrol -dd show job \${SLURM_JOBID}
	$srun ./$file_in3
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.csh
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.sh
	exit 0"

make_bash_script $file_in2 "
	$scontrol -dd show job \${SLURM_JOBID}
	$scontrol update JobId=\${SLURM_JOBID} NumNodes=0
	. slurm_job_\${SLURM_JOBID}_resize.sh
	# JOB GETS CANCELLED HERE AS BATCH HOST GETS REMOVED FROM JOB ALLOCATION
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.csh
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.sh
	exit 0"

make_bash_script $file_in3 "
	echo 'HOST:'\$SLURMD_NODENAME 'CUDA_VISIBLE_DEVICES:'\$CUDA_VISIBLE_DEVICES"

#
# Submit job to expand: uses one GPU on one node
#
set job_id1 0
spawn $sbatch -N1 --exclusive -J "test$test_id" -t2 --gpus=1 --output=$file_out1 $file_in1
expect {
	-re "Submitted batch job ($number)" {
		set job_id1 $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id1 == 0} {
	fail "Job 1 not submitted"
}
if {[wait_for_job $job_id1 "RUNNING"] != 0} {
	cancel_job $job_id1
	fail "Job 1 did not start"
}

#
# Submit job to shrink: uses one GPU on one node
#
set job_id2 0
spawn $sbatch -N1 --exclusive -J "test_child_$test_id" --dependency=expand:$job_id1 -t1 --gpus=1 --output=$file_out2 $file_in2
expect {
	-re "Submitted batch job ($number)" {
		set job_id2 $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id2 == 0} {
	cancel_job $job_id1
	fail "Job 2 not submitted"
}
if {[wait_for_job $job_id1 "DONE"] != 0} {
	cancel_job $job_id1
	cancel_job $job_id2
	fail "Job 1 did not complete"
}
if {[wait_for_job $job_id2 "DONE"] != 0} {
	cancel_job $job_id2
	fail "Job 2 did not complete"
}

#
# Parse the output files from job 1
#
log_debug "Parse job 1 output"
if {[wait_for_file $file_out1] != 0} {
	fail "No output file"
}
set match 0
spawn $bin_cat $file_out1
expect {
	-re "CUDA_VISIBLE_DEVICES" {
		incr match
		exp_continue
	}
	eof {
		wait
	}
}
if {$match != 3} {
	log_error "Bad CUDA information about job 1 ($match != 3)"
	set exit_code 1
}

#
# Parse the output files from job 2
# Not currently looking for anything, but do log its contents before purging
#
log_debug "Parse job 2 output"
if {[wait_for_file $file_out2] != 0} {
	fail "No output file"
}
set match 0
spawn $bin_cat $file_out2
expect {
	eof {
		wait
	}
}

if {$exit_code == 0} {
	exec $bin_rm -f $file_in1 $file_in2 $file_in3 $file_out1 $file_out2
	exec $bin_rm -f slurm_job_${job_id2}_resize.csh
	exec $bin_rm -f slurm_job_${job_id2}_resize.sh
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}