#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Validate scontrol update size of running job with some running
#          tasks.
############################################################################
# Copyright (C) 2010 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

set exit_code   0
set file_in     "test$test_id.bash"
set job_id      0

# Skip on configurations this test declares itself incompatible with.
if {[test_front_end] != 0} {
	skip "This test is incompatible with front-end systems"
} elseif {[test_killonbadexit]} {
	skip "This test is incompatible with KillOnBadExit=1"
} elseif {[string first "none" [get_affinity_types]] != -1} {
	skip "This test is incompatible with TaskPlugin=none"
}

# The job is launched across $node_cnt nodes; skip if fewer are available.
set node_cnt 2
set available [available_nodes]
if {$available < $node_cnt} {
	skip "Not enough nodes currently available ($available avail, $node_cnt needed)"
}

#
# Build the per-task script.
# Task 0 reports the job id and its node (HOST0), sleeps, runs squeue and then
# prints "terminated properly". Every other task reports its node (HOST1) and
# sleeps; it prints "terminated improperly" only if it survives its sleeps.
#
# NOTE: The second sleep command in task 1 is to deal with a possible race
# condition in which the first sleep command is killed and the shell starts the
# second command before the shell itself is killed
#
make_bash_script $file_in "
if \[ \$SLURM_PROCID = 0 \]; then
	$bin_echo JOBID=\$SLURM_JOB_ID
	$bin_echo \"Proc \$SLURM_PROCID on HOST0=\$SLURMD_NODENAME : I await the end of the other tasks\"
	$bin_sleep 35
	$squeue -o \"%i %10j %2t %N\"
	$bin_echo \"Proc \$SLURM_PROCID terminated properly\"
else
	$bin_sleep 1
	$bin_echo \"Proc \$SLURM_PROCID on HOST1=\$SLURMD_NODENAME : Waiting to get killed\"
	$bin_sleep 10
	$bin_sleep 1
	$bin_echo \"Proc \$SLURM_PROCID terminated improperly, not killed\"
	exit 0
fi
"

#
# Run job with one task continuing
#
set host ""
set host_cnt 0
set timeout $max_job_delay
set srun_pid [spawn $srun -N$node_cnt -t1 ./$file_in]
# Remember srun's spawn id: spawning scontrol below replaces $spawn_id.
set srun_id $spawn_id
expect {
	-re "JOBID=($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re "HOST($number)=($re_word_str)" {
		# Only the node reported as HOST0 (task 0) is kept.
		if {$expect_out(1,string) == 0} {
			set host $expect_out(2,string)
		}
		# Stop reading (leaving srun running) once every node reported.
		incr host_cnt
		if {$host_cnt < $node_cnt} {
			exp_continue
		}
	}
	timeout {
		log_error "srun not responding"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}
# Without task 0's node name the update below would be issued with an empty
# nodelist and the test would later fail with a misleading message, so report
# the real problem here instead.
if {$host eq ""} {
	fail "Unable to identify the node running task 0"
}

#
# Restrict the running job to the node hosting task 0
#
spawn $scontrol update jobid=$job_id nodelist=$host
set scontrol_id $spawn_id
expect {
	timeout {
		log_error "scontrol not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Resume reading srun output: exactly one "terminated properly" line is
# expected. Any "terminated improperly" line forces $matches far below 1 so
# the check after the loop fails even if task 0 also finished properly.
#
set matches 0
# Task 0 sleeps 35 seconds before finishing; presumably 40 allows for that.
set timeout 40
set spawn_id $srun_id
expect {
	-re "terminated properly" {
		incr matches
		exp_continue
	}
	-re "terminated improperly" {
		set matches -9999
		exp_continue
	}
	timeout {
		log_error "srun not responding"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$matches != 1} {
	fail "Task 0 did not run to completion after other tasks killed"
}

# The generated script is removed only when no error was recorded.
if {$exit_code == 0} {
	exec $bin_rm -f $file_in
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}