#!/usr/bin/env expect
############################################################################
# Purpose: Test of srun functionality
#          Test of hostfile option (-hostfile) inside of an allocation.
############################################################################
# Copyright (C) 2002-2006 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Danny Auble
# UCRL-CODE-217948.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

set exit_code   0
set file_in     "test$test_id.input"
set num_nodes   3
set max_tasks   5
set node_count  0
set job_id      0
set hostfile    "test$test_id.hostfile"

#
# Per-task bookkeeping.
#
# task_node(N) holds the node name that task N of the most recent srun
# reported through SLURMD_NODENAME. The value 0 means "no task with that
# id reported in"; the same sentinel is used in the expected layouts below.
#

# Forget everything recorded by the previous srun so stale names cannot
# satisfy a later comparison.
proc reset_task_nodes { } {
	global task_node max_tasks

	array unset task_node
	for {set task_id 0} {$task_id < $max_tasks} {incr task_id} {
		set task_node($task_id) 0
	}
}

# Remember which node a labelled output line ("<task>: <node>") came from.
proc record_task_node { task_label node_name } {
	global task_node

	# Parse the label as decimal so that a zero-padded label is still
	# mapped to the right task id.
	if { [scan $task_label %d task_id] == 1 } {
		set task_node($task_id) $node_name
	}
}

# Abort the test unless the previous srun reported a node for every task id
# in task_ids. The allocation is released first so that it does not linger
# until its time limit.
# NOTE(review): relies on fail (from ./globals) terminating the test.
proc require_task_nodes { task_ids } {
	global task_node job_id

	foreach task_id $task_ids {
		if { $task_node($task_id) == 0 } {
			cancel_job $job_id
			fail "node names not set from previous srun"
		}
	}
}

# Write one node name per line; srun reads this file via SLURM_HOSTFILE.
proc write_hostfile { path node_list } {
	set fd [open $path "w"]
	puts $fd [join $node_list "\n"]
	close $fd
}

# Compare the recorded placement against expected (element N is the node
# task N must have run on). Only the first mismatch is reported, and it
# marks the test as failed. method names the mechanism under test and is
# only used in the log message.
proc verify_task_layout { expected method } {
	global task_node exit_code

	set task_id 0
	foreach wanted $expected {
		if { [string compare $task_node($task_id) $wanted] } {
			log_error "Tasks not distributed by $method"
			log_debug "task $task_id ran on '$task_node($task_id)', expected '$wanted'"
			set exit_code 1
			return
		}
		incr task_id
	}
}

exec $bin_rm -f $hostfile

if { [test_front_end] } {
	skip "This test incompatible with front-end systems"
}

# find out if we have enough nodes to test functionality
set node_count [get_node_cnt_in_part]
if { $node_count < $num_nodes } {
	skip "Insufficient nodes in default partition ($node_count < $num_nodes)"
}

reset_task_nodes

#
# Build input script file
#
make_bash_script $file_in "
  export PS1=\"$prompt\"
  $bin_bash --norc
"

#
# Get an allocation running an interactive shell; every srun below is typed
# into that shell.
#
set timeout $max_job_delay
set salloc_pid [spawn $salloc -N$num_nodes -v -t2 ./$file_in]
expect {
	-re "salloc: Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re $prompt {
	}
	timeout {
		log_error "salloc not responding"
		slow_kill $salloc_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
exec $bin_rm -f $file_in
if { $job_id == 0 } {
	fail "salloc failure"
}

#
# Pass 1: task placement driven by SLURM_HOSTFILE.
#   step 0 - plain srun, only to learn the node names of the allocation
#   step 1 - hostfile lists the three nodes in a permuted order
#   step 2 - hostfile lists five entries, run with -n5
#   step 3 - same five entries, but only four tasks requested (-n4)
#
for {set i 0} {$i < 4} {incr i} {
	set extra    ""
	set expected {}

	if { $i == 1 } {
		require_task_nodes {0 1 2}
		send "export SLURM_HOSTFILE=$hostfile\r"
		expect {
			-re $prompt {
			}
		}
		set expected [list $task_node(2) $task_node(0) $task_node(1)]
		write_hostfile $hostfile $expected
	} elseif { $i == 2 } {
		require_task_nodes {0 1 2}
		set expected [list $task_node(1) $task_node(0) $task_node(0) \
				   $task_node(0) $task_node(0)]
		write_hostfile $hostfile $expected
		set extra "-n5"
	} elseif { $i == 3 } {
		require_task_nodes {0 1 2}
		set layout [list $task_node(1) $task_node(0) $task_node(0) \
				 $task_node(0) $task_node(0)]
		write_hostfile $hostfile $layout
		# Only four tasks are requested, so task 4 must stay unset.
		set expected [lreplace $layout 4 4 0]
		set extra "-n4"
	}

	reset_task_nodes

	#
	# execute srun with a specific node count
	#
	send "$srun -l $extra -O $bin_printenv SLURMD_NODENAME\r"
	expect {
		-re "($number): *($re_word_str)" {
			record_task_node $expect_out(1,string) $expect_out(2,string)
			exp_continue
		}
		-re $prompt {
		}
		-re "Granted job allocation ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "srun not responding"
			cancel_job $job_id
			set exit_code 1
		}
		eof {
		}
	}

	verify_task_layout $expected "hostfile"
}

send "unset SLURM_HOSTFILE\r"
expect {
	-re $prompt {
	}
}

#
# Pass 2: the same layouts requested on the command line with -w.
#   step 0 - plain srun, only to learn the node names of the allocation
#   step 1 - three nodes in a permuted order (-m arbitrary)
#   step 2 - five entries (-m arbitrary)
#   step 3 - a single node given with -w and no arbitrary distribution
#   step 4 - five entries with -n4; srun is expected to report an error
#
for {set i 0} {$i < 5} {incr i} {
	set extra       ""
	set expected    {}
	set single_node 0

	if { $i == 1 } {
		require_task_nodes {0 1 2}
		set expected [list $task_node(2) $task_node(0) $task_node(1)]
		set extra " -m arbitrary -w [join $expected ,]"
	} elseif { $i == 2 } {
		require_task_nodes {0 1 2}
		set expected [list $task_node(1) $task_node(0) $task_node(0) \
				   $task_node(0) $task_node(0)]
		set extra " -m arbitrary -w [join $expected ,]"
	} elseif { $i == 3 } {
		require_task_nodes {1}
		set single_node $task_node(1)
		set extra " -w $single_node"
	} elseif { $i == 4 } {
		require_task_nodes {0 1 2}
		set layout [list $task_node(1) $task_node(0) $task_node(0) \
				 $task_node(0) $task_node(0)]
		# No placement is verified for this step; only the error
		# handling below is exercised.
		set extra " -m arbitrary -w [join $layout ,] -n4"
	}

	reset_task_nodes

	#
	# execute srun with a specific node count
	#
	send "$srun -l $extra -O $bin_printenv SLURMD_NODENAME\r"
	expect {
		-re "($number): *($re_word_str)" {
			record_task_node $expect_out(1,string) $expect_out(2,string)
			exp_continue
		}
		-re "error" {
			if { $i == 4 } {
				log_debug "This error is expected, no worries"
			} else {
				log_error "srun reported an unexpected error (step $i, options '$extra')"
				set exit_code 1
			}
			exp_continue
		}
		-re $prompt {
		}
		-re "slurm job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "srun not responding"
			cancel_job $job_id
			set exit_code 1
		}
		eof {
		}
	}

	if { $i == 3 } {
		# The requested node is compared with the node of task 2.
		# NOTE(review): presumably the step still spans the whole
		# allocation and the requested node hosts its last task;
		# confirm against srun's -w semantics.
		if { [string compare $task_node(2) $single_node] } {
			log_error "We only asked for 1 node in the allocation and we were unable to get it"
			set exit_code 1
		}
	} else {
		verify_task_layout $expected "-w"
	}
}

#
# Leave the shell that salloc started, which ends the allocation.
#
send "exit\r"
expect {
	-re "error.*Exit 1" {
		log_debug "This error is expected, no worries"
		exp_continue
	}
	timeout {
		cancel_job $job_id
		kill_srun
		fail "srun not responding"
	}
	eof {
		wait
	}
}

# The hostfile is kept on failure so the failing layout can be inspected.
if { $exit_code == 0 } {
	exec $bin_rm -f $hostfile
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}