#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Submit job directly to slurmd without use of slurmctld scheduler.
#          (--no-allocate option). NOTE: Needs to run as SlurmUser or root.
############################################################################
# Copyright (C) 2002-2007 The Regents of the University of California.
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Number of race-condition cycles run by the loop at the end of the test.
set iterations 100
# Set to 1 by any non-fatal error; checked at the very end of the test.
set exit_code  0

#
# Preconditions: the test is skipped unless run as a super user, on a
# non-front-end system, with switch type "none".
#
if {[is_super_user] == 0} {
	skip "This test can't be run except as SlurmUser"
}

if {[test_front_end]} {
	skip "This test is incompatible with front-end systems"
}

set switch [switch_type]
# string compare returns non-zero when the strings differ
if {[string compare $switch "none"]} {
	skip "This test is incompatible with switch/$switch"
}

#
# Submit a 1 node job and record the node name
#
# NOTE: Check explicitly for "^0:" or "\n0:". Otherwise in srun verbose mode
# we can get a hostname ending with 0 in the messages that gets used to
# generate a bad value for host_0 below.
#
set host_0        ""
set nodelist_name ""
set timeout $max_job_delay
set srun_pid [spawn $srun -v -N1 -l -t1 $bin_printenv SLURMD_NODENAME]
expect {
	-re "on host ($re_word_str)," {
		# Node name as reported in srun's verbose messages
		set nodelist_name $expect_out(1,string)
		exp_continue
	}
	-re "^0: ($re_word_str)" {
		# Task 0 output (-l label) at the start of the buffer
		set host_0 $expect_out(1,string)
		exp_continue
	}
	-re "\n0: ($re_word_str)" {
		# Task 0 output (-l label) at the start of a later line
		set host_0 $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "srun not responding"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Verify that both forms of the node name were captured
#
if {[string compare $host_0 ""] == 0} {
	fail "Did not get hostname of task 0"
}
if {[string compare $nodelist_name ""] == 0} {
	fail "Did not get nodelist_name of task 0"
}
if {[string compare $host_0 $nodelist_name] != 0} {
	log_warn "Hostname inconsistency"
}
set include_node $host_0

#
# Submit a job directly to that node
#
set host_1     ""
# Cleared when the credential is rejected or srun stops responding
set slurm_user 1
set timeout    10
set srun_pid [spawn $srun -N1 -l --nodelist=$include_node --no-allocate -t1 $bin_printenv SLURMD_NODENAME]
expect {
	-re "Invalid job credential" {
		log_warn "Not SlurmUser or root"
		set slurm_user 0
		exp_continue
	}
	-re "error: .*try again" {
		# Tolerated error: treat as if the expected host was reported
		log_debug "Can't avoid this possible error"
		set host_1 $host_0
		exp_continue
	}
	-re "error: .*already in shared memory" {
		# Tolerated error: treat as if the expected host was reported
		log_debug "Can't avoid this possible error"
		set host_1 $host_0
		exp_continue
	}
	-re "error: .*exit code 1" {
		exp_continue
	}
	-re "0: ($re_word_str)" {
		set host_1 $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "srun not responding."
		slow_kill $srun_pid
		set slurm_user 0
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# slurm_user is cleared both when the job credential is rejected (the test
# cannot proceed, which is not an error) and when srun timed out (which also
# recorded exit_code 1). Report a recorded error as a failure instead of
# ending the test successfully.
# NOTE(review): pass and fail are defined in ./globals and are presumed to
# end the test immediately - confirm.
#
if {$slurm_user == 0} {
	if {$exit_code != 0} {
		fail "Test failed due to previous errors (\$exit_code = $exit_code)"
	}
	pass
}

if {[string compare $host_1 $include_node]} {
	log_error "Allocation lacked an included node"
	set exit_code 1
}

#
# Run three tasks at a time on some node and do so repeatedly
# This checks for slurmd race conditions
# The sleep between cycles is to make sure the job step completion
# logic has time to be processed (slurmd -> slurmctld messages)
# Note: process output in order of expected completion
#
set successes 0
for {set inx 0} {$inx < $iterations} {incr inx} {
	exec $bin_sleep 0.25
	set failures 0

	# One regular (allocated) job plus two --no-allocate (-Z) jobs, all
	# started before any of their output is processed.
	set srun_pid  [spawn $srun -N1 --nodelist=$nodelist_name -t1 -l $bin_printenv SLURMD_NODENAME]
	set alloc     $spawn_id

	set srun_pid1 [spawn $srun -N1 --nodelist=$include_node -t1 -Z $bin_sleep 0.5]
	set noalloc1  $spawn_id

	set srun_pid2 [spawn $srun -N1 --nodelist=$include_node -t1 -Z $bin_sleep 0.25]
	set noalloc2  $spawn_id

	# The 0.25 second sleep is expected to complete first.
	# spawn_id is reset before each expect so that the bare "wait" in the
	# eof branch reaps the matching process.
	set timeout 20
	set spawn_id $noalloc2
	expect {
		-i $noalloc2
		-re "error:.*configuring interconnect" {
			log_debug "Can't avoid this possible error"
			exp_continue
		}
		-re "error:" {
			log_error "Some error happened"
			set failures 1
			exp_continue
		}
		timeout {
			log_error "srun not responding"
			slow_kill $srun_pid2
			set failures 1
		}
		eof {
			wait
		}
	}

	# Then the 0.5 second sleep
	set spawn_id $noalloc1
	expect {
		-i $noalloc1
		-re "error:.*configuring interconnect" {
			log_debug "Can't avoid this possible error"
			exp_continue
		}
		-re "error:" {
			log_error "Some error happened"
			set failures 1
			exp_continue
		}
		timeout {
			log_error "srun not responding"
			slow_kill $srun_pid1
			set failures 1
		}
		eof {
			wait
		}
	}

	# Finally the allocated job, which gets the longer job-delay timeout
	set timeout $max_job_delay
	set spawn_id $alloc
	expect {
		-i $alloc
		-re "Invalid node name specified" {
			log_error "Some error happened"
			set failures 1
			exp_continue
		}
		-re "error:.*configuring interconnect" {
			log_debug "Can't avoid this possible error"
			exp_continue
		}
		-re "error:" {
			log_error "Some error happened"
			set failures 1
			exp_continue
		}
		-re "0: ($re_word_str)" {
			set host_0 $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "srun not responding"
			slow_kill $srun_pid
			set failures 1
		}
		eof {
			wait
		}
	}

	if {$failures == 0} {
		incr successes
	} else {
		set exit_code 1
	}
}

if {$successes != $iterations} {
	fail "Only $successes of $iterations completed successfully"
}

if {$exit_code != 0} {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}