#!/usr/bin/env expect
############################################################################
# Purpose: Establish global state information for Slurm federation tests
#
# To define site-specific state information, set the values in a file
# named 'globals.local'. Those values will override any specified here.
# For example:
#
# $ cat globals.local
# set slurm_dir "/usr/local"
# set mpicc "/usr/local/bin/mpicc"
#
############################################################################
# Copyright (C) 2016 SchedMD LLC.
# Written by Brian Christiansen
#
# This file is part of SLURM, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the supplied file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

# Set if testing federations
# NOTE(review): cset comes from ./globals; presumably it only assigns when
# globals.local has not already provided a value -- confirm.
cset fed_slurm_base ""
cset fedc1 ""
cset fedc2 ""
cset fedc3 ""

# Line terminator appended to the expect patterns below.
set eol "\r\n"

#
# Check whether the federation test settings have been configured.
# RET: returns 0 when fed_slurm_base, fedc1, fedc2 and fedc3 are all
#      non-empty, 1 when any of them is an empty string.
#
proc test_federation_setup { } {
	global fed_slurm_base fedc1 fedc2 fedc3

	foreach value [list $fed_slurm_base $fedc1 $fedc2 $fedc3] {
		if {$value eq ""} {
			return 1
		}
	}
	return 0
}

#
# Create a federation and put clusters fedc1, fedc2 and fedc3 into it,
# clearing each cluster's features.
# IN: fed_name - name of the federation to create.
# RET: returns 0 on success, 1 on failure.
#
proc setup_federation { fed_name } {
	global sacctmgr fedc1 fedc2 fedc3 eol

	set rc 0
	set matches 0
	set my_pid [spawn $sacctmgr -i add federation $fed_name]
	expect {
		-re "Adding Federation\\(s\\)$eol" {
			incr matches
			exp_continue
		}
		-re "$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			log_error "sacctmgr add not responding"
			slow_kill $my_pid
			set rc 1
		}
		eof {
			wait
		}
	}
	# Do not touch the clusters when the federation could not be created,
	# whether because of a timeout or because of unexpected output.
	if {$rc} {
		return $rc
	}
	if {$matches != 2} {
		log_error "Failed to create federation"
		return 1
	}

	set count 0
	foreach cluster [list $fedc1 $fedc2 $fedc3] {
		incr count
		set matches 0
		set my_pid [spawn $sacctmgr -i mod cluster $cluster set federation=$fed_name features=]
		expect {
			-re "Setting$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+Feature\\s+=\\s+$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+Federation\\s+=\\s+$fed_name$eol" {
				incr matches
				exp_continue
			}
			-re "Modified cluster...$eol" {
				incr matches
				exp_continue
			}
			-re "^\\s+$cluster$eol" {
				incr matches
				exp_continue
			}
			timeout {
				log_error "sacctmgr add not responding"
				slow_kill $my_pid
				set rc 1
			}
			eof {
				wait
			}
		}
		# Stop at the first failure instead of modifying further clusters.
		if {$rc} {
			break
		}
		if {$matches != 5} {
			log_error "Failed to add $cluster to federation"
			set rc 1
			break
		}
		# NOTE(review): the pause after the second and third cluster
		# presumably lets the siblings sync up -- confirm.
		if {$count > 1} {
			sleep 5
		}
	}
	return $rc
}

#
# Check that a cluster answers "scontrol show config".
# IN: cname - name of the cluster; its scontrol binary is taken from
#             $fed_slurm_base/$cname/bin/scontrol.
# RET: returns 0 if the expected banner was seen, 1 otherwise.
#
proc test_cluster_up { cname } {
	global fed_slurm_base

	set rc 0
	set matches 0
	# Local variable: only the expect in this proc uses the short timeout.
	set timeout 2
	set my_scontrol "${fed_slurm_base}/$cname/bin/scontrol"

	# Keep the (long) configuration dump out of the test log.
	log_user 0
	set my_pid [spawn $my_scontrol show config]
	expect {
		"Configuration data as of" {
			incr matches
		}
		timeout {
			log_warn "$cname not responding"
			slow_kill $my_pid
			set rc 1
		}
		eof {
			wait
		}
	}
	if {!$rc && $matches != 1} {
		log_error "$cname not responding"
		set rc 1
	}
	log_user 1
	return $rc
}

#
# Check that all three federation clusters are up.
# Clusters are probed in order and probing stops at the first one down.
# RET: returns 0 if fedc1, fedc2 and fedc3 all respond, 1 otherwise.
#
proc test_all_up {} {
	global fedc1 fedc2 fedc3

	if {[test_cluster_up $fedc1] || [test_cluster_up $fedc2] || [test_cluster_up $fedc3]} {
		log_warn "This test can't be run if any clusters--$fedc1, $fedc2, or $fedc3--are down"
		return 1
	}
	return 0
}

#
# Delete federations with "sacctmgr -i delete federation".
# IN: names - federation name(s), passed to sacctmgr as a single argument.
# RET: returns 0 on success, non-zero on failure.
#
proc delete_federations { names } {
	global sacctmgr

	set matches 0
	set rc 0
	set object "federation"
	set my_pid [spawn $sacctmgr -i delete $object $names]
	expect {
		-re "privilege to perform this action" {
			log_error "Don't have privileges"
			incr rc
		}
		-re "(There was a problem|Unknown condition|Bad format on|Bad MaxWall|Unknown option)" {
			log_error "There was a problem with the sacctmgr command"
			incr rc
		}
		-re "Problem getting" {
			log_error "There was a problem getting information from the database"
			incr rc
		}
		-re "Problem adding" {
			log_error "There was an unknown problem"
			incr rc
		}
		-re "No associations" {
			log_error "Your command didn't return anything"
			incr rc
		}
		-re "Deleting $object" {
			incr matches
			exp_continue
		}
		-re " Nothing deleted" {
			incr matches
			exp_continue
		}
		timeout {
			log_error "sacctmgr delete not responding"
			slow_kill $my_pid
			incr rc
		}
		eof {
			wait
		}
	}
	# Exactly one of "Deleting federation" / "Nothing deleted" is expected.
	if {!$rc && $matches != 1} {
		log_error "sacctmgr had a problem deleting $object. Got $matches"
		incr rc
	}
	return $rc
}

#
# Collect information about the clusters of a federation.
# IN: fed_name - name of the federation to query.
# RET: flat name/value list (as produced by "array get") mapping each
#      cluster name to a dict with keys id, host, port, features and state.
#      Calls fail if sacctmgr times out or fewer than two of the expected
#      output lines (header and cluster rows) are matched.
#
proc get_clusterfed_info { fed_name } {
	global sacctmgr eol

	set matches 0
	array set clusters {}
	set my_pid [spawn $sacctmgr show cluster federation=$fed_name \
		    format="cluster%20,federation%20,id,controlhost,controlport,features,fedstate"]
	expect {
		-re "Cluster\\s+Federation\\s+ID\\s+ControlHost\\s+ControlPort\\s+Features\\s+FedState $eol" {
			incr matches
			exp_continue
		}
		-re "\\s+(\\S+)\\s+$fed_name\\s+(\\d+)\\s+(\\S+)\\s+(\\d+)\\s+(\\S*)\\s+(\\S*) $eol" {
			set clusters($expect_out(1,string)) [dict create \
				id       $expect_out(2,string) \
				host     $expect_out(3,string) \
				port     $expect_out(4,string) \
				features $expect_out(5,string) \
				state    $expect_out(6,string)]
			incr matches
			exp_continue
		}
		timeout {
			slow_kill $my_pid
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	if {$matches < 2} {
		fail "$matches didn't match enough clusters for $fed_name"
	}
	return [array get clusters]
}

#
# Add a single cluster to the given federation.
# IN: cname - name of cluster to add to federation.
# IN: fed_name - name of federation to add cluster to.
# RET: returns 0 on success, 1 on failure.
#
proc add_cluster_to_fed {cname fed_name} {
	global sacctmgr eol

	set rc 0
	set matches 0
	set my_pid [spawn $sacctmgr -i modify federation $fed_name set clusters+=$cname]
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "Cluster\\s+ \\+= $cname$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Modified federation...$eol" {
			incr matches
			exp_continue
		}
		-re "\\s+$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			log_error "sacctmgr add not responding"
			slow_kill $my_pid
			set rc 1
		}
		eof {
			wait
		}
	}
	if {$rc || $matches != 4} {
		log_error "$matches failed to add $cname to $fed_name"
		# Assign to rc itself ("set $rc 1" would create a variable named
		# "0" and leave rc reporting success).
		set rc 1
	}
	return $rc
}

#
# Remove a single cluster from the given federation.
# IN: cname - name of cluster to remove from the federation.
# IN: fed_name - name of federation to remove cluster from.
# RET: returns 0 on success, 1 on failure.
#
proc remove_cluster_from_fed {cname fed_name} {
	global sacctmgr eol

	set rc 0
	set matches 0
	set my_pid [spawn $sacctmgr -i modify federation $fed_name set clusters-=$cname]
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "Cluster\\s+ -= $cname$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Modified federation...$eol" {
			incr matches
			exp_continue
		}
		-re "\\s+$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			log_error "sacctmgr add not responding"
			slow_kill $my_pid
			set rc 1
		}
		eof {
			wait
		}
	}
	if {$rc || $matches != 4} {
		log_error "Failed to remove $cname from $fed_name"
		# Assign to rc itself ("set $rc 1" would create a variable named
		# "0" and leave rc reporting success).
		set rc 1
	}
	return $rc
}

#
# Change the flags of a federation.
# IN: fed_name - name of the federation to modify.
# IN: mode - operator placed between "flags" and the value on the sacctmgr
#            command line (presumably one of =, += or -=; verify against
#            callers). Its first character is escaped in the output pattern.
# IN: flags - flag value(s) to apply.
# RET: nothing; calls end_it 1 on a timeout or unexpected output.
#
proc modify_federation_flags {fed_name mode flags} {
	global sacctmgr eol

	set matches 0
	set my_pid [spawn $sacctmgr -i modify federation $fed_name set flags$mode$flags]
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Flags\\s+\\$mode\\s+$flags$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Modified federation...$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			log_error "sacctmgr add not responding"
			slow_kill $my_pid
			end_it 1
		}
		eof {
			wait
		}
	}
	if {$matches != 4} {
		log_error "$matches unexpected error"
		end_it 1
	}
}

################################################################
#
# NAME
#	wait_for_fed_job - waits for a job to reach the desired state
#
# SYNOPSIS
#	wait_for_fed_job job_id desired_state clusters
#
# DESCRIPTION
#	Wait for a previously submitted Slurm job to reach
#	the desired state. Polls every $testsuite_poll_interval seconds.
#
# ARGUMENTS
#	job_id
#		The Slurm job id of a job we want to wait for.
#	desired_state
#		The state you want the job to attain before
#		returning. Currently supports:
#			DONE any terminated state
#			PENDING job is pending
#			REVOKED job is revoked
#			RUNNING job is running
#			SUSPENDED job is suspended
#			SPECIAL_EXIT job is in special exit state
#	clusters
#		Comma-separated list of cluster names to check.
#		An empty string checks fedc1, fedc2 and fedc3.
#
# RETURN VALUE
#	The name of the first cluster on which the job is found
#	in the desired state. Empty string indicates a failure.
#
# NOTE
#	We sleep for two seconds before replying that a job is
#	done to give time for I/O completion (stdout/stderr files)
#
################################################################
proc wait_for_fed_job { job_id desired_state clusters } {
	global scontrol max_job_state_delay fedc1 fedc2 fedc3 testsuite_poll_interval

	# First verify that desired_state is supported
	switch -- $desired_state {
		"DONE" -
		"PENDING" -
		"REVOKED" -
		"RUNNING" -
		"SUSPENDED" -
		"SPECIAL_EXIT" {}
		default {
			log_error "wait_for_fed_job with invalid state: $desired_state"
			return ""
		}
	}

	if {$job_id == 0} {
		log_error "wait_for_fed_job with invalid job ID: $job_id"
		return ""
	}

	# Default to every federation cluster unless the caller named some.
	if {$clusters ne ""} {
		set spec_clusters [split $clusters ","]
	} else {
		set spec_clusters [list $fedc1 $fedc2 $fedc3]
	}

	set my_delay 0
	log_info "checking for job '$job_id' in state '$desired_state' on [join $spec_clusters ,]"
	while 1 {
		foreach cluster $spec_clusters {
			log_info "checking $cluster"

			# Only the first output line is read; -o makes scontrol
			# print the job record on a single line.
			set fd [open "|$scontrol -M$cluster --local -a -o show job $job_id"]
			gets $fd line
			# close raises an error if scontrol exited non-zero or
			# wrote to stderr; that is not fatal here.
			catch {close $fd}

			if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} {
				log_error "$desired_state not found on cluster $cluster"
				continue
			}

			switch -- $state {
				"NOT_FOUND" -
				"CANCELLED" -
				"DEADLINE" -
				"FAILED" -
				"TIMEOUT" -
				"NODE_FAIL" -
				"PREEMPTED" -
				"COMPLETED" {
					if {$desired_state eq "DONE"} {
						log_info "Job $job_id is DONE ($state) on $cluster"
						sleep 2
						return $cluster
					}
					# The job has terminated, so no other state can
					# be reached any more: report it and give up.
					log_warn "Job $job_id is $state but we wanted $desired_state"
					return ""
				}
				"PENDING" -
				"REVOKED" -
				"RUNNING" -
				"SPECIAL_EXIT" -
				"SUSPENDED" {
					if {$desired_state eq $state} {
						log_info "Job $job_id is $state on $cluster"
						return $cluster
					}
					log_warn "Job $job_id is in state $state, but desired state $desired_state"
				}
				default {
					log_info "Job $job_id is in state $state. Desired state was $desired_state"
				}
			}
		}

		if {$my_delay > $max_job_state_delay} {
			log_error "Timeout waiting for job state $desired_state"
			return ""
		}

		sleep $testsuite_poll_interval
		set my_delay [expr {$my_delay + $testsuite_poll_interval}]
	}
}