#!/usr/bin/env expect ############################################################################ # Purpose: Establish global state information for Slurm test suite # # To define site-specific state information, set the values in a file # named 'globals.local'. Those values will override any specified here. # for example: # # $ cat globals.local # set slurm_dir "/usr/local" # set build_dir "/home/mine/SLURM/build_smd" # set src_dir "/home/mine/SLURM/slurm.git" # set mpicc "/usr/local/bin/mpicc" # # If you want to have more than one test going at the same time for multiple # installs you can have multiple globals.local files and set the # SLURM_LOCAL_GLOBALS_FILE env var, and have that set to the correct # globals.local file for your various installs. The file can be named anything, # not just globals.local. # ############################################################################ # Copyright (C) 2002-2007 The Regents of the University of California. # Copyright (C) 2008-2010 Lawrence Livermore National Security. # Portions Copyright (C) 2010-2018 SchedMD LLC. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Morris Jette # Additions by Joseph Donaghy # CODE-OCEC-09-009. All rights reserved. # # This file is part of Slurm, a resource management program. # For details, see . # Please also read the supplied file: DISCLAIMER. # # Slurm is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free # Software Foundation; either version 2 of the License, or (at your option) # any later version. # # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more # details. 
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################

# Avoid sourcing this file multiple times.
# The guard tests whether a Tcl proc named "exit" exists; presumably the
# private include sourced below defines one, so a second pass through this
# file returns immediately -- TODO confirm against globals_private.
if {[info procs exit] eq "exit"} {
	return
}

#
# Include private functions and variables
#
source ./globals_private

# Link the Slurm command path variables to the global scope, in case this
# file is sourced from inside a proc.
global sacctmgr sacct salloc sattach sbatch sbcast scancel scontrol sinfo
global smd squeue sreport srun sstat strigger

################################################################
#
# NAME
#	cset - conditional set
#
# SYNOPSIS
#	cset name value
#
# DESCRIPTION
#	Conditional set. Only set variable if variable does not yet exist.
#	The existence check and the assignment both happen in the caller's
#	scope, so cset works for globals at file level and for locals inside
#	a proc.
#
#	Input: name  -- name of the variable to set
#	       value -- value to set to 'name'
#
################################################################
proc cset {name value} {
	# Look the variable up in the caller's frame, not in this proc's frame
	if {![uplevel 1 info exists $name]} {
		upvar $name tmp
		set tmp $value
	}
}

#
# Defining colors here to be able to use them in globals.local.
# By default, these colors are bold
#
set COLOR_RED "\033\[1;31m"
set COLOR_RED_NORMAL "\033\[31m"
set COLOR_ORANGE "\033\[1;38;5;208m"
set COLOR_YELLOW "\033\[1;33m"
set COLOR_GREEN "\033\[1;32m"
set COLOR_BLUE "\033\[1;34m"
set COLOR_MAGENTA "\033\[1;35m"
set COLOR_CYAN "\033\[1;36m"
set COLOR_NONE "\033\[0m"

# Default location of the site-specific overrides file
cset local_globals_file "./globals.local"

# Log level "enum"
# Define log levels here so they are available in globals.local
set LOG_LEVEL_QUIET 0
set LOG_LEVEL_FATAL 1
set LOG_LEVEL_ERROR 2
set LOG_LEVEL_WARNING 3
set LOG_LEVEL_INFO 4
set LOG_LEVEL_DEBUG 5
set LOG_LEVEL_TRACE 6

# The SLURM_LOCAL_GLOBALS_FILE environment variable takes precedence over
# any previously chosen overrides file.
if {[info exists env(SLURM_LOCAL_GLOBALS_FILE)]} {
	set local_globals_file $env(SLURM_LOCAL_GLOBALS_FILE)
}

# Source the overrides first: every default further down is assigned with
# cset and therefore never replaces a value set by the overrides file.
if [file exists $local_globals_file] {
	source $local_globals_file
}

#
# Specify the slurm install directory.
# Used to locate binaries, libraries, and header files.
#
cset slurm_dir "/usr"
cset build_dir "../../"
cset src_dir "../../"
cset config_h "${build_dir}/config.h"

# Slurm user commands and daemons, all derived from slurm_dir
cset sacctmgr "${slurm_dir}/bin/sacctmgr"
cset sacct "${slurm_dir}/bin/sacct"
cset salloc "${slurm_dir}/bin/salloc"
cset sattach "${slurm_dir}/bin/sattach"
cset sbatch "${slurm_dir}/bin/sbatch"
cset sbcast "${slurm_dir}/bin/sbcast"
cset scancel "${slurm_dir}/bin/scancel"
cset scontrol "${slurm_dir}/bin/scontrol"
cset sdiag "${slurm_dir}/bin/sdiag"
cset sgather "${slurm_dir}/bin/sgather"
cset sh5util "${slurm_dir}/bin/sh5util"
cset sinfo "${slurm_dir}/bin/sinfo"
cset smd "${slurm_dir}/bin/smd"
cset sprio "${slurm_dir}/bin/sprio"
cset squeue "${slurm_dir}/bin/squeue"
cset srun "${slurm_dir}/bin/srun"
cset sreport "${slurm_dir}/bin/sreport"
cset sshare "${slurm_dir}/bin/sshare"
cset sstat "${slurm_dir}/bin/sstat"
cset strigger "${slurm_dir}/bin/strigger"

cset slurmd "${slurm_dir}/sbin/slurmd"

# Command names from other batch systems, also expected under slurm_dir
cset pbsnodes "${slurm_dir}/bin/pbsnodes"
cset qdel "${slurm_dir}/bin/qdel"
cset qstat "${slurm_dir}/bin/qstat"
cset qsub "${slurm_dir}/bin/qsub"
cset qalter "${slurm_dir}/bin/qalter"
cset qrerun "${slurm_dir}/bin/qrerun"

cset lsid "${slurm_dir}/bin/lsid"
cset bjobs "${slurm_dir}/bin/bjobs"
cset bkill "${slurm_dir}/bin/bkill"
cset bsub "${slurm_dir}/bin/bsub"

# If using MPICH-2 or other version of MPI requiring pmi library, use this
#cset mpicc "/home/jette/mpich2-install/bin/mpicc"
#cset use_pmi 1
# OR for other versions of MPICH, use this
cset mpicc "/usr/local/bin/mpicc"
cset nvcc "/usr/bin/nvcc"
cset use_pmi 0
#cset upcc "/usr/local/bin/upcc"
cset upcc "/usr/bin/xlupc"
cset oshcc "/usr/local/bin/oshcc"
cset mpirun "mpirun"
cset totalviewcli "/usr/local/bin/totalviewcli"

# Set if using "--enable-memory-leak-debug" configuration option
cset enable_memory_leak_debug 0

# Pattern to match your shell prompt
#cset prompt {(%|#|\$|\]) *$}
cset prompt "(%|#|\\\$|]|\[^>]>) *(|\[^ ]* *)$"

#
# Specify locations of other executable files used
# Only the shell names (e.g. bin_bash) must be full pathnames
#
cset bin_awk "awk"
# Resolved once when this file is sourced; the last line of the "which"
# output is taken as the path
cset bin_bash [exec which bash | tail -n 1]
cset bin_cat "cat"
cset bin_cc "gcc"
cset bin_chmod "chmod"
cset bin_cmp "cmp"
cset bin_cp "cp"
cset bin_date "date"
cset bin_diff "diff"
cset bin_echo "echo"
cset bin_env "env"
cset bin_file "file"
cset bin_id "id"
cset bin_grep "grep"
cset bin_head "head"
cset bin_ln "ln"
cset bin_perldoc "/usr/bin/perldoc"
# Don't use $bin_hostname unless on a front-end system that
# doesn't fully use the slurmd, use $bin_printenv SLURMD_NODENAME
cset bin_hostname "hostname"
cset bin_kill "kill"
cset bin_make "make"
cset bin_mv "mv"
cset bin_od "od"
cset bin_pkill "pkill"
cset bin_printenv "printenv"
cset bin_ps "ps"
cset bin_pwd "pwd"
cset bin_rm "rm"
cset bin_sed "sed"
cset bin_sleep "sleep"
cset bin_sort "sort"
cset bin_sum "sum"
cset bin_touch "touch"
cset bin_uname "uname"
cset bin_uniq "uniq"
cset bin_wc "wc"

#
# Let the commands complete without expect timing out waiting for a
# response. Single node jobs submitted to the default partition should
# be initiated within this number of seconds.
# for interactive slurm jobs: cset timeout $max_job_delay
#
cset max_job_delay 120

#
# Files must be propagated between nodes within this number of seconds.
# The delay may be due to NFS.
#
cset max_file_delay 90

#
# Desired job state must be reached within this number of seconds.
#
cset max_job_state_delay 360

#
# Max number of iterations that wait_for_all_jobs can use
#
cset wait_for_all_jobs_iterations 600

#
# Specify the maximum number of tasks to use in the stress tests.
#
cset max_stress_tasks 4

#
# The error message that the "sleep" command prints when we run "sleep aaa".
#
cset sleep_error_message "(invalid time interval)|(bad character in argument)|(usage: sleep seconds)"

# Force LANG, as the expect tests aren't localized
set ::env(LANG) "en_US.UTF-8"

# Testsuite level variables
# NOTE(review): "[$bin_pwd]" invokes $bin_pwd as a Tcl command, not via exec.
# With the default value "pwd" this is the Tcl built-in pwd; an overridden
# value such as an absolute path would presumably not resolve as a Tcl
# command in a script -- confirm before overriding bin_pwd.
cset testsuite_shared_dir "[$bin_pwd]"

#
# The poll interval (how many seconds to sleep between polls in functions like
# wait_for_file and wait_for_job)
#
cset testsuite_poll_interval 1

# Testsuite log variables
cset testsuite_log_level $LOG_LEVEL_DEBUG
cset testsuite_log_format "\[%{timestamp}s.%{msecs}03d] \(%{filename}s:%{lineno}d) %{loglevel}-7s %{message}s"
cset testsuite_time_format "%Y-%m-%d %H:%M:%S"

# Default to using color if writing to a terminal and not if writing to a file
# (the check is whether the stdout channel reports a -mode option)
cset testsuite_colorize [dict exists [fconfigure stdout] -mode]
cset testsuite_color_fatal $COLOR_RED
cset testsuite_color_error $COLOR_RED_NORMAL
cset testsuite_color_warn $COLOR_ORANGE
cset testsuite_color_info $COLOR_YELLOW
cset testsuite_color_debug $COLOR_BLUE
cset testsuite_color_trace $COLOR_MAGENTA
cset testsuite_color_header $COLOR_NONE
cset testsuite_color_success $COLOR_GREEN
cset testsuite_color_failure $COLOR_RED
cset testsuite_color_skipped $COLOR_ORANGE

# To automatically call cleanup or not when ending the test.
# The environment variable, when present, overrides both this default and
# any value assigned earlier (it uses set, not cset).
cset testsuite_cleanup_on_failure false
if {[info exists env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)]} {
	set testsuite_cleanup_on_failure $env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)
}

# Other common variables
# These are regular expression fragments; they are assigned with set, so
# they are not overridable from the local globals file.
set re_word_str "\\S+"
set digit "\\d"
set end_of_line "\[\r\n\]"
set float "\\d+\\.?\\d*"
set number "\\d+"
set format_time "\\d+\\:\\d+\\:\\d+"
set number_with_suffix "\\d+\[KM\]*"
set slash "/"
set whitespace "\\s+"
set controlmachine_regex "\\S+"
# Any characters except ( , : newline
set no_delim "\[^(,:\r\n\]"
# Same as no_delim, additionally excluding the slash
set no_delim_slash "\[^(,:/\r\n\]"
# The first group matches GRES name
# The second **optional** group matches GRES type.
# The third group matches GRES count.
# Test out the regex here: https://regex101.com/r/FlNYKM/7
set gres_regex "($no_delim_slash*):($no_delim*)?:?($no_delim*)"

#
# Cache SlurmUser to check for SuperUser requests
#
cset super_user 0
cset super_user_set 0

#
# Global variable used in multiple functions in "globals" file
#
set gpu_sock_list {}

################################################################
#
# NAME
#	get_test_name - gets the name of the invoking source script
#
# SYNOPSIS
#	get_test_name
#
# DESCRIPTION
#	Walks the Tcl frame stack from the outermost frame inwards and
#	reports the file belonging to the first frame of type "source".
#
# RETURN VALUE
#	The file name (without directory) of the originally called script,
#	or "unknown" if no frame of type "source" is found
#
################################################################
proc get_test_name { } {
	for {set level 1} {$level <= [info frame]} {incr level} {
		set frame [info frame $level]
		if {[dict get $frame type] eq "source"} {
			return [file tail [dict get $frame file]]
		}
	}
	return unknown
}

#
# Name of the originally invoked test script, e.g. test1.1
#
set test_name [get_test_name]

#
# Suffix of the test script, e.g. 1.1
#
set test_id [string map {test ""} $test_name]

################################################################
#
# NAME
#	fail - fails a test
#
# SYNOPSIS
#	fail message
#
# DESCRIPTION
#	To be used when an error is fatal for the test. This routine
#	prints the specified error message, optionally cleans up, prints
#	a final test failure message, and exits the test with exit code 1.
#
# ENVIRONMENT
#	Whether or not the cleanup procedure is called depends on the setting
#	of the $testsuite_cleanup_on_failure set in the globals.local file or
#	overridden with the SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment
#	variable.
#
# NOTE
#	DO NOT call this within your local cleanup procedure.
#
################################################################
proc fail { message } {
	# Refuse to run when the direct caller is the test's cleanup proc,
	# otherwise cleanup would be re-entered by _test_fini
	if {[info level] > 1} {
		set caller [lindex [info level -1] 0]
		if {$caller eq "cleanup"} {
			log_error "Local cleanup shouldn't call pass, fail or skip"
			return
		}
	}

	log_fatal $message

	# Cleanup and the final failure message are handled by _test_fini
	_test_fini 1
}

################################################################
#
# NAME
#	skip - skips a test
#
# SYNOPSIS
#	skip message
#
# DESCRIPTION
#	To be used when a precondition for the test fails and the test
#	should be skipped. This routine prints the specified warning message,
#	calls the cleanup procedure if defined, prints a final test skipped
#	message, and exits the test with exit code -1 (aka 255).
#
# NOTE
#	DO NOT call this within your local cleanup procedure.
#
################################################################
proc skip { message } {
	# Refuse to run when the direct caller is the test's cleanup proc
	if {[info level] > 1} {
		set caller [lindex [info level -1] 0]
		if {$caller eq "cleanup"} {
			log_error "Local cleanup shouldn't call pass, fail or skip"
			return
		}
	}

	log_warn $message

	# Cleanup and the final skipped message are handled by _test_fini
	_test_fini -1
}

################################################################
#
# NAME
#	pass - passes a test
#
# SYNOPSIS
#	pass
#
# DESCRIPTION
#	To be used when a test passes and should complete with success.
#	This routine calls the cleanup procedure if defined, prints a final
#	test success message, and exits with exit code 0.
#
# NOTE
#	DO NOT call this within your local cleanup procedure.
#
################################################################
proc pass { } {
	# Refuse to run when the direct caller is the test's cleanup proc
	if {[info level] > 1} {
		set caller [lindex [info level -1] 0]
		if {$caller eq "cleanup"} {
			log_error "Local cleanup shouldn't call pass, fail or skip"
			return
		}
	}

	# Cleanup and the final success message are handled by _test_fini
	_test_fini 0
}

################################################################
#
# NAME
#	fail_on_error - fails if exit_code is non-zero
#
# SYNOPSIS
#	fail_on_error message
#
# DESCRIPTION
#	Reads the exit_code global variable and, when it is non-zero,
#	calls fail with the given message. Does nothing otherwise.
#
# NOTE
#	DO NOT call this within your local cleanup procedure.
#
################################################################
proc fail_on_error { message } {
	global exit_code

	if {$exit_code == 0} {
		return
	}
	fail "$message"
}

################################################################
#
# NAME
#	print_time - prints the current date and time
#
# SYNOPSIS
#	print_time
#
# DESCRIPTION
#	Spawns the date command and waits for it to finish.
#
################################################################
proc print_time { } {
	global bin_date

	spawn $bin_date
	expect {
		eof {
			wait
		}
	}
	return
}

################################################################
#
# NAME
#	dict_getdef - 'dict get' with ability to specify the default value
#
# SYNOPSIS
#	dict_getdef dictionary_value key default_value
#
# DESCRIPTION
#	Tcl < 8.7 lacks a built in 'dict get' with ability to specify the
#	default value. Tcl 8.7 adds a dict getdef.
#	All arguments after the dictionary except the last one form the
#	(possibly nested) key path; the last argument is the default.
#	This proc returns the value from the dictionary corresponding to the
#	keys if it exists, or the default value otherwise.
#
# EXAMPLE
#	dict_getdef $option_dict action "warn"
#
# SOURCE
#	https://core.tcl-lang.org/tips/doc/trunk/tip/342.md
#	https://core.tcl-lang.org/tcl/tktview/2370575
#
################################################################
proc dict_getdef {D args} {
	set key_path [lrange $args 0 end-1]
	if {![dict exists $D {*}$key_path]} {
		return [lindex $args end]
	}
	return [dict get $D {*}$key_path]
}

################################################################
#
# NAME
#	run_command - executes a command and returns a dictionary result
#
# SYNOPSIS
#	run_command ?options? command
#
# DESCRIPTION
#	Executes a command and returns a dictionary that includes the output,
#	exit code, etc.
#	An action can be taken (fail, warn, none) if the
#	command's exit code or timeout is unexpected. By default, the action
#	will be applied if the command fails. If the -xfail option is
#	specified, the behavior will be reversed to apply the action if the
#	command ran successfully.
#
#	The command string is run through $bin_bash -c. Output is not echoed
#	while the command runs (log_user is switched off and restored
#	afterwards); it is collected and logged once the command ends.
#
# OPTIONS
#	-fail
#		If the exit code or timeout is unexpected, the action that will
#		be taken is to fail the test
#	-warn
#		If the exit code or timeout is unexpected, the action that will
#		be taken is to log a warning (this is the default)
#	-none
#		If the exit code or timeout is unexpected, no action will be
#		taken
#	-xfail
#		If the command exits with zero and does not time out, the
#		action will be applied. Without this option, the action will
#		be applied if the command exits with a non-zero exit code or
#		times out.
#	-timeout <float>
#		Time in seconds to wait for the command to complete before
#		timing out (default is 60.0)
#	-nolog
#		Logging for this command will occur at trace threshold only
#
# ARGUMENTS
#	command
#		a string containing the command and arguments to execute
#
# RETURN VALUE
#	A dictionary containing the following elements:
#		command    - The command that was invoked
#		exit_code  - Exit code
#		output     - The combined standard output and standard error
#		start_time - The time (epoch with milliseconds) the command was
#		             executed
#		duration   - The duration (seconds and milliseconds) the
#		             command took to run
#
################################################################
proc run_command args {
	global bin_bash

	# Defaults, possibly overridden by the options parsed below.
	# "timeout" must keep this name: expect reads it from this scope.
	set action "warn"
	set timeout 60
	set expect_failure false
	set log_at_trace_level false
	set exit_status 0
	set output ""

	# Consume leading options until the first non-option word
	while {[llength $args]} {
		set opt [lindex $args 0]
		switch -glob -- $opt {
			-fail    {set action "fail"; set args [lrange $args 1 end]}
			-none    {set action "none"; set args [lrange $args 1 end]}
			-timeout {set args [lassign $args - timeout]}
			-warn    {set action "warn"; set args [lrange $args 1 end]}
			-xfail   {set expect_failure true; set args [lrange $args 1 end]}
			-nolog   {set log_at_trace_level true; set args [lrange $args 1 end]}
			-*       {fail "Unknown option: $opt"}
			default  break
		}
	}

	# Exactly one word, the command string, must be left
	if {[llength $args] != 1} {
		fail "Invalid number of arguments [llength $args]: $args"
	} else {
		lassign $args command
	}

	# Route this invocation's messages to the requested log level
	set log_proc [expr {$log_at_trace_level ? "log_trace" : "log_debug"}]
	interp alias {} log_alias {} $log_proc

	# Silence expect's echo while the command runs; restored further down
	set orig_log_user [log_user -info]
	log_user 0

	log_alias "Invoking command \"$command\""

	set start_time [format "%.3f" [expr {[clock milliseconds] / 1000.000}]]
	set stty_init raw ; # Prevent the terminal from inserting \r
	set expect_pid [spawn -noecho $bin_bash -c "$command"]
	expect {
		-re "(.+)" {
			append output $expect_out(1,string)
			exp_continue
		}
		timeout {
			set message "Command \"$command\" timed out after $timeout seconds"
			slow_kill $expect_pid
			set exit_status 1
		}
		eof {
			lassign [wait] pid spawnid os_error_flag errno
			set exit_status [expr {$errno > 128 ? $errno - 256 : $errno}]
		}
	}
	set end_time [format "%.3f" [expr {[clock milliseconds] / 1000.000}]]
	set duration [format "%.3f" [expr {$end_time - $start_time}]]

	log_alias "Command Results:"
	log_alias " Start Time: $start_time"
	log_alias " Duration: $duration"
	log_alias " Exit Code: $exit_status"
	log_alias " Output: $output"

	# Apply the action when the outcome is the opposite of what was
	# expected. cset leaves a timeout message set above untouched.
	set succeeded [expr {$exit_status == 0}]
	if {!$expect_failure && !$succeeded} {
		cset message "Command \"$command\" failed with rc=$exit_status"
		if {$output != ""} {
			append message ": [string trimright $output]"
		}
		switch -- $action {
			warn {log_warn $message}
			fail {fail $message}
		}
	} elseif {$expect_failure && $succeeded} {
		cset message "Command \"$command\" was expected to fail but succeeeded"
		switch -- $action {
			warn {log_warn $message}
			fail {fail $message}
		}
	}

	log_user $orig_log_user

	return [dict create \
		command $command \
		exit_code $exit_status \
		output $output \
		start_time $start_time \
		duration $duration]
}

################################################################
#
# NAME
#	run_command_output - executes a command and returns the output
#
# SYNOPSIS
#	run_command_output ?options? command
#
# DESCRIPTION
#	Executes a command through run_command and returns only the output
#	element of its result. An action can be taken (fail, warn, none) if the
#	command's exit code or timeout is unexpected. By default, the action
#	will be applied if the command fails. If the -xfail option is
#	specified, the behavior will be reversed to apply the action if the
#	command ran successfully.
#
# OPTIONS
#	-fail
#		if the exit code or timeout is unexpected, the action that will
#		be taken is to fail the test
#	-warn
#		if the exit code or timeout is unexpected, the action that will
#		be taken is to log a warning (this is the default)
#	-none
#		if the exit code or timeout is unexpected, no action will be
#		taken
#	-xfail
#		if the command exits with zero and does not time out, the
#		action will be applied. Without this option, the action will
#		be applied if the command exits with a non-zero exit code or
#		times out.
#	-timeout <float>
#		time in seconds to wait for the command to complete before
#		timing out (default is 60.0)
#
# ARGUMENTS
#	command
#		a string containing the command and arguments to execute
#
# RETURN VALUE
#	A string containing the combined standard output and standard error,
#	or the empty string if the result carries no output element
#
################################################################
proc run_command_output args {
	global exit_code

	# All options and the command are handed to run_command untouched
	set result [run_command {*}$args]

	if {![dict exists $result output]} {
		return ""
	}
	return [dict get $result output]
}

################################################################
#
# NAME
#	run_command_status - executes a command and returns the exit code
#
# SYNOPSIS
#	run_command_status ?options? command
#
# DESCRIPTION
#	Executes a command and returns a dictionary that includes the output,
#	exit code, etc. An action can be taken (fail, warn, none) if the
#	command's exit code or timeout is unexpected. By default, the action
#	will be applied if the command fails. If the -xfail option is
#	specified, the behavior will be reversed to apply the action if the
#	command ran successfully.
#
# OPTIONS
#	-fail
#		if the exit code or timeout is unexpected, the action that will
#		be taken is to fail the test
#	-warn
#		if the exit code or timeout is unexpected, the action that will
#		be taken is to log a warning (this is the default)
#	-none
#		if the exit code or timeout is unexpected, no action will be
#		taken
#	-xfail
#		if the command exits with zero and does not time out, the
#		action will be applied. Without this option, the action will
#		be applied if the command exits with a non-zero exit code or
#		times out.
#	-timeout <float>
#		time in seconds to wait for the command to complete before
#		timing out (default is 60.0)
#
# ARGUMENTS
#	command
#		a string containing the command and arguments to execute
#
# RETURN VALUE
#	The exit code for the invoked command
#
################################################################
proc run_command_status args {
	global exit_code

	set result [run_command {*}$args]
	return [dict get $result exit_code]
}

################################################################
#
# NAME
#	cancel_job - cancels the specified job
#
# SYNOPSIS
#	cancel_job job_id ?het_job?
#
# DESCRIPTION
#	Runs scancel -Q on the job, sleeps one second and then waits for
#	the job to reach the DONE state. A failing scancel (non-zero exit
#	or output on stderr) is logged at debug level and otherwise
#	ignored; the result of the wait decides the return value.
#
# ARGUMENTS
#	job_id
#		The Slurm job id of a job we want to cancel
#	het_job
#		For a het job confirm each component job has completed
#
# RETURN VALUE
#	A non-zero return code indicates a failure.
#	Returns 1 without doing anything when job_id is 0.
#
################################################################
proc cancel_job { job_id {het_job 0}} {
	global scancel bin_sleep

	if {$job_id == 0} {
		return 1
	}

	log_debug "Cancelling $job_id"
	# The script must be braced: with brackets, exec runs before catch
	# is invoked, so an scancel error escapes the catch and scancel's
	# output would be evaluated as a Tcl script.
	if {[catch {exec $scancel -Q $job_id} result]} {
		log_debug "scancel of job $job_id reported: $result"
	}
	exec $bin_sleep 1
	return [wait_for_job $job_id "DONE" $het_job]
}

################################################################
#
# NAME
#	get_line_cnt - returns the size of the specified file
#
# SYNOPSIS
#	get_line_cnt file_name
#
# DESCRIPTION
#	Spawns wc -l on the file and parses the count from its output.
#
# RETURN VALUE
#	Number of lines in the specified file, or 0 if no count could be
#	parsed from the wc output.
#
################################################################
proc get_line_cnt { file_name } {
	global bin_wc number

	set lines 0
	spawn $bin_wc -l $file_name
	expect {
		-re "($number) " {
			set lines $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	return $lines
}

################################################################
#
# NAME
#	slow_kill - kills a process slowly
#
# SYNOPSIS
#	slow_kill pid
#
# DESCRIPTION
#	Kill a process slowly, first trying SIGINT, pausing for
#	a second, then sending SIGKILL.
#
# RETURN VALUE
#	A non-zero return code indicates a failure.
#
################################################################
proc slow_kill { pid } {
	global bin_kill

	# Two interrupts first, then a forced kill after a one second pause.
	# Errors from kill are ignored on purpose.
	foreach signal {INT INT} {
		catch {exec $bin_kill -$signal $pid}
	}
	sleep 1
	catch {exec $bin_kill -KILL $pid}
	return 0
}

################################################################
#
# NAME
#	get_my_id - gets the id from the running user
#
# SYNOPSIS
#	get_my_id
#
# DESCRIPTION
#	Spawns the id command with echo disabled and captures the line
#	starting with "uid=". Fails the test if no such line is seen.
#
# RETURN VALUE
#	output of id
#
################################################################
proc get_my_id {} {
	global bin_id number

	set id_line -1

	log_user 0
	spawn $bin_id
	expect {
		-re "(uid=.*\n)" {
			set id_line $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	if {$id_line == -1} {
		fail "Unable to get user info"
	}
	return $id_line
}

################################################################
#
# NAME
#	get_my_user_name - gets the name from the running user
#
# SYNOPSIS
#	get_my_user_name
#
# DESCRIPTION
#	Spawns id -nu with echo disabled and captures the printed name.
#	Fails the test if nothing is captured.
#
# RETURN VALUE
#	The user name printed by id -nu
#
################################################################
proc get_my_user_name { } {
	global bin_id re_word_str

	set name -1

	log_user 0
	spawn $bin_id -nu
	expect {
		-re "($re_word_str)" {
			set name $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	if {$name == -1} {
		fail "Unable to get user name"
	}
	return $name
}

################################################################
#
# NAME
#	get_my_uid - gets the uid from the running user
#
# SYNOPSIS
#	get_my_uid
#
# RETURN VALUE
#	A non-zero return code indicates a failure.
#
################################################################
proc get_my_uid { } {
	global bin_id number

	# Stays at -1 when id prints nothing that looks like a number
	set my_uid -1

	log_user 0
	spawn $bin_id -u
	expect {
		-re "($number)" {
			set my_uid $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $my_uid
}

################################################################
#
# NAME
#	get_my_gid - gets the gid from the running user
#
# SYNOPSIS
#	get_my_gid
#
# DESCRIPTION
#	Spawns id -g with echo disabled and captures the printed number.
#
# RETURN VALUE
#	The numeric group id printed by id -g, or -1 if none was captured
#
################################################################
proc get_my_gid { } {
	global bin_id number

	# Stays at -1 when id prints nothing that looks like a number
	set my_gid -1

	log_user 0
	spawn $bin_id -g
	expect {
		-re "($number)" {
			set my_gid $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $my_gid
}

################################################################
#
# NAME
#	kill_salloc - kills all salloc commands associated with this user
#
# SYNOPSIS
#	kill_salloc
#
# DESCRIPTION
#	Kill all salloc commands associated with this user.
#	Issue two SIGINT, sleep 1 and a SIGKILL
#
# RETURN VALUE
#	Always 0; errors from pkill are ignored.
#
# NOTE
#	Use slow_kill instead of kill_salloc if you can capture
#	the process id
#
################################################################
proc kill_salloc { } {
	global bin_id bin_pkill bin_sleep number

	set uid [get_my_uid]
	foreach signal {INT INT} {
		catch {exec $bin_pkill -$signal -u $uid salloc}
	}
	sleep 1
	catch {exec $bin_pkill -KILL -u $uid salloc}
	return 0
}

################################################################
#
# NAME
#	kill_srun - kills all srun commands associated with this user
#
# SYNOPSIS
#	kill_srun
#
# DESCRIPTION
#	Kill all srun commands associated with this user.
#	Issue two SIGINT, sleep 1 and a SIGKILL
#
# RETURN VALUE
#	A non-zero return code indicates a failure.
#
# NOTE
#	Use slow_kill instead of kill_srun if you can capture
#	the process id
#
################################################################
proc kill_srun { } {
	global bin_id bin_pkill bin_sleep number

	set uid [get_my_uid]
	catch {exec $bin_pkill -INT -u $uid srun}
	catch {exec $bin_pkill -INT -u $uid srun}
	sleep 1
	catch {exec $bin_pkill -KILL -u $uid srun}
	return 0
}

################################################################
#
# NAME
#	wait_for - generic wait utility
#
# SYNOPSIS
#	wait_for ?options? condition body
#
# DESCRIPTION
#	Generic wait utility allowing you to repeatedly execute a generic block
#	of code until a specified boolean expression is met. The code block and
#	condition check occur every poll interval until a timeout is reached.
#
#	Both the code block and the condition are evaluated in the caller's
#	stack frame. An error raised by the code block does not abort the
#	wait; it is logged at trace level and the condition is still checked.
#	The code block is always evaluated at least once.
#
# OPTIONS
#	-timeout <float>
#		time in seconds to wait for the command to complete before
#		timing out (default is 60.0)
#	-pollinterval <float>
#		time in seconds between each loop execution and condition check
#		(defaults to 1.0)
#
# ARGUMENTS
#	condition
#		The boolean expression to test
#	body
#		A block of code to evaluate in the invoking stack frame
#
# RETURN VALUE
#	true if the condition is met before the timeout occurs
#	false if the timeout occurs before the condition is met
#
################################################################
proc wait_for args {
	set timeout 60
	set poll_interval 1

	# Options may be abbreviated, e.g. -time or -poll
	while {[llength $args]} {
		switch -glob -- [lindex $args 0] {
			-time*  {set args [lassign $args - timeout]}
			-poll*  {set args [lassign $args - poll_interval]}
			-*      {fail "Unknown option: [lindex $args 0]"}
			default break
		}
	}
	if {[llength $args] == 2} {
		lassign $args condition body
	} else {
		fail "Invalid number of arguments [llength $args]: $args"
	}

	set start_time [format "%.3f" [expr {[clock milliseconds] / 1000.000}]]
	set deadline [expr {$start_time + $timeout}]
	log_debug "Waiting for $condition starting at [_wait_for_timestamp $start_time]"

	while {1} {
		# Evaluate code block in the caller's frame. Errors must not end
		# the wait, but they are reported instead of silently dropped.
		log_trace "Evaluating code block ([string trim $body])"
		set ret [catch {uplevel 1 $body} result]
		if {$ret == 1} {
			log_trace "Code block raised an error: $result"
		}

		# Check condition, also in the caller's frame. Building the
		# command with list keeps the condition a single expr argument.
		if {[uplevel 1 [list expr $condition]]} {
			set now [format "%.3f" [expr {[clock milliseconds] / 1000.000}]]
			log_debug "Condition ($condition) was met at [_wait_for_timestamp $now]"
			return true
		} else {
			log_trace "Condition ($condition) was not met"
		}

		# Sleep poll interval
		log_trace "Sleeping for $poll_interval seconds"
		after [expr {int($poll_interval * 1000)}]

		# Check if we have surpassed our timeout
		set now [format "%.3f" [expr {[clock milliseconds] / 1000.000}]]
		log_trace "Checking whether the current time ([_wait_for_timestamp $now]) is greater than the start time plus the timeout ([_wait_for_timestamp $deadline])"
		if {$now > $deadline} {
			log_warn "Condition ($condition) did not occur before timeout ($timeout) seconds"
			return false
		}
	}
}

# Private helper of wait_for: renders an epoch time given in (fractional)
# seconds as a date-time string followed by a three digit millisecond part.
proc _wait_for_timestamp { epoch } {
	set epoch [format "%.3f" $epoch]
	set seconds [clock format [expr {int($epoch)}] -format %Y-%m-%dT%X]
	return "$seconds.[lindex [split $epoch .] 1]"
}

################################################################
#
# NAME
#	wait_for_command - waits for command output to match a pattern
#
# SYNOPSIS
#	wait_for_command command args regex ?matches_in? ?or_more? ?matches_out? ?max_delay?
#
# DESCRIPTION
#	Executes a command every testsuite_poll_interval until a regex
#	pattern is matched in the output, or timeout after max_delay.
#
# ARGUMENTS
#	command
#		The command to run via spawn.
#	args
#		The arguments to the command, as a single string.
#	regex
#		The regex pattern to search for in the command
#		output. Can be a simple string.
#	matches_in
#		The number of times to match the regex. Defaults to 1.
#	or_more
#		If 1, allow for matching the regex pattern match_cnt *or more*
#		times, instead of exactly match_cnt times. Defaults to 0.
#	matches_out
#		The upvar (a variable name to "pass by reference" in TCL)
#		to set/return the number of matches found.
#		Useful if or_more is 1 and the caller wants the matches found.
#	max_delay
#		The timeout seconds to execute the command and wait on the
#		output before giving up. Defaults to 120.
#
# RETURN VALUE
#	0 on success and 1 on failure. On failure, an error is logged to
#	the output. If matches_out is specified, the number of matches will
#	be returned via the reference/upvar matches_out.
#
################################################################
proc wait_for_command { command args regex {matches_in 1} {or_more 0} {matches_out ""} {max_delay 120} } {
	global bin_sleep testsuite_poll_interval

	# Expose the match counter to the caller when a variable name is given
	if {$matches_out != ""} {
		upvar $matches_out matches
	}

	for {set delay 0} {$delay < $max_delay} {incr delay $testsuite_poll_interval} {
		set matches 0

		# `{*}` breaks apart a string into individual pieces
		spawn $command {*}$args
		expect {
			-re $regex {
				incr matches
				exp_continue
			}
			timeout {
				log_error "$command not responding after $delay seconds polling"
				# Return failure
				return 1
			}
			eof {
				wait
			}
		}

		set exact [expr {$matches == $matches_in}]
		set enough [expr {$or_more == 1 && $matches >= $matches_in}]
		if {$exact || $enough} {
			# Return success
			return 0
		}

		log_info "[lindex [info level 0] 0] polled $matches matches of '$regex', but expecting $matches_in"
		exec $bin_sleep $testsuite_poll_interval
	}

	# Out of time: describe what was expected
	if {$or_more == 1} {
		set match_str "$matches_in or more times"
	} else {
		set unit [expr {$matches_in == 1 ? "time" : "times"}]
		set match_str "exactly $matches_in $unit"
	}
	log_error "Failed to match regex `$regex` $match_str after $max_delay seconds for command `$command $args`."
	# Return failure
	return 1
}

################################################################
#
# NAME
#	wait_for_file - waits for a file to exist with non-zero size
#
# SYNOPSIS
#	wait_for_file file_name
#
# DESCRIPTION
#	Wait for the specified file to exist and have a non-zero size.
#	Note that if JobFileAppend=0 is configured, a file can exist and
#	be purged then be re-created. Polls every $testsuite_poll_interval
#	seconds.
#
# RETURN VALUE
#	A non-zero return code indicates a failure.
#
################################################################
proc wait_for_file { file_name } {
	global bin_sleep max_file_delay testsuite_poll_interval

	# Directory holding the file; listed on every poll to refresh
	# possibly stale NFS attribute information.
	set dir_name [file dirname $file_name]

	# NOTE(review): only the existence of the file is tested here, its
	# size is not checked - confirm whether a size test is wanted.
	for {set my_delay 0} {$my_delay <= $max_file_delay} \
	    {set my_delay [expr {$my_delay + $testsuite_poll_interval}]} {
		if {[file exists $file_name]} {
			# Add small delay for I/O buffering
			exec $bin_sleep 1
			return 0
		}
		exec $bin_sleep $testsuite_poll_interval

		# Expect may fail to load current NFS info.
		# Use the ls command to load current info. The listing is
		# best effort only, so a failing ls must not abort the wait.
		catch {exec /bin/ls $dir_name}
	}
	log_error "Timeout waiting for file $file_name"
	return 1
}

################################################################
#
# NAME
#	wait_for_job - waits for job to be in desired state
#
# SYNOPSIS
#	wait_for_job job_id desired_state ?het_job?
#
# DESCRIPTION
#	Wait for job to be in desired state. Can handle het job components.
#
# ARGUMENTS
#	job_id
#		The Slurm job id of a job we want to wait for.
#	desired_state
#		The state you want the job to attain before returning.
#		Currently supports:
#			DONE any terminated state
#			PENDING job is pending
#			RUNNING job is running
#			SPECIAL_EXIT
#			SUSPENDED job is suspended
#	het_job
#		If set, checks the state of each component job if the job
#		is a het one.
#
# RETURN VALUE
#	A non-zero return code indicates a failure
#
# SEE ALSO
#	_wait_for_single_job
#
################################################################
proc wait_for_job { job_id desired_state {het_job 0}} {
	# For a het job ask for the component job ids; fall back to the
	# job id itself when none are requested or none are reported.
	set targets ""
	if {$het_job} {
		set targets [get_het_job_ids $job_id 1]
	}
	if {$targets == ""} {
		set targets $job_id
	}

	set status 0
	foreach target $targets {
		set status [_wait_for_single_job $target $desired_state]
		if {$status} {
			# Stop at the first component that fails
			break
		}
	}
	return $status
}

################################################################
#
# NAME
#	wait_for_account_done - cancels and waits on jobs in specified accounts
#
# SYNOPSIS
#	wait_for_account_done accounts
#
# DESCRIPTION
#	Runs scancel -A for every account of the comma separated list, then
#	polls squeue every $testsuite_poll_interval seconds until no job of
#	the account(s) is listed any more.
#
# RETURN VALUE
#	A non-zero return code indicates a failure (no account given, or
#	jobs still listed after more than max_job_state_delay seconds).
#
# NOTE
#	Earlier documentation mentioned a two second sleep before reporting
#	completion; this procedure does not perform such a sleep.
#
################################################################
proc wait_for_account_done { accounts } {
	global scancel squeue max_job_state_delay re_word_str testsuite_poll_interval

	if {$accounts == ""} {
		log_error "wait_for_account_done: no account given"
		return 1
	}

	log_user 0

	# Cancel the jobs of each account in turn
	foreach acct [split $accounts ","] {
		spawn $scancel -A $acct
		expect {
			timeout {
				log_warn "No response from scancel"
			}
			eof {
				wait
			}
		}
	}

	# Poll until squeue reports nothing for the account(s)
	set rc 0
	set elapsed 0
	while 1 {
		set busy 0
		spawn $squeue -o Account=%a -h -A$accounts
		expect {
			-re "Account=($re_word_str)" {
				set busy 1
				exp_continue
			}
			eof {
				wait
			}
		}
		if {!$busy} {
			log_debug "Account(s) $accounts is/are empty"
			break
		}
		if {$elapsed > $max_job_state_delay} {
			log_error "Timeout waiting for account(s) '$accounts' to be finished"
			set rc 1
			break
		}
		exec sleep $testsuite_poll_interval
		set elapsed [expr {$elapsed + $testsuite_poll_interval}]
	}

	log_user 1
	return $rc
}
################################################################
#
# NAME
#	wait_for_part_done - cancels and waits on jobs in specified partition
#
# SYNOPSIS
#	wait_for_part_done partition
#
# DESCRIPTION
#	Runs scancel -p on the partition, then polls squeue every
#	$testsuite_poll_interval seconds until no job is listed for it.
#
# RETURN VALUE
#	A non-zero return code indicates a failure (no partition given, or
#	jobs still listed after more than max_job_state_delay seconds).
#
# NOTE
#	Earlier documentation mentioned a two second sleep before reporting
#	completion; this procedure does not perform such a sleep.
#
################################################################
proc wait_for_part_done { part } {
	global scancel squeue max_job_state_delay re_word_str testsuite_poll_interval

	if {$part == ""} {
		log_error "wait_for_part_done: no partition given"
		return 1
	}

	log_user 0

	# Cancel everything in the partition
	spawn $scancel -p $part
	expect {
		timeout {
			log_error "wait_for_part_done: No response from scancel"
		}
		eof {
			wait
		}
	}

	# Poll until squeue reports nothing for the partition
	set rc 0
	set elapsed 0
	while 1 {
		set busy 0
		spawn $squeue -o Part=%P -h -p$part
		expect {
			-re "Part=($re_word_str)" {
				set busy 1
				exp_continue
			}
			eof {
				wait
			}
		}
		if {!$busy} {
			log_debug "Partition $part is empty"
			break
		}
		if {$elapsed > $max_job_state_delay} {
			log_error "wait_for_part_done: Timeout waiting for partition '$part' to be finished"
			set rc 1
			break
		}
		exec sleep $testsuite_poll_interval
		set elapsed [expr {$elapsed + $testsuite_poll_interval}]
	}

	log_user 1
	return $rc
}

################################################################
#
# NAME
#	wait_for_step - waits for a job step to be found
#
# SYNOPSIS
#	wait_for_step step_id
#
# DESCRIPTION
#	Wait for a job step to be found.
#	Polls every $testsuite_poll_interval seconds.
#
# RETURN VALUE
#	A non-zero return code indicates a failure.
#
################################################################
proc wait_for_step { step_id } {
	global scontrol max_job_state_delay testsuite_poll_interval

	set elapsed 0
	while 1 {
		# Only the first line of the one-line-per-step output is read
		set pipe [open "|$scontrol -o show step $step_id"]
		gets $pipe first_line
		catch {close $pipe}

		# Either field in the output means the step has been found
		if {[regexp {Nodes=} $first_line] || [regexp {MidplaneList=} $first_line]} {
			return 0
		}

		if {$elapsed > $max_job_state_delay} {
			log_error "Timeout waiting for job step"
			return 1
		}

		log_info "Step $step_id not done yet. Waiting for $testsuite_poll_interval seconds"
		exec sleep $testsuite_poll_interval
		set elapsed [expr {$elapsed + $testsuite_poll_interval}]
	}
}

################################################################
#
# NAME
#	wait_for_all_jobs - waits for jobs to finish having a specified name
#
# SYNOPSIS
#	wait_for_all_jobs job_name
#
# DESCRIPTION
#	Wait for previously submitted Slurm jobs to finish of a
#	certain name. Iterates every $testsuite_poll_interval seconds.
#
# ARGUMENTS
#	job_name
#		The name of job to wait for
#
# RETURN VALUE
#	-1 on failure, 0 if all jobs are done, and the remaining job count
#	if not all jobs are done after $wait_for_all_jobs_iterations
#	iterations.
#
################################################################
proc wait_for_all_jobs { job_name } {
	global scancel squeue bin_sleep wait_for_all_jobs_iterations testsuite_poll_interval

	# Local timeout used by the expect commands below
	set timeout 30
	set remaining 0

	log_info "Waiting for all jobs to terminate"
	for {set iter 0} {$iter < $wait_for_all_jobs_iterations} {incr iter} {
		log_user 0
		set remaining 0
		spawn $squeue -o %j
		expect {
			-re "$job_name" {
				incr remaining
				exp_continue
			}
			-re "error" {
				set remaining -1
			}
			timeout {
				log_warn "No response from squeue"
				set remaining -1
			}
			eof {
				wait
			}
		}
		log_user 1

		if {$remaining == 0} {
			log_info "All jobs complete"
			break
		} elseif {$remaining > 0} {
			log_info " $remaining jobs remaining"
			exec sleep $testsuite_poll_interval
		} else {
			# squeue failed or did not answer
			break
		}
	}

	# Anything left over (or a squeue failure) triggers a cancel by name
	if {$remaining != 0} {
		spawn $scancel -n $job_name
		expect {
			timeout {
				log_warn "No response from scancel"
			}
			eof {
				wait
			}
		}
	}
	return $remaining
}

################################################################
#
# NAME
#	wait_job_reason - waits for a desired job state and reason
#
# SYNOPSIS
#	wait_job_reason job_id ?desired_state? ?desired_reason_list?
#
# DESCRIPTION
#	Wait until the job is in desired state and reason is one
#	of the desired ones. Or to max_job_state_delay.
#
# ARGUMENTS
#	job_id
#		The job to wait for
#	desired_state
#		Desired state.
#	desired_reason_list
#		List of desired reasons. Empty list means that any reason
#		is ok.
#
# RETURN VALUE
#	0 when job is in the desired state and reason is one
#	of the desired ones, or 1 otherwise.
#
################################################################
proc wait_job_reason { job_id {desired_state "PENDING"} {desired_reason_list ""} } {
	global scontrol re_word_str max_job_state_delay testsuite_poll_interval

	# States from which the job cannot move to the desired one any more
	set final_state "COMPLETED CANCELLED FAILED TIMEOUT DEADLINE OUT_OF_MEMORY"

	# Silence spawned output while polling, restore the setting on exit
	set log_user_prev [log_user -info]
	log_user 0

	set my_delay 0
	while true {
		# Reset on every poll so that a poll without any match is not
		# evaluated with unset or stale values.
		set job_state ""
		set job_reason ""

		spawn $scontrol show job $job_id
		expect {
			-re "JobState=($re_word_str) Reason=(\\S+)" {
				set job_state $expect_out(1,string)
				set job_reason $expect_out(2,string)
			}
			timeout {
				log_error "No response from scontrol show job"
				set rc 1
				break
			}
			eof {
				wait
			}
		}

		# Check if both state and reason are the desired ones
		if {$job_state == $desired_state} {
			set found 0
			if {$desired_reason_list == ""} {
				set found 1
			}
			foreach desired_reason $desired_reason_list {
				if {$job_reason == $desired_reason} {
					set found 1
				}
			}
			if {$found} {
				set rc 0
				break
			}
		} elseif {[lsearch -exact $final_state $job_state] >= 0} {
			# Job is in final step no need to wait longer
			log_error [format "Job in final state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \
					$job_state $job_reason \
					$desired_state $desired_reason_list]
			set rc 1
			break
		}

		# Check if this was the last poll
		if {$my_delay > $max_job_state_delay} {
			log_error "wait_job_reason timeout"
			set rc 1
			break
		}

		set remaining_sec [expr {$max_job_state_delay - $my_delay}]
		log_info [format "Job in state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \
				$job_state $job_reason \
				$desired_state $desired_reason_list]
		log_info [format "Polling again in %ss, %ss to timeout." \
				$testsuite_poll_interval $remaining_sec]
		sleep $testsuite_poll_interval
		set my_delay [expr {$my_delay + $testsuite_poll_interval}]
	}

	log_user $log_user_prev
	return $rc
}

################################################################
#
# NAME
#	test_config_overrides - checks if SlurmdParameters=config_overrides is set
#
# SYNOPSIS
#	test_config_overrides
#
# RETURN VALUE
#	Returns 1 if SlurmdParameters contains config_overrides, 0 otherwise
#
################################################################
proc test_config_overrides { } {
	global scontrol re_word_str

	log_user 0
	set config_overrides 0
	spawn $scontrol show config
	expect {
		-re "SlurmdParameters *= *($re_word_str)" {
			if {[string first "config_overrides" $expect_out(1,string)] != -1} {
				set config_overrides 1
			} else {
				set config_overrides 0
			}
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $config_overrides
}

################################################################
#
# NAME
#	test_assoc_enforced - determines if we need an association to run a job
#
# SYNOPSIS
#	test_assoc_enforced
#
# DESCRIPTION
#	Determine if we need an association to run a job. This is based upon
#	the value of AccountingStorageEnforce in the slurm.conf.
#
# RETURN VALUE
#	Returns 1 if AccountingStorageEnforce starts with associations,
#	0 otherwise
#
################################################################
proc test_assoc_enforced { } {
	global scontrol number

	set enforced 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "AccountingStorageEnforce *= associations" {
			set enforced 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $enforced
}

################################################################
#
# NAME
#	test_limits_enforced - check if AccountingStorageEnforce limits is set
#
# SYNOPSIS
#	test_limits_enforced
#
# RETURN VALUE
#	Returns 1 if limits is set, else 0
#
################################################################
proc test_limits_enforced { } {
	global scontrol

	set has_limits 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "AccountingStorageEnforce *= (\[a-z]+),limits" {
			# limits listed directly after the first option
			set has_limits 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $has_limits
}

################################################################
#
# NAME
#	test_enforce_part_limits - returns value of EnforcePartLimits
#
# SYNOPSIS
#	test_enforce_part_limits
#
# RETURN VALUE
#	Returns the EnforcePartLimits value reported by scontrol, or
#	UNKNOWN if it is not reported
#
################################################################
proc test_enforce_part_limits { } {
	global re_word_str scontrol

	set value "UNKNOWN"
	log_user 0
	spawn $scontrol show config
	expect {
		-re "EnforcePartLimits *= ($re_word_str)" {
			set value $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $value
}

################################################################
#
# NAME
#	test_gang - determines if gang scheduling is configured
#
# SYNOPSIS
#	test_gang
#
# RETURN VALUE
#	Returns 1 if PreemptMode includes GANG, 0 otherwise
#
################################################################
proc test_gang { } {
	global scontrol

	set is_gang 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "PreemptMode *= .*GANG" {
			set is_gang 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $is_gang
}

################################################################
#
# NAME
#	test_power_save - checks whether power save mode is enabled
#
# SYNOPSIS
#	test_power_save
#
# RETURN VALUE
#	Return 1 if power save mode is enabled, 0 otherwise
#
################################################################
proc test_power_save { } {
	global scontrol number

	set idle_limit 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "SuspendTime *= ($number)" {
			set idle_limit $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	# A SuspendTime of zero (or none reported) means power save is off
	return [expr {$idle_limit != 0}]
}

################################################################
#
# NAME
#	slurmd_user_root - checks whether SlurmdUser is root
#
# SYNOPSIS
#	slurmd_user_root
#
# RETURN VALUE
#	Return 1 if the SlurmdUser is root, 0 otherwise
#
################################################################
proc slurmd_user_root { } {
	global scontrol

	set is_root 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "SlurmdUser *= root" {
			set is_root 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $is_root
}

################################################################
#
# NAME
#	test_topology - determines if system is topology aware
#
# SYNOPSIS
#	test_topology
#
# RETURN VALUE
#	Returns 0 if TopologyPlugin is topology/none, 1 otherwise
#
################################################################
proc test_topology { } {
	global scontrol

	# Topology awareness is assumed unless topology/none is reported
	set aware 1
	log_user 0
	spawn $scontrol show config
	expect {
		-re "TopologyPlugin *= *topology/none" {
			set aware 0
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $aware
}

################################################################
#
# NAME
#	get_config - returns a dictionary of slurm configuration parameters
# SYNOPSIS
#	get_config
#
# DESCRIPTION
#	Uses `scontrol show config` to return the slurm configuration as
#	a dictionary of parameter values.
#
################################################################
proc get_config { } {
	global scontrol

	# Start from an empty dictionary so that a result is returned even
	# when no line of the output has the "Name = value" form.
	set config_dict [dict create]

	set output [run_command_output -fail -nolog "$scontrol show config"]
	foreach line [split $output "\n"] {
		if {[regexp {^(\w+) += (.*)$} $line {} param_name param_value] == 1} {
			dict set config_dict $param_name $param_value
		}
	}

	return $config_dict
}

################################################################
#
# NAME
#	get_config_param - returns a slurm configuration parameter
#
# SYNOPSIS
#	get_config_param parameter_name
#
# DESCRIPTION
#	Returns a specific configuration parameter value.
#
# RETURN VALUE
#	Returns the value of the specified parameter or MISSING if it does not
#	exist.
#
################################################################
proc get_config_param { parameter_name } {
	set config_dict [get_config]

	if {[dict exists $config_dict $parameter_name]} {
		return [dict get $config_dict $parameter_name]
	}
	return "MISSING"
}

################################################################
#
# NAME
#	get_affinity_types - gets the task plugins running with task/ stripped
#
# SYNOPSIS
#	get_affinity_types
#
# RETURN VALUE
#	Returns comma separated list of task plugins running without the task/
#
################################################################
proc get_affinity_types { } {
	global scontrol re_word_str

	log_user 0
	set affinity ""
	spawn $scontrol show config
	expect {
		-re "TaskPlugin *= ($re_word_str)" {
			# Splitting on both "," and "/" leaves a "task" element
			# in front of every plugin name; drop all of them.
			set parts [split $expect_out(1,string) ",/"]
			while 1 {
				set task_found [lsearch $parts "task"]
				if {$task_found == -1} break
				set parts [lreplace $parts $task_found $task_found]
			}
			set affinity [join $parts ","]
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $affinity
}

################################################################
#
# NAME
#	get_affinity_params - gets the task plugin parameters
#
# SYNOPSIS
#	get_affinity_params
#
# RETURN VALUE
#	Returns value of TaskPluginParam
#
################################################################
proc get_affinity_params { } {
	global scontrol re_word_str

	set value ""
	log_user 0
	spawn $scontrol show config
	expect {
		-re "TaskPluginParam *= ($re_word_str)" {
			set value $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $value
}

################################################################
#
# NAME
#	test_constrain_devices - determines if devices are constrained by cgroup
#
# SYNOPSIS
#	test_constrain_devices
#
# DESCRIPTION
#	Determine if devices are constrained by cgroup, based on the
#	ConstrainDevices value reported by scontrol show config.
#
# RETURN VALUE
#	Returns 1 if constrained, 0 otherwise
#
################################################################
proc test_constrain_devices { } {
	global scontrol number

	set constrained 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "ConstrainDevices *= yes" {
			set constrained 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $constrained
}

################################################################
#
# NAME
#	get_mps_count_by_index - gets the count of a specific gres/mps device
#
# SYNOPSIS
#	get_mps_count_by_index index hostname
#
# RETURN VALUE
#	Returns the Count of a specific gres/mps device, 0 if it is not
#	reported by slurmd -G
#
################################################################
proc get_mps_count_by_index { index hostname } {
	global slurmd number re_word_str

	set mps_count 0
	log_user 0
	spawn $slurmd -G -N $hostname
	expect {
		-re "Gres Name=mps Type=$re_word_str Count=($number) Index=$index" {
			set mps_count $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $mps_count
}

################################################################
#
# NAME
#	get_bb_emulate - determines if Cray burst buffers API is emulated
#
# SYNOPSIS
#	get_bb_emulate
#
# DESCRIPTION
#	Determine if Cray burst buffers API is emulated
#
# RETURN VALUE
#	1 if true, 0 if false
#
################################################################
proc get_bb_emulate { } {
	global scontrol

	set emulated 0
	log_user 0
	spawn $scontrol show burst
	expect {
		-re "EmulateCray" {
			set emulated 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $emulated
}

################################################################
#
# NAME
#	get_bb_persistent - determines if persistent burst buffers can be created by users
#
# SYNOPSIS
#	get_bb_persistent
#
# DESCRIPTION
#	Determine if persistent burst buffers can be created by users
#
# RETURN VALUE
#	1 if true, 0 if false
#
################################################################
proc get_bb_persistent { } {
	global scontrol

	set persistent 0
	log_user 0
	spawn $scontrol show burst
	expect {
		-re "EnablePersistent" {
			set persistent 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $persistent
}

################################################################
#
# NAME
#	get_bb_types - gets the burst buffer plugins with burst_buffer/ stripped
#
# SYNOPSIS
#	get_bb_types
#
# RETURN VALUE
#	Returns comma separated list of burst buffer plugins running without
#	the burst_buffer/
#
################################################################
proc get_bb_types { } {
	global scontrol re_word_str

	set plugin_names ""
	log_user 0
	spawn $scontrol show config
	expect {
		-re "BurstBufferType *= ($re_word_str)" {
			# Splitting on "," and "/" leaves a "burst_buffer" element
			# in front of every plugin name; remove each of them.
			set pieces [split $expect_out(1,string) ",/"]
			while {[set prefix_at [lsearch $pieces "burst_buffer"]] != -1} {
				set pieces [lreplace $pieces $prefix_at $prefix_at]
			}
			set plugin_names [join $pieces ","]
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $plugin_names
}

################################################################
#
# NAME
#	get_cpu_governors - gets the CpuFreqGovernor configuration parameter
#
# SYNOPSIS
#	get_cpu_governors
#
# RETURN VALUE
#	Returns comma separated list of available CPU governor's
#
################################################################
proc get_cpu_governors { } {
	global scontrol re_word_str

	set value ""
	log_user 0
	spawn $scontrol show config
	expect {
		-re "CpuFreqGovernors *= ($re_word_str)" {
			set value $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $value
}

################################################################
#
# NAME
#	test_cpu_affinity - determines if system is using the task/affinity plugin
#
# SYNOPSIS
#	test_cpu_affinity
#
# RETURN VALUE
#	Returns 1 if enforcing, 0 if none
#
################################################################
proc test_cpu_affinity { } {
	log_user 0
	set plugins [split [get_affinity_types] ","]
	set enforcing [expr {[lsearch $plugins "affinity"] != -1}]
	log_user 1

	return $enforcing
}

################################################################
#
# NAME
#	test_cpu_affinity_or_cgroup - determines if system is enforcing CPU affinity
#
# SYNOPSIS
#	test_cpu_affinity_or_cgroup
#
# DESCRIPTION
#	Determine if system is enforcing CPU affinity (using
#	either the task/affinity and/or task/cgroup plugin)
#
# RETURN VALUE
#	Returns 1 if enforcing, 0 if none
#
################################################################
proc test_cpu_affinity_or_cgroup { } {
	global scontrol re_word_str

	log_user 0
	set enforcing 0
	set plugins [split [get_affinity_types] ","]
	if {[lsearch $plugins "affinity"] != -1} {
		set enforcing 1
	} elseif {[lsearch $plugins "cgroup"] != -1} {
		# task/cgroup only enforces affinity when TaskAffinity is yes
		spawn $scontrol show config
		expect {
			-re "TaskAffinity *= yes" {
				set enforcing 1
				exp_continue
			}
			eof {
				wait
			}
		}
	}
	log_user 1

	return $enforcing
}

################################################################
#
# NAME
#	test_mem_affinity - determines if system is enforcing memory affinity
#
# SYNOPSIS
#	test_mem_affinity
#
# RETURN VALUE
#	Returns 1 if enforcing, 0 if none
#
################################################################
proc test_mem_affinity { } {
	global scontrol re_word_str

	log_user 0
	set plugins [split [get_affinity_types] ","]
	set enforcing [expr {[lsearch $plugins "affinity"] != -1}]
	log_user 1

	return $enforcing
}
################################################################
#
# NAME
#	test_track_wckey_slurmctld - determines if we track wckeys
#
# SYNOPSIS
#	test_track_wckey_slurmctld
#
# DESCRIPTION
#	Determine if we track workload characterization keys.
#	This is based upon the value of TrackWCKey in the slurm.conf.
#
# RETURN VALUE
#	Returns 1 if TrackWCKey is Yes, 0 otherwise
#
################################################################
proc test_track_wckey_slurmctld { } {
	global scontrol number

	set tracking 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "TrackWCKey *= Yes" {
			set tracking 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $tracking
}

################################################################
#
# NAME
#	test_account_storage - determines if we are using a usable accounting storage plugin
#
# SYNOPSIS
#	test_account_storage
#
# DESCRIPTION
#	Determine if we are using a usable accounting storage package.
#	This is based upon the value of AccountingStorageType in the slurm.conf.
#
# RETURN VALUE
#	Returns 1 if the system is running an accounting storage type
#	that is complete, 0 otherwise
#
################################################################
proc test_account_storage { } {
	global scontrol

	set usable 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "(accounting_storage/slurmdbd|accounting_storage/mysql|accounting_storage/pgsql)" {
			set usable 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $usable
}

################################################################
#
# NAME
#	test_enforce_limits - determines if resource limits are enforced
#
# SYNOPSIS
#	test_enforce_limits
#
# DESCRIPTION
#	Determine whether resouce limits are enforced. This is based upon
#	the value of AccountingStorageEnforce in the slurm.conf.
#
# RETURN VALUE
#	Returns 1 if the system is enforcing limits, 0 otherwise
#
################################################################
proc test_enforce_limits { } {
	global re_word_str scontrol

	set enforcing 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "AccountingStorageEnforce *= ($re_word_str)" {
			# Either the safe or the limits option counts as enforcing
			set options $expect_out(1,string)
			if {[string first "safe" $options] != -1 || [string first "limits" $options] != -1} {
				set enforcing 1
			}
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $enforcing
}

################################################################
#
# NAME
#	test_allow_spec_resc - returns the value of AllowSpecResourcesUsage
#
# SYNOPSIS
#	test_allow_spec_resc
#
# RETURN VALUE
#	Returns 1 if AllowSpecResourcesUsage is Yes, 0 if it is No and 2 in
#	case of error
#
# ENVIRONMENT
#	It also sets the global variable exit_code to 1 in case of error.
#
################################################################
proc test_allow_spec_resc { } {
	global exit_code re_word_str scontrol

	# 2 means the parameter has not been seen (yet)
	set allowed 2
	log_user 0
	spawn $scontrol show config
	expect {
		-re "AllowSpecResourcesUsage *= ($re_word_str)" {
			set reported $expect_out(1,string)
			if {[string equal $reported Yes]} {
				set allowed 1
			} elseif {[string equal $reported No]} {
				set allowed 0
			}
			exp_continue
		}
		timeout {
			log_error "scontrol show config time out"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$allowed == 2} {
		log_error "AllowSpecResourcesUsage not found in scontrol show config"
		set exit_code 1
	}
	log_user 1

	return $allowed
}

################################################################
#
# NAME
#	test_enforce_safe_set - determines if AccountingStorageEnforce=safe is set
#
# SYNOPSIS
#	test_enforce_safe_set
#
# DESCRIPTION
#	Determine if AccountingStorageEnforce=safe is set in the slurm.conf.
#
# RETURN VALUE
#	Returns 1 if the system is running with safe limits, 0 otherwise
#
################################################################
proc test_enforce_safe_set { } {
	global re_word_str scontrol

	set safe_set 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "AccountingStorageEnforce *= ($re_word_str)" {
			if {[string first "safe" $expect_out(1,string)] != -1} {
				set safe_set 1
			}
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $safe_set
}

################################################################
#
# NAME
#	test_enforce_qos_set - determines if AccountingStorageEnforce=qos is set
#
# SYNOPSIS
#	test_enforce_qos_set
#
# DESCRIPTION
#	Determine if AccountingStorageEnforce=qos is set in the slurm.conf.
#
# RETURN VALUE
#	Returns 1 if the system is running with qos enforcement, 0 otherwise
#
################################################################
proc test_enforce_qos_set { } {
	global re_word_str scontrol

	set qos_set 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "AccountingStorageEnforce *= ($re_word_str)" {
			if {[string first "qos" $expect_out(1,string)] != -1} {
				set qos_set 1
			}
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $qos_set
}

################################################################
#
# NAME
#	test_using_slurmdbd - checks whether using slurmdbd
#
# SYNOPSIS
#	test_using_slurmdbd
#
# DESCRIPTION
#	Since there is a lag at which the slurmdbd processes a job start we
#	need to wait a bit to make sure the data has been set before proceeding.
#	This is based upon the value of AccountingStorageType in the slurm.conf.
#
# RETURN VALUE
#	Returns 1 if the system is running with slurmdbd, 0 otherwise
#
################################################################
proc test_using_slurmdbd { } {
	global scontrol

	log_user 0
	set account_storage 0
	spawn $scontrol show config
	expect {
		-re "(accounting_storage/slurmdbd)" {
			set account_storage 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $account_storage
}

################################################################
#
# NAME
#	priority_type - gets the priority plugin type
#
# SYNOPSIS
#	priority_type
#
# DESCRIPTION
#	Use scontrol to determine the priority plugin
#
# RETURN VALUE
#	Name of priority type (without the priority/ prefix), or an empty
#	string (after logging an error) if it could not be identified
#
################################################################
proc priority_type {} {
	global scontrol

	log_user 0
	set name ""
	set fd [open "|$scontrol show config"]
	while {[gets $fd line] != -1} {
		if {[regexp {^PriorityType *= priority/(\w+)} $line frag name] == 1} {
			break
		}
	}
	# Closing early may report a broken pipe; that is not an error here
	catch {close $fd}
	log_user 1

	if {[string length $name] == 0} {
		log_error "Could not identify the Priority Type"
	}

	return $name
}

################################################################
#
# NAME
#	get_min_job_age - determines the MinJobAge
#
# SYNOPSIS
#	get_min_job_age
#
# DESCRIPTION
#	Use scontrol to determine the MinJobAge
#
# RETURN VALUE
#	MinJobAge value; an error is logged when the value is 0 or could
#	not be found
#
################################################################
proc get_min_job_age {} {
	global scontrol number

	set age 0
	log_user 0
	spawn $scontrol show config
	expect {
		-re "MinJobAge *= ($number)" {
			set age $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	if {$age == 0} {
		log_error "Could not identify the MinJobAge"
	}

	return $age
}

################################################################
#
# NAME
#	get_default_acct - gets user's default account
#
# SYNOPSIS
#	get_default_acct user
#
# ARGUMENTS
#	user
#		The user name to look up. An empty string or a zero value
#		selects the user returned by get_my_user_name.
#
# RETURN VALUE
#	Returns name of default account if exists, an empty string otherwise
#
################################################################
proc get_default_acct { user } {
	global sacctmgr re_word_str

	log_user 0
	set def_acct ""

	# Callers pass 0 to mean "myself". The value is tested as a string
	# first because applying "!" to a real user name is an expr error.
	if {$user eq "" || ([string is double -strict $user] && $user == 0)} {
		set user [get_my_user_name]
	}

	spawn $sacctmgr -n list -P user $user format="DefaultAccount"
	expect {
		-re "($re_word_str)" {
			set def_acct $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $def_acct
}

################################################################
#
# NAME
#	test_front_end - checks if execution host executes on a front-end node
#
# SYNOPSIS
#	test_front_end
#
# DESCRIPTION
#	Determine if the execution host is one in which the
#	slurmd daemon executes on a front-end node rather than the
#	compute hosts (e.g. Blue Gene systems).
#
# RETURN VALUE
#	Returns 1 if the system uses a front-end, 0 otherwise
#
################################################################
proc test_front_end { } {
	global enable_front_end scontrol

	log_user 0
	set front_end 0
	spawn $scontrol show frontend
	expect {
		"FrontendName=" {
			set front_end 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $front_end
}

################################################################
#
# NAME
#	test_multiple_slurmd - checks if running multiple slurmds per node
#
# SYNOPSIS
#	test_multiple_slurmd
#
# RETURN VALUE
#	Returns 1 if running multiple slurmd per node, 0 otherwise
#
################################################################
proc test_multiple_slurmd { } {
	global scontrol

	log_user 0
	set multiple_slurmd 0
	spawn $scontrol show config
	expect {
		"MULTIPLE_SLURMD" {
			set multiple_slurmd 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $multiple_slurmd
}

################################################################
#
# NAME
#	test_configless_slurmd - checks if configless slurmd enabled
#
# SYNOPSIS
#	test_configless_slurmd
#
# RETURN VALUE
#	Returns 1 if configless slurmd enabled, 0 otherwise
#
################################################################
proc test_configless_slurmd { } {
	global scontrol bin_bash bin_grep

	log_user 0
	set configless 0
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SlurmctldParameters"
	expect {
		"enable_configless" {
			set configless 1
			exp_continue
		}
		timeout {
			# Tcl calls take space separated words, not parentheses
			log_error "timeout checking if configless is set"
		}
		eof {
			wait
		}
	}
	log_user 1

	return $configless
}

################################################################
#
# NAME
#	test_cray - determine if the system is a native cray system
#
# SYNOPSIS
#	test_cray
#
# RETURN VALUE
#	Returns 1 if the system is a cray, 0 otherwise
#
################################################################
proc test_cray { } {
	global scontrol bin_bash bin_grep

	log_user 0
	set cray 0
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SwitchType"
	expect {
		"switch/cray" {
			set cray 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $cray
}

################################################################
#
# NAME
#	test_launch_type - determines launch type plugin
#
# SYNOPSIS
#	test_launch_type
#
# RETURN VALUE
#	Returns the launch plugin type
#
################################################################
proc test_launch_type { } {
	global scontrol bin_bash bin_grep re_word_str

	log_user 0
	set type ""
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep LaunchType"
	expect {
		-re "launch/($re_word_str)" {
			set type $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $type
}

################################################################
#
# NAME
#	test_launch_test_exec - determines if LaunchParameters includes test_exec
#
# SYNOPSIS
#	test_launch_test_exec
#
# RETURN VALUE
#	Returns 1 if test_exec is found in LaunchParameters, 0 otherwise
#
################################################################
proc test_launch_test_exec { } {
	global scontrol bin_bash bin_grep re_word_str

	log_user 0
	set test_exec 0
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep LaunchParameters"
	expect {
		-re "test_exec" {
			set test_exec 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $test_exec
}

################################################################
#
# NAME
#	test_node_features_plugin - determines NodeFeaturesPlugin type
#
# SYNOPSIS
#	test_node_features_plugin
#
# RETURN VALUE
#	Returns the NodeFeaturesPlugin type
#
################################################################

proc test_node_features_plugin { } {
	global scontrol bin_bash bin_grep re_word_str

	set plugin ""
	set cmd "exec $scontrol show config | $bin_grep NodeFeaturesPlugins"

	log_user 0
	spawn -noecho $bin_bash -c $cmd
	expect {
		-re "node_features/($re_word_str)" {
			set plugin $expect_out(1,string)
			exp_continue
		}
		-re "null" {
			exp_continue
		}
		-re "($re_word_str)" {
			set plugin $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $plugin
}

################################################################
#
# NAME
#	test_emulated - determines if the system is emulated
#
# SYNOPSIS
#	test_emulated
#
# DESCRIPTION
#	Determine if the system is emulated (not running on
#	actual Cray or Bluegene hardware)
#
# RETURN VALUE
#	Returns 1 if the system is emulated, 0 otherwise
#
################################################################

proc test_emulated { } {
	global scontrol bin_bash

	set is_emulated 0
	set cmd "exec $scontrol show config"

	log_user 0
	spawn -noecho $bin_bash -c $cmd
	expect {
		"Emulated * = yes" {
			set is_emulated 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $is_emulated
}

################################################################
#
# NAME
#	test_killonbadexit - determines if KillOnBadExit is configured to be 1
#
# SYNOPSIS
#	test_killonbadexit
#
# DESCRIPTION
#	Determine if KillOnBadExit is configured to be 1.
#
# RETURN VALUE
#	Returns 1 if KillOnBadExit is 1.
#
################################################################

proc test_killonbadexit { } {
	global scontrol bin_bash bin_grep

	set kill_enabled 0
	set cmd "exec $scontrol show config | $bin_grep KillOnBadExit"

	log_user 0
	spawn -noecho $bin_bash -c $cmd
	expect {
		-re "KillOnBadExit *= *1" {
			set kill_enabled 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $kill_enabled
}

################################################################
#
# NAME
#	get_cycle_count - get desired iteration count
#
# SYNOPSIS
#	get_cycle_count
#
# DESCRIPTION
#	For tests with iteration counts (e.g. test9.1, test9.2)
#	return the desired iteration count
#
# RETURN VALUE
#	Returns 2 when enable_memory_leak_debug is non-zero, else 100
#
################################################################

proc get_cycle_count { } {
	global enable_memory_leak_debug

	return [expr {$enable_memory_leak_debug != 0 ? 2 : 100}]
}

################################################################
#
# NAME
#	test_select_type - determine which select plugin is being used
#
# SYNOPSIS
#	test_select_type
#
# RETURN VALUE
#	Returns name of select plugin
#
################################################################

proc test_select_type { } {
	global scontrol bin_bash bin_grep re_word_str

	set plugin ""
	set cmd "exec $scontrol show config | $bin_grep SelectType"

	log_user 0
	spawn -noecho $bin_bash -c $cmd
	expect {
		-re "select/($re_word_str)" {
			set plugin $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $plugin
}

################################################################
#
# NAME
#	get_select_type_params - determines SelectTypeParameters being used for a given partition
#
# SYNOPSIS
#	get_select_type_params ?partition?
#
# DESCRIPTION
#	Determine SelectTypeParameters being used for a given partition.
#	If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
#	Returns a string containing SelectTypeParameters. A value set
#	on the partition takes precedence; the cluster-wide value is
#	only used when the partition reports none (or NONE).
#
################################################################

proc get_select_type_params { {partition ""} } {
	global scontrol bin_bash bin_grep re_word_str

	log_user 0
	set params ""

	if {$partition eq ""} {
		set partition [default_partition]
	}

	if {$partition ne ""} {
		spawn -noecho $bin_bash -c "exec $scontrol show part $partition | $bin_grep SelectTypeParameters"
		expect {
			-re "SelectTypeParameters *= *NONE" {
				exp_continue
			}
			-re "SelectTypeParameters *= *($re_word_str)" {
				set params $expect_out(1,string)
				exp_continue
			}
			eof {
				wait
			}
		}
	}

	# Fall back to the global configuration only if the partition did
	# not supply a value. The former test compared the literal word
	# "params" and therefore always overwrote the partition setting.
	if {$params eq ""} {
		spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters"
		expect {
			-re "SelectTypeParameters *= *($re_word_str)" {
				set params $expect_out(1,string)
				exp_continue
			}
			eof {
				wait
			}
		}
	}

	log_user 1

	return $params
}

################################################################
#
# NAME
#	test_linear - determines if system is configured with linear plugin
#
# SYNOPSIS
#	test_linear
#
# DESCRIPTION
#	True for select/linear, and for select/cray_aries when neither
#	other_cons_res nor other_cons_tres is in SelectTypeParameters.
#
# RETURN VALUE
#	Returns 1 if configured, 0 otherwise
#
################################################################

proc test_linear { } {
	log_user 0
	set configured 0

	set select_type [test_select_type]
	if {$select_type eq "linear"} {
		set configured 1
	} elseif {$select_type eq "cray_aries" &&
		  ![test_select_type_params "other_cons_res"] &&
		  ![test_select_type_params "other_cons_tres"]} {
		set configured 1
	}

	# Single exit so that output logging is re-enabled on every path
	log_user 1
	return $configured
}

################################################################
#
# NAME
#	test_cons_res - determines if system is configured with cons_res plugin
#
# SYNOPSIS
#	test_cons_res
#
# DESCRIPTION
#	True for select/cons_res, and for select/cray_aries when
#	other_cons_res is in SelectTypeParameters.
#
# RETURN VALUE
#	Returns 1 if configured, 0 otherwise
#
################################################################

proc test_cons_res { } {
	log_user 0
	set configured 0

	set select_type [test_select_type]
	if {$select_type eq "cons_res"} {
		set configured 1
	} elseif {$select_type eq "cray_aries" &&
		  [test_select_type_params "other_cons_res"]} {
		set configured 1
	}

	# Single exit so that output logging is re-enabled on every path
	log_user 1
	return $configured
}

################################################################
#
# NAME
#	test_cons_tres - determines if system is configured with cons_tres plugin
#
# SYNOPSIS
#	test_cons_tres
#
# DESCRIPTION
#	Determine if system is configured with cons_tres plugin.
#	True for select/cons_tres, and for select/cray_aries when
#	other_cons_tres is in SelectTypeParameters.
#
# RETURN VALUE
#	Returns 1 if configured, 0 otherwise
#
################################################################

proc test_cons_tres { } {
	log_user 0
	set configured 0

	set select_type [test_select_type]
	if {$select_type eq "cons_tres"} {
		set configured 1
	} elseif {$select_type eq "cray_aries" &&
		  [test_select_type_params "other_cons_tres"]} {
		set configured 1
	}

	# Single exit so that output logging is re-enabled on every path
	log_user 1
	return $configured
}

################################################################
#
# NAME
#	get_total_cpus - gets the total amount of CPUs on the default partition
#
# SYNOPSIS
#	get_total_cpus
#
# RETURN VALUE
#	The total amount of CPUs on the default partition.
#
# NOTE
#	CoreSpecCount are not part of the total.
#	Only nodes that sinfo reports as idle are counted. Returns 0
#	when no idle node list could be obtained.
#
################################################################

proc get_total_cpus {} {
	global sinfo scontrol re_word_str exit_code

	set partition [default_partition]
	set cpu_cnt 0
	# Must exist even if sinfo prints nothing or times out
	set def_hostlist ""
	set re_with_cs "CPUTot=(\\d+).*CoreSpecCount=(\\d+).*ThreadsPerCore=(\\d+)"
	set re_without_cs "CPUTot=(\\d+)"

	spawn $sinfo -h -o "%P %N" -p $partition --state=idle
	expect {
		-re "$partition\\* ($re_word_str)" {
			set def_hostlist $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sinfo not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$def_hostlist eq ""} {
		log_warn "No idle nodes found in partition $partition"
		return $cpu_cnt
	}

	set fd [open "|$scontrol --oneliner show node $def_hostlist"]
	while {[gets $fd line] != -1} {
		# Specialized cores are reserved, so remove their threads
		if {[regexp $re_with_cs $line frag tmp_cpu_cnt core_spec_cnt threads_per_core] == 1} {
			set cpu_cnt [expr {$cpu_cnt + $tmp_cpu_cnt - $core_spec_cnt * $threads_per_core}]
			continue
		}
		if {[regexp $re_without_cs $line frag tmp_cpu_cnt] == 1} {
			set cpu_cnt [expr {$cpu_cnt + $tmp_cpu_cnt}]
			continue
		}
	}
	# Release the pipe; a non-zero scontrol exit must not abort the test
	catch {close $fd}

	return $cpu_cnt
}

################################################################
#
# NAME
#	test_scheduler_params - tests SchedulerParameters being used
#
# SYNOPSIS
#	test_scheduler_params type
#
# DESCRIPTION
#	Test SchedulerParameters being used. The comparison is case
#	insensitive and matches on the option name prefix.
#
# RETURN VALUE
#	Returns 1 if "type" (input) is found, 0 otherwise
#
################################################################

proc test_scheduler_params { type } {
	global scontrol bin_bash bin_grep re_word_str

	log_user 0
	set ret 0
	set params ""
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SchedulerParameters"
	expect {
		-re "SchedulerParameters *= *($re_word_str)" {
			set params $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}

	# lsearch is case sensitive, so normalize both sides to upper case
	set type [string toupper $type]
	set params [string toupper $params]
	set params [split $params ,]

	# We have to search with the '*' since some options have an =value
	# on them.
	if { [lsearch $params "$type*"] != -1 } {
		set ret 1
	}

	log_user 1

	return $ret
}

################################################################
#
# NAME
#	test_dependency_params - tests DependencyParameters being used
#
# SYNOPSIS
#	test_dependency_params type
#
# DESCRIPTION
#	Tests DependencyParameters being used. The comparison is case
#	insensitive and matches on the option name prefix.
#
# RETURN VALUE
#	Returns 1 if "type" (input) is found, 0 otherwise
#
################################################################

proc test_dependency_params { type } {
	global bin_bash scontrol bin_grep re_word_str

	log_user 0
	set ret 0
	set params ""
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep DependencyParameters"
	expect {
		-re "DependencyParameters *= *($re_word_str)" {
			set params $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}

	# lsearch is case sensitive, so normalize both sides to upper case
	set type [string toupper $type]
	set params [string toupper $params]
	set params [split $params ,]

	# We have to search with the '*' since some options have an =value
	# on them.
	if { [lsearch $params "$type*"] != -1 } {
		set ret 1
	}

	log_user 1

	return $ret
}

################################################################
#
# NAME
#	test_select_type_params - determines SelectTypeParameters being used
#
# SYNOPSIS
#	test_select_type_params type
#
# RETURN VALUE
#	Returns 1 if "type" (input) is found, 0 otherwise
#
################################################################

proc test_select_type_params { type } {
	global scontrol bin_bash bin_grep re_word_str

	log_user 0
	set ret 0
	set params ""
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters"
	expect {
		-re "SelectTypeParameters *= *($re_word_str)" {
			set params $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}

	# The comparisons below are case sensitive, so normalize both
	# sides to upper case.
	set type [string toupper $type]
	set params [string toupper $params]
	set params [split $params ,]
	foreach item $params {
		# If argument is "MEMORY" then search for "_MEMORY"
		if {$type eq "MEMORY" && [string first "_MEMORY" $item] != -1} {
			set ret 1
			break
		# i.e. Check for CR_CORE_MEMORY or CR_CORE
		} elseif {[string first "MEMORY" $item] != -1 && [string first $type $item] != -1} {
			set ret 1
			break
		} elseif {$type eq $item} {
			set ret 1
			break
		}
	}

	log_user 1

	return $ret
}

################################################################
#
# NAME
#	test_root_user - determines if user is root
#
# SYNOPSIS
#	test_root_user
#
# RETURN VALUE
#	Returns 1 if the uid is 0 (and caches that the user is a
#	super user), 0 otherwise
#
################################################################

proc test_root_user { } {
	global super_user super_user_set

	set uid [get_my_uid]
	if {$uid == 0} {
		set super_user 1
		set super_user_set 1
		return 1
	}
	return 0
}

################################################################
#
# NAME
#	test_super_user - determines if user is a Slurm super user
#
# SYNOPSIS
#	test_super_user
#
# DESCRIPTION
#	Determine if user is a Slurm super user (i.e.
#	user
#	root or configured SlurmUser)
#
################################################################

proc test_super_user { } {
	global re_word_str bin_id number scontrol super_user super_user_set

	# A previous call already settled the question
	if {$super_user_set != 0} {
		return $super_user
	}

	# User root is always a super user
	if {[get_my_uid] == 0} {
		set super_user 1
		set super_user_set 1
		return $super_user
	}

	# Otherwise compare our name with the configured SlurmUser
	set my_name [get_my_user_name]
	set configured_user ""

	log_user 0
	spawn $scontrol show config
	expect {
		-re "SlurmUser *= ($re_word_str)\[\(\]($number)\[\)\]" {
			set configured_user $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}

	if {$my_name eq $configured_user} {
		set super_user 1
	}
	set super_user_set 1
	log_user 1

	return $super_user
}

################################################################
#
# NAME
#	dec2hex - creates a 32 bit hex number from a signed decimal number
#
# SYNOPSIS
#	dec2hex value
#
# DESCRIPTION
#	Create a 32 bit hex number from a signed decimal number.
#	Values outside the signed 32 bit range saturate to
#	"80000000" or "7FFFFFFF".
#
# RETURN VALUE
#	32 bit hex version of input 'value'
#
# SOURCE
#	Courtesy of Chris Cornish
#	http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982
#
################################################################

proc dec2hex {value} {
	# Replace all non-decimal characters
	regsub -all {[^0-x\.-]} $value {} cleaned
	set value [string trim $cleaned]

	if {$value < 2147483647 && $value > -2147483648} {
		# "%#010X" yields "0X" plus 8 digits; drop the prefix
		set with_prefix [format "%#010X" [expr $value]]
		return [string range $with_prefix 2 9]
	}
	if {$value < -2147483647} {
		return "80000000"
	}
	return "7FFFFFFF"
}

################################################################
#
# NAME
#	uint2hex - creates a 32 bit hex number from an unsigned decimal
#
# SYNOPSIS
#	uint2hex value
#
# DESCRIPTION
#	Create a 32 bit hex number from an unsigned decimal number.
#
# ARGUMENTS
#	value
#		unsigned decimal number to convert
#
# RETURN VALUE
#	32 bit hex version of input 'value'; "FFFFFFFF" when the
#	value is outside the unsigned 32 bit range
#
# SOURCE
#	Courtesy of Chris Cornish
#	http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982
#
################################################################

proc uint2hex {value} {
	# Replace all non-decimal characters
	regsub -all {[^0-x\.-]} $value {} cleaned
	set value [string trim $cleaned]

	if {!($value <= 4294967295 && $value >= 0)} {
		return "FFFFFFFF"
	}

	# "%#010X" yields "0X" plus 8 digits; drop the prefix
	set with_prefix [format "%#010X" [expr $value]]
	return [string range $with_prefix 2 9]
}

################################################################
#
# NAME
#	available_nodes - returns number of available nodes
#
# SYNOPSIS
#	available_nodes ?state? ?partition?
#
# DESCRIPTION
#	Check to see if a given partition has a at least "num_nodes" number
#	of nodes in the alloc, idle, or comp state. This can be used to
#	avoid launching a job that will never run because nodes are in the
#	"drained" state or otherwise unavailable.
#	If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
#	Returns the number of available nodes in the partition, or
#	-1 on failure.
#
################################################################

proc available_nodes { {state ""} {partition ""} } {
	global sinfo

	if {$partition eq ""} {
		set partition [default_partition]
	}
	if {$state eq ""} {
		set state "idle,alloc,comp"
	}

	# Only the first line of sinfo output is examined
	set fd [open "|$sinfo --noheader --partition $partition --state $state --format %D"]
	gets $fd line
	catch {close $fd}

	set available -1
	regexp {\d+} $line available

	# Scale the count when the line ends in a K or M suffix
	switch -glob -- $line {
		"*K" {
			set available [expr {$available * 1024}]
		}
		"*M" {
			set available [expr {$available * 1048576}]
		}
	}

	return $available
}

################################################################
#
# NAME
#	partition_oversubscribe - determines the oversubscribe configuration of the specified partition
#
# SYNOPSIS
#	partition_oversubscribe ?partition?
#
# DESCRIPTION
#	Determine the oversubscribe configuration of the specified partition.
#	If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
#	Return the oversubscribe configuration of the specified partition.
#
################################################################

proc partition_oversubscribe { {partition ""} } {
	global sinfo

	if {$partition eq ""} {
		set partition [default_partition]
	}

	log_debug "$sinfo --noheader --partition $partition --format %h"
	set fd [open "|$sinfo --noheader --partition $partition --format %h"]
	gets $fd line
	catch {close $fd}

	# "NO" is reported when no alphabetic word is found in the output
	set setting "NO"
	regexp {[a-zA-Z]+} $line setting

	return $setting
}

################################################################
#
# NAME
#	default_partition - determines the name of the default partition
#
# SYNOPSIS
#	default_partition
#
# DESCRIPTION
#	Use scontrol to determine the name of the default partition
#
# RETURN VALUE
#	Name of the current default partition
#
################################################################

proc default_partition {} {
	global scontrol

	set part_name ""
	set fd [open "|$scontrol --all --oneliner show partition"]
	while {[gets $fd line] >= 0} {
		if {[regexp {^PartitionName=([^ ]*).*Default=YES} $line frag part_name]} {
			break
		}
	}
	catch {close $fd}

	if {$part_name eq ""} {
		log_error "Could not identify the default partition"
	}

	return $part_name
}

################################################################
#
# NAME
#	default_part_exclusive - determines if the default partition allocates whole nodes to jobs
#
# SYNOPSIS
#	default_part_exclusive
#
# DESCRIPTION
#	Use scontrol to determine if the default partition
#	allocates whole nodes to jobs
#
# RETURN VALUE
#	1 if the default partition's oversubscribe setting is
#	EXCLUSIVE, 0 otherwise
#
################################################################

proc default_part_exclusive {} {
	set setting [partition_oversubscribe [default_partition]]
	return [expr {$setting eq "EXCLUSIVE"}]
}

################################################################
#
# NAME
#	switch_type - determines the switch type
#
# SYNOPSIS
#	switch_type
#
# DESCRIPTION
#	Use scontrol
#	to determine the switch type
#
# RETURN VALUE
#	Name of SwitchType
#
################################################################

proc switch_type {} {
	global scontrol

	set plugin ""
	set fd [open "|$scontrol show config"]
	while {[gets $fd line] >= 0} {
		if {[regexp {^SwitchType *= switch/(\w+)} $line frag plugin]} {
			break
		}
	}
	catch {close $fd}

	if {$plugin eq ""} {
		log_error "Could not identify the switch type"
	}

	return $plugin
}

################################################################
#
# NAME
#	make_bash_script - creates a bash script
#
# SYNOPSIS
#	make_bash_script script_name script_contents
#
# DESCRIPTION
#	Create a bash script of name "script_name", and
#	make the body of the script "script_contents".
#	make_bash_script removes the file if it already exists,
#	then generates the #! line, and then dumps "script_contents"
#	to the file. Finally, it makes certain that the script
#	is executable.
#
# ARGUMENTS
#	script_name
#		file name for the bash script
#	script_contents
#		body of the script, not including the initial #! line.
#
# RETURN VALUE
#	Nothing.
#
################################################################

proc make_bash_script { script_name script_contents } {
	global bin_bash bin_chmod

	file delete $script_name
	set fd [open $script_name "w"]
	puts $fd "#!$bin_bash"
	puts $fd $script_contents
	close $fd
	exec $bin_chmod 700 $script_name
}

################################################################
#
# NAME
#	get_suffix - given a hostname, returns its numeric suffix
#
# SYNOPSIS
#	get_suffix hostname
#
# DESCRIPTION
#	Given a hostname, return its numeric suffix with leading
#	zeros removed
#
# RETURN VALUE
#	numerical suffix for input 'hostname' or -1 if not a number
#
################################################################

proc get_suffix { hostname } {
	set host_len [string length $hostname]
	set host_inx [expr {$host_len - 1}]
	set host_char [string index $hostname $host_inx]
	if {[string compare $host_char "0"] < 0 || [string compare $host_char "9"] > 0} {
		return -1
	}

	# Walk backwards to the first character that is not a digit
	for {set host_inx [expr {$host_len - 1}]} {$host_inx >= 0} {incr host_inx -1} {
		set host_char [string index $hostname $host_inx]
		if {[string compare $host_char "0"] < 0} {
			break
		}
		if {[string compare $host_char "9"] > 0} {
			break
		}
	}
	incr host_inx

	if {$host_inx == $host_len} {
		log_warn "Hostname lacks a suffix: $hostname"
		return "-1"
	}

	# Strip off leading zeros to avoid doing octal arithmetic
	# (the last digit is always kept, so "000" yields "0")
	set suffix [string range $hostname $host_inx $host_len]
	set suffix_len [string length $suffix]
	for {set suffix_inx 0} {$suffix_inx < [expr {$suffix_len - 1}]} {incr suffix_inx} {
		set suffix_char [string index $suffix $suffix_inx]
		if {$suffix_char ne "0"} {
			break
		}
	}
	return [string range $suffix $suffix_inx $suffix_len]
}

################################################################
#
# NAME
#	is_super_user - checks if we are user root or SlurmUser
#
# SYNOPSIS
#	is_super_user
#
# DESCRIPTION
#	Check if we are user root or SlurmUser
#
# RETURN VALUE
#	1 if true, 0 if false
#
################################################################

proc is_super_user { } {
	global re_word_str bin_id scontrol

	log_user 0
	set user_name [get_my_user_name]
	if {$user_name eq "root"} {
		log_user 1
		return 1
	}

	set found_user 0
	spawn $scontrol show config
	expect {
		-re "SlurmUser *= $user_name" {
			set found_user 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $found_user
}

################################################################
#
# NAME
#	check_acct_associations - checks associations
#
# SYNOPSIS
#	check_acct_associations
#
# DESCRIPTION
#	Use sacctmgr to check associations: within each cluster no
#	lft/rgt value may appear twice and the values seen must not
#	leave holes between the smallest lft and the largest rgt.
#
# RETURN VALUE
#	0 on any error, 1 otherwise
#
################################################################

proc check_acct_associations { } {
	global sacctmgr number re_word_str exit_code

	set rc 1
	log_user 0

	log_debug "Sanity-Checking Associations"
	#
	# Use sacctmgr to check associations
	#
	set s_pid [spawn $sacctmgr -n -p list assoc wopi wopl withd format=lft,rgt,cluster]
	expect {
		-re "($number)\\|($number)\\|($re_word_str)\\|" {
			# Here we are checking if we have duplicates and
			# setting up an array to check for holes later

			set cluster $expect_out(3,string)
			if { ![info exists c_min($cluster)] } {
				set c_min($cluster) -1
				set c_max($cluster) -1
			}

			set num1 $expect_out(1,string)
			set num2 $expect_out(2,string)

			set first [info exists found($cluster,$num1)]
			set sec [info exists found($cluster,$num2)]

			if { $first } {
				log_error "$cluster found lft $num1 again"
				set rc 0
			} elseif { $sec } {
				log_error "$cluster found rgt $num2 again"
				set rc 0
			} else {
				set found($cluster,$num1) 1
				set found($cluster,$num2) 1
				if { $c_min($cluster) == -1 || $c_min($cluster) > $num1 } {
					set c_min($cluster) $num1
				}
				if { $c_max($cluster) == -1 || $c_max($cluster) < $num2 } {
					set c_max($cluster) $num2
				}
			}
			exp_continue
		}
		timeout {
			log_error "sacctmgr add not responding"
			slow_kill $s_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}

	foreach cluster [array names c_min] {
		# Here we are checking for holes in the list from above
		for {set inx $c_min($cluster)} {$inx < $c_max($cluster)} {incr inx} {
			if { ![info exists found($cluster,$inx)] } {
				log_error "$cluster No index at $inx"
				set rc 0
			}
		}
	}

	log_user 1
	return $rc
}

################################################################
#
# NAME
#	get_job_acct_freq - gets the value of the job account gather frequency
#
# SYNOPSIS
#	get_job_acct_freq
#
# DESCRIPTION
#	Accepts both the plain "N" and the "task=N" forms of
#	JobAcctGatherFrequency.
#
# RETURN VALUE
#	job account gather frequency, 0 if none was reported
#
################################################################

proc get_job_acct_freq { } {
	global scontrol number

	log_user 0
	set freq_val 0
	spawn $scontrol show config
	expect {
		-re "JobAcctGatherFrequency *= ($number)" {
			set freq_val $expect_out(1,string)
			# Keep reading so that eof is reached and the
			# child process is reaped
			exp_continue
		}
		-re "JobAcctGatherFrequency *= task=($number)" {
			set freq_val $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $freq_val
}

################################################################
#
# NAME
#	get_job_acct_type - gets the value of JobAcctGatherType
#
# SYNOPSIS
#	get_job_acct_type
#
# RETURN VALUE
#	JobAcctGatherType value (without the jobacct_gather/ prefix),
#	"none" if nothing was reported
#
################################################################

proc get_job_acct_type { } {
	global scontrol re_word_str

	log_user 0
	set gather_type "none"
	spawn $scontrol show config
	expect {
		-re "JobAcctGatherType *= jobacct_gather/($re_word_str)" {
			set gather_type $expect_out(1,string)
			exp_continue
		}
		-re "JobAcctGatherType *= ($re_word_str)" {
			set gather_type $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $gather_type
}

################################################################
#
# NAME
#	check_accounting_admin_level - gets the admin_level for the current user
#
# SYNOPSIS
#	check_accounting_admin_level
#
# DESCRIPTION
#	Slurm super users are reported as "Administrator" without
#	consulting sacctmgr; everybody else is looked up with sacctmgr.
#
# RETURN VALUE
#	admin_level for the current user, "" if the user name
#	could not be determined
#
################################################################

proc check_accounting_admin_level { } {
	global sacctmgr re_word_str

	set admin_level ""

	if { [test_super_user] } {
		return "Administrator"
	}

	set user_name [get_my_user_name]
	if { ![string length $user_name] } {
		log_error "No name returned from id"
		return ""
	}

	# Output is only silenced once the early returns are behind us;
	# previously they left log_user disabled for the caller.
	log_user 0

	#
	# Use sacctmgr to check admin_level
	#
	set s_pid [spawn $sacctmgr -n -P list user $user_name format=admin]
	expect {
		-re "($re_word_str)" {
			set admin_level $expect_out(1,string)
			exp_continue
		}
		timeout {
			slow_kill $s_pid
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	log_user 1

	return $admin_level
}

################################################################
#
# NAME
#	get_cluster_name - gets the cluster name
#
# SYNOPSIS
#	get_cluster_name
#
# RETURN VALUE
#	name of the cluster
#
################################################################

proc get_cluster_name { } {
	global scontrol re_word_str exit_code

	#
	# Use scontrol to find the cluster name
	#
	log_user 0
	set cluster_name ""
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "ClusterName *= ($re_word_str)" {
			set cluster_name $expect_out(1,string)
			exp_continue
		}
		timeout {
			slow_kill $scon_pid
			fail "scontrol not responding"
		}
		eof {
			wait
		}
	}
	log_user 1

	return $cluster_name
}

################################################################
#
# NAME
#	get_control_machine - gets the ControlMachine parameter
#
# SYNOPSIS
#	get_control_machine
#
# RETURN VALUE
#	ControlMachine value
#
################################################################

proc get_control_machine { } {
	global scontrol re_word_str exit_code

	#
	# Use scontrol to find the ControlMachine
	#
	# We need to handle two possible outputs of SlurmctldHost[0]:
	# a) hostname
	# b) hostname(IP)
	#
	log_user 0
	set control_machine ""
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "SlurmctldHost.0. *= ($re_word_str)" {
			# Index of the character before the last "(", or -2
			# when there is no "(" at all
			set par_idx [expr {[string last "(" $expect_out(1,string)] - 1}]
			if { $par_idx == -2 } {
				set control_machine $expect_out(1,string)
			} else {
				set control_machine [string range $expect_out(1,string) 0 $par_idx]
			}
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $control_machine
}

################################################################
#
# NAME
#	get_node_cnt - determines how many nodes are on the system
#
# SYNOPSIS
#	get_node_cnt
#
# RETURN VALUE
#	Returns count of nodes on system or 0 if unknown
#
################################################################

proc get_node_cnt { } {
	global scontrol exit_code

	log_user 0
	set node_cnt 0
	set scon_pid [spawn -noecho $scontrol show nodes]
	expect {
		-re "NodeName=" {
			incr node_cnt
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $node_cnt
}

################################################################
#
# NAME
#	get_node_cnt_in_part - determines how many nodes are in a given partition
#
# SYNOPSIS
#	get_node_cnt_in_part ?partition?
#
# DESCRIPTION
#	Determine how many nodes are in a given partition.
#	If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
#	Returns count of nodes in a partition or 0 if unknown.
#
################################################################

proc get_node_cnt_in_part { {partition ""} } {
	global scontrol number

	if {$partition eq ""} {
		set partition [default_partition]
	}

	log_user 0
	set node_cnt 0
	set scon_pid [spawn -noecho $scontrol show partition $partition]
	expect {
		-re "not found" {
			log_error "Partition $partition doesn't exist"
		}
		-re "TotalNodes=($number)" {
			set node_cnt $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
		}
		eof {
			# Reap the child like every other helper does
			wait
		}
	}
	log_user 1

	return $node_cnt
}

################################################################
#
# NAME
#	get_idle_node_in_part - gets an idle node in a given partition
#
# SYNOPSIS
#	get_idle_node_in_part ?partition?
#
# DESCRIPTION
#	Get an idle node in a given partition.
#	If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
#	Returns name of node in a partition or "" if unknown.
#
################################################################

proc get_idle_node_in_part { {partition ""} } {
	global scontrol sinfo re_word_str

	if {$partition eq ""} {
		set partition [default_partition]
	}

	log_user 0
	set host_list ""
	spawn $sinfo -oNAME=%N -h -p$partition --state=idle
	expect {
		-re "not found" {
			log_error "Partition $partition doesn't exist"
		}
		-re "NAME=($re_word_str)" {
			set host_list $expect_out(1,string)
		}
		timeout {
			log_error "sinfo not responding"
		}
		eof {
			wait
		}
	}

	# Expand the host list expression and keep the first name printed
	set node_name ""
	spawn $scontrol show hostname $host_list
	expect {
		-re "($re_word_str)" {
			set node_name $expect_out(1,string)
		}
		timeout {
			log_error "scontrol not responding"
		}
		eof {
			wait
		}
	}
	log_user 1

	return $node_name
}

################################################################
#
# NAME
#	get_array_config - determines the MaxArraySize
#
# SYNOPSIS
#	get_array_config
#
# DESCRIPTION
#	Use scontrol to determine the MaxArraySize
#
# RETURN VALUE
#	MaxArraySize value, 1 if it was not reported
#
################################################################

proc get_array_config { } {
	global scontrol number

	log_user 0
	set array_size 1
	spawn $scontrol show config
	expect {
		-re "MaxArraySize *= ($number)" {
			set array_size $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $array_size
}

################################################################
#
# NAME
#	get_max_tasks - determines the MaxTasksPerNode
#
# SYNOPSIS
#	get_max_tasks
#
# DESCRIPTION
#	Use scontrol to determine the MaxTasksPerNode
#
# RETURN VALUE
#	MaxTasksPerNode value, 1 if it was not reported
#
################################################################

proc get_max_tasks { } {
	global scontrol number

	log_user 0
	set max_tasks 1
	spawn $scontrol show config
	expect {
		-re "MaxTasksPerNode *= ($number)" {
			set max_tasks $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $max_tasks
}

#################################################
#
# NAME
#	scale_to_megs - scales the value by the factor T|G|M to megabytes
#
# SYNOPSIS
#	scale_to_megs value factor
#
# DESCRIPTION
#	scale the value by the factor T|G|M to megabytes.
#	A factor of K divides by 1024; any other factor treats the
#	value as bytes.
#
# RETURN VALUE
#	the scaled variable
#
#################################################

proc scale_to_megs { value factor } {
	switch -exact -- $factor {
		"T" {
			set value [expr {$value * 1024 * 1024}]
		}
		"G" {
			set value [expr {$value * 1024}]
		}
		"M" {
			set value [expr {$value * 1}]
		}
		"K" {
			set value [expr {$value / 1024}]
			set value [expr {round($value)}]
		}
		default {
			set value [expr {$value / (1024 * 1024)}]
			set value [expr {round($value)}]
		}
	}

	return $value
}

#################################################
#
# NAME
#	scale_to_ks - scales the value by the factor G|M|K to kilobytes
#
# SYNOPSIS
#	scale_to_ks value factor
#
# DESCRIPTION
#	scale the value by the factor G|M|K to kilobytes.
#	Any other factor treats the value as bytes.
#
# RETURN VALUE
#	the scaled variable
#
#################################################

proc scale_to_ks { value factor } {
	switch -exact -- $factor {
		"G" {
			set value [expr {$value * 1024 * 1024}]
		}
		"M" {
			set value [expr {$value * 1024}]
		}
		"K" {
			set value [expr {$value * 1}]
		}
		default {
			set value [expr {$value / 1024}]
			set value [expr {round($value)}]
		}
	}

	return $value
}

############################################################
#
# NAME
#	check_node_mem - checks that the nodes have memory configured
#
# SYNOPSIS
#	check_node_mem
#
# DESCRIPTION
#	Looks at the RealMemory value of the last node printed by
#	"scontrol show node"; a value of 1 is treated as "no memory
#	configured".
#
# RETURN VALUE
#	1 if the nodes have memory, 0 otherwise
#
############################################################

proc check_node_mem { } {
	global scontrol number

	log_user 0
	set mem_size 0
	spawn $scontrol show node
	expect {
		-re "RealMemory=($number)" {
			set mem_size $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	# Must happen before returning; it used to sit after the return
	# statements and so never ran.
	log_user 1

	if {$mem_size == 1} {
		return 0
	}
	return 1
}

################################################################
#
# NAME
#	get_fs_damping_factor - gets FairShareDampeningFactor configuration parameter
#
# SYNOPSIS
#	get_fs_damping_factor
#
# DESCRIPTION
#	get FairShareDampeningFactor configuration parameter
#
# RETURN VALUE
#	Returns FairShareDampeningFactor, 1 if it was not reported
#
################################################################

proc get_fs_damping_factor { } {
	global scontrol number exit_code

	log_user 0
	set damp 1
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "FairShareDampeningFactor *= ($number)" {
			set damp $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $damp
}

################################################################
#
# NAME
#	slurmctld_plug_stack_nonstop - verifies that the SlurmctldPlugstack is set to nonstop
#
# SYNOPSIS
#	slurmctld_plug_stack_nonstop
#
# DESCRIPTION
#	Use scontrol to determine that the SlurmctldPlugstack is set to nonstop.
#
# RETURN VALUE
#	1 if the value is set to nonstop.
#
################################################################
proc slurmctld_plug_stack_nonstop { } {
	global scontrol re_word_str exit_code

	log_user 0
	set nonstop_enforce 0
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "SlurmctldPlugstack *= ($re_word_str)" {
			# "string first" takes the needle first and the haystack
			# second: look for "nonstop" inside the configured value.
			if {[string first "nonstop" $expect_out(1,string)] != -1} {
				set nonstop_enforce 1
			}
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $nonstop_enforce
}

################################################################
#
# NAME
#	job_submit_all_partitions - determines if the JobSubmitPlugins includes "all_partitions"
#
# SYNOPSIS
#	job_submit_all_partitions
#
# DESCRIPTION
#	Use scontrol to determine if the JobSubmitPlugins
#	includes "all_partitions".
#
# RETURN VALUE
#	1 if JobSubmitPlugins includes "all_partitions", 0 otherwise.
#
# ENVIRONMENT
#	Sets exit_code to 1 if scontrol does not respond.
#
################################################################
proc job_submit_all_partitions { } {
	global scontrol re_word_str exit_code

	log_user 0
	set all_partitions 0
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "JobSubmitPlugins *= ($re_word_str)" {
			# Needle first, haystack second: the configured value may
			# list several plugins, "all_partitions" being one of them.
			if {[string first "all_partitions" $expect_out(1,string)] != -1} {
				set all_partitions 1
			}
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $all_partitions
}

################################################################
#
# NAME
#	wait_for_node - waits for nodes in a partition to reach a certain state
#
# SYNOPSIS
#	wait_for_node state num_nodes partition
#
# DESCRIPTION
#	Wait for a certain number of nodes in a partition to
#	reach a certain state. Polls every $testsuite_poll_interval
#	seconds. If the partition argument is empty, the default
#	partition will be used.
#
# RETURN VALUE
#	Returns: 1 on failure.
#
################################################################
proc wait_for_node {state num_nodes {partition ""} } {
	global sinfo number testsuite_poll_interval

	set wait_time 0
	set done 0
	set cnt 0
	set rt 0

	if {[string length $partition] == 0} {
		set partition [default_partition]
	}

	# Poll sinfo at most 3 times, sleeping between unsuccessful polls.
	while {$done != 1 && $wait_time < 3} {
		# Start every poll from zero so that a poll producing no count
		# does not report the value of the previous one.
		set cnt 0
		log_user 0
		spawn $sinfo --noheader --partition $partition --state $state --format %D
		expect {
			-re "($number)" {
				set cnt $expect_out(1,string)
				exp_continue
			}
			timeout {
				log_error "sinfo is not responding"
				set rt 1
			}
			eof {
				wait
			}
		}
		log_user 1

		if {$num_nodes <= $cnt} {
			set done 1
		} else {
			# Report the state actually being waited for
			log_debug "Partition $partition has $cnt nodes in state $state and we want $num_nodes"
			sleep $testsuite_poll_interval
			incr wait_time 1
		}
	}

	if {$done != 1} {
		set rt 1
	}

	return $rt
}

#####################################################################
#
# NAME
#	test_preempttype_part - determines if preempt mode partition_prio is configured
#
# SYNOPSIS
#	test_preempttype_part
#
# RETURN VALUE
#	1 if PreemptType is preempt/partition_prio, 0 if none
#
#####################################################################
proc test_preempttype_part { } {
	global scontrol

	log_user 0
	set part_prio 0
	spawn $scontrol show config
	expect {
		-re "PreemptType *= preempt/partition_prio" {
			set part_prio 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $part_prio
}

#####################################################################
#
# NAME
#	test_preempttype_qos - determines if preempt mode qos is configured
#
# SYNOPSIS
#	test_preempttype_qos
#
# RETURN VALUE
#	1 if PreemptType is preempt/qos, 0 if none
#
#####################################################################
proc test_preempttype_qos { } {
	global scontrol

	log_user 0
	set qos 0
	spawn $scontrol show config
	expect {
		-re "PreemptType *= preempt/qos" {
			set qos 1
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $qos
}

#####################################################################
#
# NAME
#	test_proctrack - determines the ProctrackType
#
# SYNOPSIS
#	test_proctrack
#
# RETURN VALUE
#	the proctrack type
#
#####################################################################
proc test_proctrack { } {
	global scontrol re_word_str

	log_user 0
	set proctype ""
	spawn $scontrol show config
	expect {
		-re "ProctrackType *=* proctrack/($re_word_str)" {
			set proctype $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $proctype
}

#####################################################################
#
# NAME
#	get_srun_ports - determines the SrunPortRange
#
# SYNOPSIS
#	get_srun_ports
#
# RETURN VALUE
#	the SrunPortRange (0 if the parameter is not reported)
#
#####################################################################
proc get_srun_ports { } {
	global scontrol re_word_str bin_grep bin_bash number

	log_user 0
	set ports 0
	spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SrunPortRange"
	expect {
		-re "SrunPortRange *=* ($re_word_str)" {
			set ports $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $ports
}

#####################################################################
#
# NAME
#	available_nodes_hostnames - gets all available nodes in the system
#
# SYNOPSIS
#	available_nodes_hostnames partition ?also_power_save?
#
# ARGUMENTS
#
#	partition
#		to only return nodes of a specific partition
#	also_power_save
#		to include POWERING_DOWN and POWER_DOWN nodes
#
# RETURN VALUE
#	idle nodes, and also_power_save nodes if specified
#
#####################################################################
proc available_nodes_hostnames { partition {also_power_save false} } {
	global sinfo re_word_str exit_code

	log_user 0
	set idle_nodelist ""
	set sep ""

	if {$also_power_save} {
		set avail_states "idle,power_down,powering_down"
	} else {
		set avail_states "idle"
	}

	if {[string compare $partition ""] == 0} {
		spawn $sinfo -t$avail_states -h -o%N
	} else {
		spawn $sinfo -t$avail_states -h -o%N -p$partition
	}
	expect {
		-re "($re_word_str)" {
			# Join every reported node expression with commas
			append idle_nodelist $sep
			append idle_nodelist $expect_out(1,string)
			set sep ","
			exp_continue
		}
		timeout {
			log_error "sinfo is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $idle_nodelist
}

#####################################################################
#
# NAME
#	test_accting_steps - determines if nosteps or nojobs is set for AccountingStorageEnforce
#
# SYNOPSIS
#	test_accting_steps
#
# DESCRIPTION
#	Determine if nosteps or nojobs is set for AccountingStorageEnforce
#
# RETURN VALUE
#	0 if nosteps or nojobs is set, 1 otherwise
#
#####################################################################
proc test_accting_steps { } {
	global scontrol re_word_str

	log_user 0
	set enforce_limits 1
	spawn $scontrol show config
	expect {
		-re "AccountingStorageEnforce *= ($re_word_str)" {
			if {[string first "nosteps" $expect_out(1,string)] != -1 } {
				set enforce_limits 0
			}
			if {[string first "nojobs" $expect_out(1,string)] != -1 } {
				set enforce_limits 0
			}
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $enforce_limits
}

# Print the current line number in the script. Calling like this
#	[get_curr_line_num [info frame]]
# will return the current line number
proc get_curr_line_num {frame_info} {
	# Getting value of the key 'line' from the dictionary
	# returned by 'info frame'
	set result [dict get [info frame $frame_info] line]
}

#####################################################################
#
# NAME
#	get_partition_nodes - gets the list of node names in a given partition/states
#
# SYNOPSIS
#	get_partition_nodes partition states
#
# DESCRIPTION
#	Get the list of node names in a given partition/states
#
# ARGUMENTS
#	partition
#		partition to get nodes of (default partition if empty)
#	states
#		states to filter on nodes (no filter if empty)
#
# RETURN VALUE
#	node names list (whatever was collected so far if sinfo
#	stops responding)
#
# ENVIRONMENT
#	Sets exit_code to 1 if sinfo does not respond.
#
#####################################################################
proc get_partition_nodes {partition states} {
	# exit_code must be declared global, otherwise the timeout branch
	# only sets a local variable that is discarded on return.
	global sinfo re_word_str exit_code

	log_user 0
	set node_list ""
	if {[string length $partition] == 0} {
		set partition [default_partition]
	}

	if {[string length $states] == 0} {
		set sinfo_pid [spawn -noecho $sinfo -h -N -p $partition -o %N -e]
	} else {
		set sinfo_pid [spawn -noecho $sinfo -h -N -p $partition -o %N -t $states -e]
	}
	expect {
		-re "($re_word_str)" {
			lappend node_list $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sinfo not responding"
			slow_kill $sinfo_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $node_list
}

#####################################################################
#
# NAME
#	set_partition_maximum_time_limit - sets the maximum time limit in a given partition
#
# SYNOPSIS
#	set_partition_maximum_time_limit partition limit
#
# ARGUMENTS
#	partition
#		partition to update (default partition if empty)
#	limit
#		value passed to scontrol as MaxTime, -1 meaning unlimited
#
# RETURN VALUE
#	0 if succeeded, 1 if error
#
#####################################################################
proc set_partition_maximum_time_limit {partition limit} {
	global scontrol exit_code

	if {[string length $partition] == 0} {
		set partition [default_partition]
		if { $partition == "" } {
			return 1
		}
	}

	if { $limit < -1 } {
		log_error "Trying to set invalid partition time limit of $limit"
		return 1
	}

	# Apply the requested limit (this used to be hard-coded to -1)
	spawn $scontrol update partitionname=$partition MaxTime=$limit
	expect {
		timeout {
			log_error "scontrol not responding"
			return 1
		}
		eof {
			wait
		}
	}

	# NOTE(review): get_partition_maximum_time_limit reports seconds;
	# confirm callers pass a limit whose read-back value compares equal
	# (it does for -1).
	set maxtime [get_partition_maximum_time_limit $partition]
	if { $maxtime != $limit } {
		log_error "Unable to update partition MaxTime, got $maxtime, wanted $limit"
		return 1
	}

	if { $exit_code != 0 } {
		log_error "set_partition_maximum_time_limit: Unexpected error."
		return 1
	}

	return 0
}

#####################################################################
#
# NAME
#	_time_fields_to_secs - converts days/hours/minutes/seconds fields to seconds
#
# SYNOPSIS
#	_time_fields_to_secs days hours mins secs
#
# DESCRIPTION
#	Private helper of get_partition_maximum_time_limit and
#	get_partition_default_time_limit. Every field is read as a
#	decimal number, so zero-padded fields such as "08" are not
#	rejected as invalid octal numbers by expr.
#
# RETURN VALUE
#	total number of seconds
#
#####################################################################
proc _time_fields_to_secs { days hours mins secs } {
	set total 0
	foreach field [list $days $hours $mins $secs] mult {86400 3600 60 1} {
		set val 0
		scan $field %d val
		incr total [expr {$val * $mult}]
	}
	return $total
}

#####################################################################
#
# NAME
#	get_partition_maximum_time_limit - gets the maximum time limit in a given partition
#
# SYNOPSIS
#	get_partition_maximum_time_limit partition
#
# DESCRIPTION
#	Get the maximum time limit in a given partition
#	(default partition if the argument is empty)
#
# RETURN VALUE
#	time limit in seconds, -1 if sinfo reports infinite or n/a,
#	0 if no time was reported
#
# ENVIRONMENT
#	Sets exit_code to 1 if sinfo does not respond.
#
#####################################################################
proc get_partition_maximum_time_limit {partition} {
	global sinfo number exit_code

	if {[string length $partition] == 0} {
		set partition [default_partition]
	}

	set secs 0
	log_user 0
	set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O time -e]
	expect {
		-re "infinite" {
			set secs -1
			exp_continue
		}
		-re "n/a" {
			set secs -1
			exp_continue
		}
		-re "($number)-($number):($number):($number)" {
			# days-hours:minutes:seconds
			set secs [_time_fields_to_secs $expect_out(1,string) $expect_out(2,string) $expect_out(3,string) $expect_out(4,string)]
			exp_continue
		}
		-re "($number):($number):($number)" {
			# hours:minutes:seconds
			set secs [_time_fields_to_secs 0 $expect_out(1,string) $expect_out(2,string) $expect_out(3,string)]
			exp_continue
		}
		-re "($number):($number)" {
			# minutes:seconds
			set secs [_time_fields_to_secs 0 0 $expect_out(1,string) $expect_out(2,string)]
			exp_continue
		}
		-re "($number)" {
			# minutes
			set secs [_time_fields_to_secs 0 0 $expect_out(1,string) 0]
			exp_continue
		}
		timeout {
			log_error "sinfo not responding"
			slow_kill $sinfo_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $secs
}

################################################################
#
# NAME
#	get_partition_default_time_limit - gets the default time limit in a given partition
#
# SYNOPSIS
#	get_partition_default_time_limit ?partition?
#
# DESCRIPTION
#	Get the default time limit in a given partition.
#	If the partition is not specified, the default partition will be used.
#
# RETURN VALUE
#	Returns: time limit in seconds, -1 if sinfo reports infinite
#	or n/a, 0 if no time was reported.
#
# ENVIRONMENT
#	Sets exit_code to 1 if sinfo does not respond.
#
################################################################
proc get_partition_default_time_limit { {partition ""} } {
	global sinfo number exit_code

	if {[string length $partition] == 0} {
		set partition [default_partition]
	}

	set secs 0
	log_user 0
	set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O defaulttime -e]
	expect {
		-re "infinite" {
			set secs -1
			exp_continue
		}
		-re "n/a" {
			set secs -1
			exp_continue
		}
		-re "($number)-($number):($number):($number)" {
			# days-hours:minutes:seconds
			set secs [_time_fields_to_secs $expect_out(1,string) $expect_out(2,string) $expect_out(3,string) $expect_out(4,string)]
			exp_continue
		}
		-re "($number):($number):($number)" {
			# hours:minutes:seconds
			set secs [_time_fields_to_secs 0 $expect_out(1,string) $expect_out(2,string) $expect_out(3,string)]
			exp_continue
		}
		-re "($number):($number)" {
			# minutes:seconds
			set secs [_time_fields_to_secs 0 0 $expect_out(1,string) $expect_out(2,string)]
			exp_continue
		}
		-re "($number)" {
			# minutes
			set secs [_time_fields_to_secs 0 0 $expect_out(1,string) 0]
			exp_continue
		}
		timeout {
			log_error "sinfo not responding"
			slow_kill $sinfo_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $secs
}

#####################################################################
#
# NAME
#	get_node_cores - given a node, returns its total number of cores
#
# SYNOPSIS
#	get_node_cores node
#
# DESCRIPTION
#	Given a node, return its total number of cores
#	(not the CoresPerSocket, but the total cores)
#
# RETURN VALUE
#	-1 if the node name is empty, otherwise the product of the
#	sockets and cores-per-socket values reported by sinfo
#	(0 if they could not be read)
#
# ENVIRONMENT
#	Sets exit_code to 1 if sinfo does not respond.
#
#####################################################################
proc get_node_cores {node} {
	# exit_code must be declared global, otherwise the timeout branch
	# only sets a local variable that is discarded on return.
	global sinfo number exit_code

	set cores -1
	set sockets_per_node 0
	set cores_per_socket 0

	if {[string length $node] == 0} {
		return $cores
	}

	log_user 0
	set sinfo_pid [spawn -noecho $sinfo -o "%X %Y" -h -n $node]
	expect {
		-re "($number)" {
			# First number is sockets per node (%X), the following
			# one is cores per socket (%Y)
			if {$sockets_per_node == 0} {
				set sockets_per_node $expect_out(1,string)
			} else {
				set cores_per_socket $expect_out(1,string)
			}
			exp_continue
		}
		timeout {
			log_error "sinfo not responding"
			slow_kill $sinfo_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	set cores [expr {$sockets_per_node * $cores_per_socket}]

	return $cores
}

#####################################################################
#
# NAME
#	get_node_cpus - given a node, returns its total number of threads we account for
#
# SYNOPSIS
#	get_node_cpus node
#
# DESCRIPTION
#	Given a node, return its total number of threads we account for.
#	(not always ThreadsPerCore, but how many threads are in use.
#	i.e. CPUs=6 CoresPerSocket=6 ThreadsPerCore=2 Socket=1 would
#	result in only 1 thread we care about instead of the 2 listed.)
#
# RETURN VALUE
#	list of node [ tot_cpus threads ] if retrieved, [ -1 -1 ] otherwise
#
# ENVIRONMENT
#	Sets exit_code to 1 if scontrol does not respond.
#
#####################################################################
proc get_node_cpus {node} {
	# exit_code must be declared global, otherwise the timeout branch
	# only sets a local variable that is discarded on return.
	global scontrol number exit_code

	set nthreads -1
	set nsockets 0
	set ncores 0
	set totcpus -1

	if {[string length $node] == 0} {
		return [list $totcpus $nthreads]
	}

	# Get the number of CPUs on a node
	set my_pid [spawn $scontrol show node $node]
	expect {
		-re "CoresPerSocket=($number)" {
			set ncores $expect_out(1,string)
			exp_continue
		}
		-re "CPUTot=($number)" {
			set totcpus $expect_out(1,string)
			exp_continue
		}
		-re "Sockets=($number)" {
			set nsockets $expect_out(1,string)
			exp_continue
		}
		-re "ThreadsPerCore=($number)" {
			set nthreads $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			# Kill the process spawned above (the pid is in my_pid)
			slow_kill $my_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$totcpus != $nthreads && $totcpus == $ncores} {
		log_info "Cores rather than threads are being allocated"
		set nthreads 1
	}

	return [list $totcpus $nthreads]
}

#####################################################################
#
# NAME
#	get_part_total_cores - given a partition and/or states, return its total cores
#
# SYNOPSIS
#	get_part_total_cores partition states
#
# DESCRIPTION
#	Given a partition and/or states, return its total cores
#
# ARGUMENTS
#	partition
#		partition to check cores (default partition if empty)
#	states
#		states to filter on partition cores (no filter if empty)
#
# RETURN VALUE
#	partition cores
#
# ENVIRONMENT
#	Sets exit_code to 1 if sinfo does not respond.
#
#####################################################################
proc get_part_total_cores {part states} {
	# exit_code must be declared global, otherwise the timeout branch
	# only sets a local variable that is discarded on return.
	global sinfo number exit_code

	log_user 0
	set cores 0
	set tmp 0
	set i 0

	if {[string length $part] == 0} {
		set part [default_partition]
	}

	if {[string length $states] == 0} {
		set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -o "%X %Y"]
	} else {
		set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -t $states -o "%X %Y"]
	}
	expect {
		-re "($number)" {
			# Numbers come in pairs: sockets (%X) then cores per
			# socket (%Y). Accumulate their product per pair.
			if {($i % 2) == 0} {
				set tmp $expect_out(1,string)
			} else {
				set tmp [expr {$tmp * $expect_out(1,string)}]
				set cores [expr {$cores + $tmp}]
			}
			incr i
			exp_continue
		}
		timeout {
			log_error "sinfo not responding"
			slow_kill $sinfo_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $cores
}

#####################################################################
#
# NAME
#	check_hosts_contiguous - verify if all hosts belong to the partition and are contiguous
#
# SYNOPSIS
#	check_hosts_contiguous check_hosts_list partition
#
# DESCRIPTION
#	Given a partition and a list of hosts, verify if all
#	hosts belong to the partition and are contiguous.
#	If the partition argument is empty, the default partition
#	will be used.
#
# RETURN VALUE
#	Returns: 0 on SUCCESS, 1 otherwise.
#
#####################################################################
proc check_hosts_contiguous { check_hosts_list {partition ""} } {
	global sinfo re_word_str

	if {[string length $partition] == 0} {
		set partition [default_partition]
	}

	set part_hosts_list {}
	log_user 0
	set sinfo_pid [spawn $sinfo --noheader -p $partition -N -o %N]
	expect {
		-re "($re_word_str)" {
			lappend part_hosts_list $expect_out(1,string)
			exp_continue
		}
		-re "Unable to contact" {
			log_user 1
			log_error "Slurm appears to be down"
			return 1
		}
		timeout {
			log_user 1
			log_error "sinfo not responding"
			slow_kill $sinfo_pid
			return 1
		}
		eof {
			wait
		}
	}
	log_user 1

	# Every host must directly follow the previous one in the
	# partition's host list
	foreach host $check_hosts_list {
		set idx_cur [lsearch $part_hosts_list $host]
		if {$idx_cur == -1} {
			log_error "$host not found in list of hosts from partition $partition"
			return 1
		}
		if {[info exists idx_old]} {
			if {$idx_cur != [expr {$idx_old + 1}]} {
				log_error "Node sequence number not contiguous"
				return 1
			}
		}
		set idx_old $idx_cur
	}

	return 0
}

#####################################################################
#
# NAME
#	stop_root_user - detect, warn, and stop root user
#
# SYNOPSIS
#	stop_root_user
#
# DESCRIPTION
#	Detect, warn, and stop root user
#
# RETURN VALUE
#	uid if not root user, exits otherwise
#
#####################################################################
proc stop_root_user {} {
	global bin_id number

	set uid [get_my_uid]
	if {$uid == -1} {
		fail "Can't get my uid"
	} elseif {$uid == 0} {
		skip "Can't run this test as user root"
	}

	return $uid
}

################################################################
#
# NAME
#	get_requeue_exit - gets RequeueExit configuration parameter
#
# SYNOPSIS
#	get_requeue_exit
#
# RETURN VALUE
#	Returns RequeueExit number
#
################################################################
proc get_requeue_exit { } {
	global scontrol number exit_code

	log_user 0
	set re 0
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "RequeueExit *= ($number)" {
			set re $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $re
}

################################################################
#
# NAME
#	get_requeue_exit_hold - gets RequeueExitHold configuration parameter
#
# SYNOPSIS
#	get_requeue_exit_hold
#
# RETURN VALUE
#	Returns RequeueExitHold number
#
################################################################
proc get_requeue_exit_hold { } {
	global scontrol number exit_code

	log_user 0
	set re 0
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "RequeueExitHold *= ($number)" {
			set re $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $re
}

################################################################
#
# NAME
#	get_prolog - gets Prolog configuration parameter
#
# SYNOPSIS
#	get_prolog
#
# RETURN VALUE
#	Returns Prolog parameter (0 if it is not reported)
#
# ENVIRONMENT
#	Sets exit_code to 1 if scontrol does not respond.
#
################################################################
proc get_prolog { } {
	global scontrol re_word_str exit_code

	log_user 0
	set prolog 0
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		# In expect, "^" anchors to the start of the buffered data, not
		# to the start of a line, so a line break is required instead.
		# This still keeps "TaskProlog = ..." and similar from matching.
		-re "\[\r\n\]Prolog *= ($re_word_str)" {
			# Store the value in the variable that is returned
			set prolog $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			slow_kill $scon_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	return $prolog
}

################################################################
#
# NAME
#	expect_extern_step - determines if an "extern" step is launched for jobs
#
# SYNOPSIS
#	expect_extern_step
#
# RETURN VALUE
#	1 if PrologFlags includes Contain, 0 otherwise
#
################################################################
proc expect_extern_step { } {
	global scontrol

	# If PrologFlags=contain is in slurm.conf, then an "extern" step will be
	# launched on each node, so we need to check for 3 steps per
	# job instead of 2.
	log_user 0
	set extern_step 0
	set scon_pid [spawn -noecho $scontrol show config]
	expect {
		-re "PrologFlags\\s*=\\s*\[A-z/,\]*Contain" {
			set extern_step 1
		}
		timeout {
			fail "scontrol show config not responding"
		}
		eof {
			wait
		}
	}
	log_user 1

	return $extern_step
}

################################################################
#
# NAME
#	get_het_job_ids - gets list of component job ids for a het job
#
# SYNOPSIS
#	get_het_job_ids job_id ?use_offset?
#
# DESCRIPTION
#	Gets list of component job ids for a het job.
#
# ARGUMENTS
#	job_id
#		Slurm job id
#	use_offset
#		If zero, returns list of integer job ids, else returns ids in
#		the form of X+Y where X is het job master id and Y is the
#		offset.
#
# RETURN VALUE
#	A list of ids for a hetjob or an empty list if jobid
#	is not a het one.
#
################################################################
proc get_het_job_ids { jobid {use_offset 0}} {
	global scontrol number

	set id_list ""
	# Remember the caller's log_user setting so it can be restored
	set log_user_save [log_user -info]
	log_user 0
	spawn $scontrol show job $jobid
	expect {
		-re "JobId=($number) HetJobId=($number) HetJobOffset=($number)" {
			if { $use_offset } {
				lappend id_list "$expect_out(2,string)+$expect_out(3,string)"
			} else {
				lappend id_list $expect_out(1,string)
			}
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			set id_list ""
		}
		eof {
			wait
		}
	}
	log_user $log_user_save

	return $id_list
}

################################################################
#
# NAME
#	test_hetjob_step - tests if steps can span multiple heterogeneous job allocations
#
# SYNOPSIS
#	test_hetjob_step
#
# RETURN VALUE
#	Returns 1 if steps can span multiple heterogeneous job components,
#	0 otherwise
#
# ENVIRONMENT
#	Sets exit_code to 1 if scontrol does not respond.
#
################################################################
proc test_hetjob_step { } {
	global scontrol number exit_code

	log_user 0
	set hetjob_step 0
	# Initialise the variables that are tested below, so that a missing
	# or unparsable version string yields 0 instead of a Tcl error.
	set version_major 0
	set version_minor 0
	spawn $scontrol -V
	expect {
		-re " ($number).($number).($number)" {
			set version_major $expect_out(1,string)
			set version_minor $expect_out(2,string)
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$version_major >= 19} {
		set hetjob_step 1
	} elseif {$version_major >= 18} {
		# Supported unless select/cray is configured
		set hetjob_step 1
		spawn $scontrol show config
		expect {
			-re "select/cray" {
				set hetjob_step 0
				exp_continue
			}
			timeout {
				log_error "scontrol not responding"
				set exit_code 1
			}
			eof {
				wait
			}
		}
	} elseif {$version_major == 17 && $version_minor == 11} {
		# Only supported when explicitly enabled in the configuration
		spawn $scontrol show config
		expect {
			-re "select/cray" {
				set hetjob_step 0
				exp_continue
			}
			-re "disable_hetjob_steps" {
				set hetjob_step 0
				exp_continue
			}
			-re "enable_hetjob_steps" {
				set hetjob_step 1
				exp_continue
			}
			timeout {
				log_error "scontrol not responding"
				set exit_code 1
			}
			eof {
				wait
			}
		}
	}
	log_user 1

	return $hetjob_step
}

################################################################
#
# NAME
#	reconfigure - calls scontrol reconfigure
#
# SYNOPSIS
#	reconfigure ?cluster?
#
# DESCRIPTION
#	Calls scontrol reconfigure.
#
# ARGUMENTS
#	cluster
#		The cluster to reconfigure
#
# RETURN VALUE
#	void
#
# ENVIRONMENT
#	Sets exit_code to 1 on failure.
#
################################################################
proc reconfigure { {cluster ""} } {
	global exit_code scontrol timeout

	#
	# Increase timeout just in case we're running under valgrind
	#
	set save_timeout $timeout
	set timeout 20

	if { $cluster == "" } {
		spawn $scontrol reconfigure
	} else {
		spawn $scontrol -M$cluster reconfigure
	}
	expect {
		-re "slurm_reconfigure error: Invalid user id" {
			log_error "Invalid user id"
			set exit_code 1
			exp_continue
		}
		-re "Error|error" {
			log_error "scontrol reconfigure error"
			set exit_code 1
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	#
	# Wait for reconfigure to complete, then reset timeout and return.
	#
	sleep 5
	set timeout $save_timeout
}

#####################################################################
#
# NAME
#	log_fatal - prints a fatal message
#
# SYNOPSIS
#	log_fatal message
#
# SEE ALSO
#	_log_format for options governing the message format and colorization
#
#####################################################################
proc log_fatal {message} {
	global testsuite_log_level LOG_LEVEL_FATAL

	if {$testsuite_log_level >= $LOG_LEVEL_FATAL} {
		_log_format "fatal" "$message"
	}
}

#####################################################################
#
# NAME
#	log_error - prints an error message
#
# SYNOPSIS
#	log_error message
#
# SEE ALSO
#	_log_format for options governing the message format and colorization
#
#####################################################################
proc log_error {message} {
	global testsuite_log_level LOG_LEVEL_ERROR

	if {$testsuite_log_level >= $LOG_LEVEL_ERROR} {
		_log_format "error" "$message"
	}
}

#####################################################################
#
# NAME
#	log_warn - prints a warning message
#
# SYNOPSIS
#	log_warn message
#
# SEE ALSO
#	_log_format for options governing the message format and colorization
#
#####################################################################
proc log_warn {message} {
	global testsuite_log_level LOG_LEVEL_WARNING

	if {$testsuite_log_level >= $LOG_LEVEL_WARNING} {
		_log_format "warning" "$message"
	}
}

#####################################################################
#
# NAME
#	log_info - prints an information message
#
# SYNOPSIS
#	log_info message
#
# SEE ALSO
#	_log_format for options governing the message format and colorization
#
#####################################################################
proc log_info {message} {
	global testsuite_log_level LOG_LEVEL_INFO

	if {$testsuite_log_level >= $LOG_LEVEL_INFO} {
		_log_format "info" "$message"
	}
}

#####################################################################
#
# NAME
#	log_debug - prints a debug level message
#
# SYNOPSIS
#	log_debug message
#
# SEE ALSO
#	_log_format for options governing the message format and colorization
#
#####################################################################
proc log_debug {message} {
	global testsuite_log_level LOG_LEVEL_DEBUG

	if {$testsuite_log_level >= $LOG_LEVEL_DEBUG} {
		_log_format "debug" "$message"
	}
}

#####################################################################
#
# NAME
#	log_trace - prints a trace level message
#
# SYNOPSIS
#	log_trace message
#
# SEE ALSO
#	_log_format for options governing the message format and colorization
#
#####################################################################
proc log_trace {message} {
	global testsuite_log_level LOG_LEVEL_TRACE

	if {$testsuite_log_level >= $LOG_LEVEL_TRACE} {
		_log_format "trace" "$message"
	}
}

################################################################
#
# NAME
#	in_fed - checks whether this cluster is in a federation
#
# SYNOPSIS
#	in_fed
#
# RETURN VALUE
#	Returns 1 if this cluster is in a federation, 0 otherwise
#
################################################################
proc in_fed {} {
	global scontrol

	spawn $scontrol show fed
	expect {
		-re "Federation" {
			return 1
		}
		timeout {
			log_error "scontrol not responding"
		}
		eof {
			wait
		}
	}

	return 0
}

################################################################
#
# NAME
#	check_job_state - checks if the state of a job is the expected one
#
# SYNOPSIS
#	check_job_state job state ?het_job?
#
# DESCRIPTION
#	Checks if the state of a job is the expected one.
#
# ARGUMENTS
#	job
#		Job ID to check
#	state
#		Desired state of the job to match
#	het_job
#		If set, checks state of each component job if the
#		job is a hetjob.
#
# RETURN VALUE
#	1 if job was on the desired state, or the number of job components
#	on that state if it's a hetjob and het_job option enabled. 0 otherwise.
#
# ENVIRONMENT
#	Also sets exit_code to 1 if the job (or one of its components) is
#	not in the desired state, or if a called command fails.
#
################################################################
proc check_job_state { job state {het_job 0}} {
	global scontrol exit_code

	set jid_list ""
	if { $het_job } {
		set jid_list [get_het_job_ids $job 1]
	}
	if { $jid_list == "" } {
		# non-het job
		set jid_list $job
	}

	# Number of jobs/components found in the desired state
	set matched 0
	foreach jid $jid_list {
		set state_match 0
		spawn $scontrol show job $jid
		expect {
			-re "JobState=($state)" {
				incr state_match
			}
			timeout {
				log_error "scontrol not responding"
				set exit_code 1
			}
			eof {
				wait
			}
		}
		if {$state_match != 1} {
			log_error "job $jid should be in $state state, but is not"
			set exit_code 1
			# Stop at the first mismatch and report failure
			return 0
		}
		incr matched
	}

	# The documented result used to be missing: nothing was returned
	return $matched
}

################################################################
#
# NAME
#	get_gres_count - returns a dict of nodes and GRES counts
#
# SYNOPSIS
#	get_gres_count gres_name ?node_list?
#
# DESCRIPTION
#	Returns a dict of node names and the count of a specified
#	GRES aggregating all its types on each node.
#
# RETURN VALUE
#	If the node_list is not specified, this function will return
#	a dict with the GRES count for all the nodes of the default
#	partition.
#	If specified, a dict only with the nodes of the node_list.
#
################################################################
proc get_gres_count { gres_name {node_list ""} } {
	global gres_regex

	set nodes_dict [get_nodes $node_list]
	set gres_dict [dict create]

	dict for {node_name node_dict} $nodes_dict {
		set gres_count 0
		if {[dict exists $node_dict "Gres"]} {
			set gres_expression [dict get $node_dict "Gres"]
			foreach gres [split $gres_expression ","] {
				if {[regexp $gres_regex $gres {} name type count] == 1} {
					# Without a count capture, the value captured
					# as the type is used as the count
					if {$count eq ""} {
						set count $type
					}
					if {$name eq $gres_name} {
						incr gres_count $count
					}
				}
			}
		}
		dict set gres_dict $node_name $gres_count
	}

	return $gres_dict
}

################################################################
#
# NAME
#	get_highest_gres_count - returns highest number of GRES per node on node_count nodes
#
# SYNOPSIS
#	get_highest_gres_count node_count gres_name
#
# DESCRIPTION
#	For a given number of nodes, returns the highest GRES count per
#	node available on at least that number of nodes.
#
# EXAMPLE
#	For example: node1 has 1 GPU, node2 has 2 GPUs and node3 has 3 GPUs
#	[get_highest_gres_count 1 "gpu"] returns 3 (i.e. 1 node 3 GPUs)
#	[get_highest_gres_count 2 "gpu"] returns 2 (i.e. 2 nodes have at least 2 GPUs each)
#	[get_highest_gres_count 3 "gpu"] returns 1 (i.e. 3 nodes have at least 1 GPU each)
#
################################################################
proc get_highest_gres_count { node_count gres_name } {
	set available_nodes [available_nodes_hostnames [default_partition]]
	set gres_dict [get_gres_count $gres_name $available_nodes]
	set gres_count [list]

	dict for {node gres} $gres_dict {
		lappend gres_count $gres
	}

	# The node_count-th highest per-node count is the answer
	set count [lindex [lsort -decreasing -integer $gres_count] [expr {$node_count - 1}]]

	return $count
}

################################################################
#
# NAME
#	_set_gpu_socket_inx - adds a socket index to the gpu_sock_list if not already on it
#
# SYNOPSIS
#	_set_gpu_socket_inx sock_inx
#
# DESCRIPTION
#	Add a socket index to the array gpu_sock_list if not already
#	on the list. A sock_inx of -1 empties the list instead.
#	Subroutine used by get_gpu_socket_count
#
################################################################
proc _set_gpu_socket_inx { sock_inx } {
	global gpu_sock_list

	if {$sock_inx == -1} {
		# Reset to an empty list. This works even if the list was never
		# created, and clears it whatever its length.
		set gpu_sock_list [list]
		return
	}

	if {![info exists gpu_sock_list]} {
		set gpu_sock_list [list]
	}

	foreach known_inx $gpu_sock_list {
		if {$known_inx == $sock_inx} {
			return
		}
	}
	lappend gpu_sock_list $sock_inx
}

################################################################
# Subroutine used by get_gpu_socket_count
# Add a socket index to the array gpu_sock_list if not already
# on the list.
################################################################
proc _set_gpu_socket_range { sock_first_inx sock_last_inx } {
	global gpu_sock_list

	# Adds every socket index of the inclusive range
	# sock_first_inx..sock_last_inx to gpu_sock_list, skipping the
	# ones already recorded. A first index of -1 empties the list,
	# mirroring the reset convention of _set_gpu_socket_inx.
	if {$sock_first_inx == -1} {
		set gpu_sock_list [list]
		return
	}

	if {![info exists gpu_sock_list]} {
		set gpu_sock_list [list]
	}

	for {set s $sock_first_inx} {$s <= $sock_last_inx} {incr s} {
		set found 0
		foreach known_inx $gpu_sock_list {
			if {$known_inx == $s} {
				set found 1
				break
			}
		}
		if {!$found} {
			lappend gpu_sock_list $s
		}
	}
}

################################################################
#
# NAME
#	get_gpu_socket_count - returns the number of sockets with GPUS on a node with the given per-node GPU count
#
# SYNOPSIS
#	get_gpu_socket_count gpu_cnt sockets_per_node
#
# DESCRIPTION
#	Given a per-node GPU count, return the number of sockets with
#	GPUs on a node with the given per-node GPU count.
#	If the sockets_per_node has a value of 1 then just return 1
#	rather than determine the count (for performance reasons).
#
# RETURN VALUE
#	The number of distinct socket indexes reported for the GPUs of the
#	allocated node, or 1 if fewer than two were found.
#
################################################################
proc get_gpu_socket_count { gpu_cnt sockets_per_node } {
	global re_word_str bin_rm number scontrol srun
	global gpu_sock_list

	set sockets_with_gpus 1
	set file_in "test_get_gpu_socket_count.input"

	if {$sockets_per_node == 1} {
		return 1
	}

	log_user 0
	# Start from an empty socket list
	_set_gpu_socket_inx -1
	make_bash_script $file_in "$scontrol show node \$SLURMD_NODENAME"
	spawn $srun -N1 --gres=gpu:$gpu_cnt $file_in
	expect {
		-re "gpu:${number}.S:($number)-($number)" {
			_set_gpu_socket_range $expect_out(1,string) $expect_out(2,string)
			exp_continue
		}
		-re "gpu:${re_word_str}:${number}.S:($number),($number),($number),($number)" {
			foreach inx {1 2 3 4} {
				_set_gpu_socket_inx $expect_out($inx,string)
			}
			exp_continue
		}
		-re "gpu:${re_word_str}:${number}.S:($number),($number),($number)" {
			foreach inx {1 2 3} {
				_set_gpu_socket_inx $expect_out($inx,string)
			}
			exp_continue
		}
		-re "gpu:${re_word_str}:${number}.S:($number),($number)" {
			foreach inx {1 2} {
				_set_gpu_socket_inx $expect_out($inx,string)
			}
			exp_continue
		}
		-re "gpu:${re_word_str}:${number}.S:($number)" {
			_set_gpu_socket_inx $expect_out(1,string)
			exp_continue
		}
		-re "gpu:${number}.S:($number),($number),($number),($number)" {
			foreach inx {1 2 3 4} {
				_set_gpu_socket_inx $expect_out($inx,string)
			}
			exp_continue
		}
		-re "gpu:${number}.S:($number),($number),($number)" {
			foreach inx {1 2 3} {
				_set_gpu_socket_inx $expect_out($inx,string)
			}
			exp_continue
		}
		-re "gpu:${number}.S:($number),($number)" {
			foreach inx {1 2} {
				_set_gpu_socket_inx $expect_out($inx,string)
			}
			exp_continue
		}
		-re "gpu:${number}.S:($number)" {
			_set_gpu_socket_inx $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1
	exec $bin_rm -f $file_in

	set sock_cnt [llength $gpu_sock_list]
	if {$sock_cnt > 1} {
		set sockets_with_gpus $sock_cnt
	}

	return $sockets_with_gpus
}

################################################################
#
# NAME
#	get_highest_mps_count - get_highest_gres_count nodes mps, but for "mps per GPU"
#
# SYNOPSIS
#	get_highest_mps_count node_count
#
# DESCRIPTION
#	For a given number of nodes, returns the highest number of MPS per GPU
#	available at least on those number of nodes.
#
################################################################
proc get_highest_mps_count { node_count } {
	# We cannot use get_highest_gres_count because we need "per gpu",
	# so we get all the mps per node and all gpus per node, to create
	# a mps_per_gpu list to sort and get the count.
	set available_nodes [available_nodes_hostnames [default_partition]]
	set mps_dict [get_gres_count "mps" $available_nodes]
	set gpu_dict [get_gres_count "gpu" $available_nodes]
	set mps_per_gpu [list]
	dict for {node mps} $mps_dict {
		if {$mps > 0} {
			if {[dict exists $gpu_dict $node]} {
				set gpu [dict get $gpu_dict $node]
				if {$gpu > 0} {
					lappend mps_per_gpu [expr {$mps / $gpu}]
				} else {
					fail "All nodes with MPS should have a GPU"
				}
			} else {
				fail "All nodes with MPS should have a GPU"
			}
		}
	}

	# Entry node_count - 1 of the descending list is the highest value
	# that node_count nodes can all provide
	set sorted_counts [lsort -decreasing -integer $mps_per_gpu]

	return [lindex $sorted_counts [expr {$node_count - 1}]]
}

################################################################
#
# NAME
#	get_mps_node_count - gets the number of nodes with a positive number of GRES MPS
#
# SYNOPSIS
#	get_mps_node_count
#
# RETURN VALUE
#	Return the count of nodes with a non-zero count of GRES MPS
#
################################################################
proc get_mps_node_count { } {
	global number sinfo re_word_str

	set node_inx 0
	set def_part [default_partition]

	log_user 0
	spawn $sinfo -N -p$def_part -oGRES=%G -h
	expect {
		-re "GRES=($re_word_str)" {
			set mps_count 0
			set parts [split $expect_out(1,string) ",/"]
			# Consume the mps entries of this node one at a time
			while 1 {
				set mps_found [lsearch $parts "mps*"]
				if {$mps_found == -1} break
				set parts2 [split [lindex $parts $mps_found] ":(/"]
				set col [lsearch -regexp $parts2 ^$number$]
				if {$col == -1} {
					# An entry without an explicit count counts as one
					incr mps_count
				} else {
					set mps_count [expr {$mps_count + [lindex $parts2 $col]}]
				}
				set parts [lreplace $parts $mps_found $mps_found]
			}
			if {$mps_count > 0} {
				incr node_inx
			}
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1
	return $node_inx
}

################################################################
#
# NAME
#	get_over_time_limit - returns the value of OverTimeLimit
#
# SYNOPSIS
#	get_over_time_limit
#
# RETURN VALUE
#	Returns the value of OverTimeLimit in slurm.conf
#	(0 if it could not be read)
#
################################################################
proc get_over_time_limit {} {
	global number scontrol exit_code

	set ret 0

	log_user 0
	spawn $scontrol show config
	expect {
		-re "OverTimeLimit *= *($number) min" {
			set ret $expect_out(1,string)
			# Keep reading so that eof is reached and the spawned
			# process is reaped
			exp_continue
		}
		timeout {
			log_error "scontrol not responding"
			incr exit_code
		}
		eof {
			wait
		}
	}
	log_user 1

	return $ret
}

################################################################
#
# NAME
#	cuda_count - determines the count of allocated GPUs
#
# SYNOPSIS
#	cuda_count cuda_string
#
# ARGUMENTS
#	cuda_string
#		Contents of a CUDA_VISIBLE_DEVICES environment variable
#
# RETURN VALUE
#	Return the number of GPUs or -1 on error
#
################################################################
proc cuda_count { cuda_string } {
	set cuda_count 0
	set has_number 0

	# Count the comma separated fields. Every field must contain at
	# least one digit; characters other than digits and commas are
	# ignored.
	foreach cuda_char [split $cuda_string ""] {
		if {$cuda_char eq ","} {
			if {!$has_number} {
				log_error "cuda_count: Invalid input ($cuda_string)"
				return -1
			}
			incr cuda_count
			set has_number 0
		} elseif {[string is digit $cuda_char]} {
			set has_number 1
		}
	}

	# The last field has no trailing comma
	if {!$has_number} {
		log_error "cuda_count: Invalid input ($cuda_string)"
		return -1
	}
	incr cuda_count

	return $cuda_count
}

################################################################
#
# NAME
#	get_acct_store_tres - gets the configured value of AccountingStorageTRES
#
# SYNOPSIS
#	get_acct_store_tres
#
# DESCRIPTION
#	Get the configured value of AccountingStorageTRES
#
# RETURN VALUE
#	Returns the configured value of AccountingStorageTRES
#	(an empty string if it was not reported)
#
################################################################
proc get_acct_store_tres { } {
	global scontrol re_word_str

	set store_tres ""

	log_user 0
	spawn $scontrol show config
	expect {
		-re "AccountingStorageTRES *= ($re_word_str)" {
			set store_tres $expect_out(1,string)
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	return $store_tres
}

################################################################
#
# NAME
#	get_conf_path - gets the path to the slurm.conf file
#
# SYNOPSIS
#
#	get_conf_path
#
# RETURN VALUE
#	Returns the path to the slurm.conf file
#
################################################################
proc get_conf_path { } {
	global scontrol re_word_str exit_code

	set conf_dir ""
	set path_found 0

	log_user 0
	spawn $scontrol show config
	expect {
		-re "SLURM_CONF.*= (/.*)/($re_word_str).*SLURM_VERSION" {
			# Only the directory part of the reported path is kept
			set conf_dir $expect_out(1,string)
			set path_found 1
			exp_continue
		}
		eof {
			wait
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
	}
	log_user 1

	if {!$path_found} {
		log_error "Did not get slurm.conf path"
		set exit_code 1
	}

	return $conf_dir
}

################################################################
#
# NAME
#	copy_conf - backs up the slurm.conf file
#
# SYNOPSIS
#	copy_conf config_path cwd
#
# DESCRIPTION
#	Copy the slurm.conf file to a new file called
#	slurm.conf.orig in the current working directory.
#	Any previous slurm.conf.orig found there is removed first.
#
# ARGUMENTS
#	config_path
#		The path to slurm.conf
#	cwd
#		The full path of the current working directory
#
################################################################
proc copy_conf { config_path cwd } {
	global bin_cp bin_rm exit_code

	set backup_copy $cwd/slurm.conf.orig

	exec $bin_rm -fr $backup_copy
	spawn $bin_cp -v $config_path/slurm.conf $backup_copy
	expect {
		eof {
			wait
		}
		timeout {
			log_error "slurm.conf was not copied"
			set exit_code 1
		}
	}
}

################################################################
#
# NAME
#	save_conf - saves a copy of the specified configuration file
#
# SYNOPSIS
#	save_conf file_name
#
# DESCRIPTION
#	Rename a file as <file_name>.<test_id> and make a copy of it
#	to its original name to be able to modify it.
#	It registers the saved files to allow to restore them properly
#	with restore_conf.
#	It will log_warn if the file doesn't exist, but will register it
#	to allow restore_conf to "restore" the original state: no file.
#
#	It throws a TCL error (FATAL: <message>) if unsafe scenarios are detected,
#	like a backup file already existing, or any error with mv or cp.
#	It does this to encourage the user executing the test to manually
#	check the files to avoid automatically removing any original file.
#
# SEE ALSO
#	restore_conf
#
################################################################
proc save_conf { file_name } {
	global test_id
	global bin_mv bin_cp bin_rm
	global _global_bkp_register _global_bkp_noexist

	set saved_log_user [log_user -info]

	log_info "Saving backup of $file_name"

	#
	# A backup left behind by an earlier run must never be overwritten:
	# raise a TCL error so that somebody inspects the files by hand
	# before an original configuration file is lost.
	#
	set stale_backups [glob -nocomplain $file_name*\[0-9\].\[0-9\]*]
	if {[llength $stale_backups] > 0} {
		error [format "\nFATAL: Backup file already exists: %s" \
			$stale_backups]
	}

	#
	# Nothing to copy when the file is absent. Warn, remember that it
	# did not exist (restore_conf will then remove whatever the test
	# created) and let the test continue.
	#
	if {![file exists $file_name]} {
		log_warn [format "Backup of a nonexistent file registered: %s" \
			$file_name]
		lappend _global_bkp_register $file_name
		lappend _global_bkp_noexist $file_name
		return
	}

	set backup_name $file_name.$test_id

	log_user 0

	# Step 1: move the original aside
	spawn $bin_mv $file_name $backup_name
	expect {
		eof {
			# The fourth element of the wait result is the exit status
			set rc [lindex [wait] 3]
			if {$rc != 0} {
				error [format "\nFATAL: Error moving original %s" \
					$file_name]
			}
		}
		timeout {
			error [format "\nFATAL: Timeout moving original %s" \
				$file_name]
		}
	}

	# Step 2: recreate a modifiable copy under the original name
	spawn $bin_cp $backup_name $file_name
	expect {
		eof {
			set rc [lindex [wait] 3]
			if {$rc != 0} {
				error [format "\nFATAL: Error copying original %s" \
					$file_name]
			}
		}
		timeout {
			error [format "\nFATAL: Timeout copying original %s" \
				$file_name]
		}
	}

	# Both steps succeeded: the file can now be restored by restore_conf
	lappend _global_bkp_register $file_name
	log_user $saved_log_user
}

################################################################
#
# NAME
#	restore_conf - restores the original configuration file
#
# SYNOPSIS
#	restore_conf
#	file_name
#
# DESCRIPTION
#	Restore the original file saved previously with save_conf.
#
#	It throws a TCL error (FATAL: <message>) if unsafe scenarios are detected,
#	like a file_name not previously saved, or any error with the
#	mv command.
#	It does this to encourage the user executing the test to manually
#	check the files to avoid automatically removing any original file.
#
# SEE ALSO
#	save_conf
#
################################################################
proc restore_conf { file_name } {
	global test_id
	global bin_mv bin_rm
	global _global_bkp_register _global_bkp_noexist

	set log_prev [log_user -info]
	set rc 0

	#
	# We don't want to restore any file not previously saved.
	# This avoids possible overwrites or removals.
	# We want to throw an error because this may indicate data corruption.
	# (-exact: file names must not be interpreted as glob patterns)
	#
	if {[lsearch -exact $_global_bkp_register $file_name] == -1} {
		error [format "\nFATAL: Cannot restore a not previously saved file:\n%s\nSaved files are: %s\n" \
			$file_name [join $_global_bkp_register "\n"]]
	}

	log_user 0
	if {[file exists $file_name.$test_id]} {
		#
		# If backup file exist but it was nonexistent when saved it
		# means that it has been created in the middle of the test.
		# This shouldn't happen and manual intervention is preferred.
		#
		if {[lsearch -exact $_global_bkp_noexist $file_name] != -1} {
			error [format "\nFATAL: Backup file created by someone, not by this test: %s" \
				$file_name]
		}

		log_info "Restoring backup of $file_name"
		spawn $bin_mv $file_name.$test_id $file_name
		expect {
			timeout {
				error [format "\nFATAL: Timeout restoring original: %s" \
					$file_name.$test_id]
			}
			eof {
				lassign [wait] pid spawnid os_error_flag rc
				if {$rc != 0} {
					error [format "\nFATAL: Error restoring original %s" \
						$file_name]
				}
			}
		}
	} else {
		#
		# If backup file doesn't exist but it did when we saved
		# it means that it has been removed while the test was running.
		# This should never happen, manual intervention is preferred.
		#
		if {[lsearch -exact $_global_bkp_noexist $file_name] == -1} {
			error [format "\nFATAL: Backup file was removed since saved: %s" \
				$file_name]
		}

		log_info "Removing file used for the test: $file_name"
		spawn $bin_rm -f $file_name
		expect {
			timeout {
				error [format "\nFATAL: Timeout removing vestigial file: %s" \
					$file_name]
			}
			eof {
				lassign [wait] pid spawnid os_error_flag rc
				if {$rc != 0} {
					error [format "\nFATAL: Error removing vestigial file: %s" \
						$file_name]
				}
			}
		}
	}
	log_user $log_prev
}

################################################################
#
# NAME
#	have_nvml - checks if HAVE_NVML is set in config.h
#
# SYNOPSIS
#	have_nvml
#
# RETURN VALUE
#	Returns 1 if HAVE_NVML is set in config.h. Else, returns 0.
#
################################################################
proc have_nvml { } {
	global bin_grep number exit_code config_h

	set grep_fail 0
	set have_nvml 0

	log_user 0
	spawn $bin_grep "HAVE_NVML" $config_h
	expect {
		-re "HAVE_NVML ($number)" {
			set have_nvml $expect_out(1,string)
			exp_continue
		}
		timeout {
			set grep_fail 1
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	# A value read before a timeout is not trusted
	if {$grep_fail == 1} {
		log_warn "Could not grep $config_h for HAVE_NVML"
		set have_nvml 0
	}

	return $have_nvml
}

################################################################
#
# NAME
#	delete_part - deletes partition on system
#
# SYNOPSIS
#	delete_part partition
#
# DESCRIPTION
#	Runs `scontrol delete partition=<partition>`. If scontrol does not
#	answer in time an error is logged and the global exit_code is set
#	to 1.
#
################################################################
proc delete_part { part_name } {
	# exit_code has to be declared global, otherwise the timeout
	# branch would only set a local variable
	global scontrol exit_code

	# Remove part
	spawn $scontrol delete partition=$part_name
	expect {
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
}

################################################################
#
# NAME
#	have_lua - checks if HAVE_LUA is set in config.h
#
# SYNOPSIS
#	have_lua
#
# RETURN VALUE
#	Returns 1 if HAVE_LUA is set in config.h. Else, returns 0.
#
################################################################
proc have_lua { } {
	global bin_grep number exit_code config_h

	set grep_fail 0
	set have_lua 0

	log_user 0
	spawn $bin_grep "HAVE_LUA" $config_h
	expect {
		-re "HAVE_LUA ($number)" {
			set have_lua $expect_out(1,string)
			exp_continue
		}
		timeout {
			set grep_fail 1
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	# A value read before a timeout is not trusted
	if {$grep_fail == 1} {
		log_error "Could not grep $config_h for HAVE_LUA"
		set have_lua 0
	}

	return $have_lua
}

################################################################
#
# NAME
#	get_reservations - returns a dictionary of dictionaries of reservation parameters
#
# SYNOPSIS
#	get_reservations ?resv_name?
#
# RETURN VALUE
#	Uses `scontrol show reservation` to return a dictionary of dictionaries
#	of reservation parameters, keyed by reservation name. The dictionary
#	is empty when the output describes no reservation.
#	Specifying an invalid resv_name result in a failure.
#
################################################################
proc get_reservations { {resv_name ""} } {
	global scontrol

	set command "$scontrol show reservation -o"
	if {$resv_name ne ""} {
		append command " $resv_name"
	}

	set output [run_command_output -fail "$command"]

	# Start from an empty result so that output without any
	# reservation record yields an empty dictionary instead of a
	# "no such variable" error
	set resvs_dict [dict create]

	# Iterate over each reservation's parameter list
	foreach line [split $output "\n"] {
		if {$line eq ""} {
			continue
		}

		set resv_dict [dict create]

		# Peel off the resv parameters one at a time
		# The first quantifier sets the greediness for the whole RE
		while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
			# Remove the consumed parameter from the line
			set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]

			# Add it to the temporary resv dictionary
			dict set resv_dict $param_name $param_value
		}

		# Lines that do not describe a reservation (for instance a
		# purely informational message) are ignored
		if {![dict exists $resv_dict "ReservationName"]} {
			continue
		}

		# Add the resv dictionary to resvs dictionary
		dict set resvs_dict [dict get $resv_dict "ReservationName"] $resv_dict
	}

	return $resvs_dict
}

################################################################
#
# NAME
#	get_resvation_param - returns a specific parameter value for a
#	specific reservation
#
# SYNOPSIS
#	get_resvation_param resv_name parameter_name
#
# DESCRIPTION
#	Looks up one parameter of one reservation. The value is returned
#	when the reservation has that parameter, the string MISSING when
#	it does not. Specifying an invalid reservation name will result
#	in a failure.
#
################################################################
proc get_resvation_param { resv_name parameter_name } {
	set all_resvs [get_reservations $resv_name]

	if {![dict exists $all_resvs $resv_name $parameter_name]} {
		return "MISSING"
	}

	return [dict get $all_resvs $resv_name $parameter_name]
}

################################################################
#
# NAME
#	create_res - create new reservation in system
#
# SYNOPSIS
#	create_res res_name res_params
#
# DESCRIPTION
#	Runs `scontrol create res` for the named reservation with the
#	given parameter string and logs the outcome (warning on error,
#	debug message on success).
#
# RETURN VALUE
#	the exit code of the scontrol command run
#
################################################################
proc create_res { res_name res_params } {
	global scontrol

	# Name of this procedure, used as prefix of the log messages
	set self [lindex [info level 0] 0]

	set result [run_command "$scontrol create res ReservationName=$res_name $res_params"]
	set scontrol_out [dict get $result output]
	set status [dict get $result exit_code]

	if {$status} {
		log_warn "$self: error from scontrol: $scontrol_out"
	} else {
		log_debug "$self: success from scontrol: $scontrol_out"
	}

	return $status
}

################################################################
#
# NAME
#	update_res - update existing reservation in system
#
# SYNOPSIS
#	update_res ?res_name? ?res_params?
#
# RETURN VALUE
#	the exit code of the scontrol command run
#
################################################################
proc update_res { res_name res_params } {
	global scontrol

	set result [run_command "$scontrol update ReservationName=$res_name $res_params"]
	set output [dict get $result output]
	set ret_code [dict get $result exit_code]

	# [lindex [info level 0] 0] is the name of this procedure
	if {$ret_code} {
		log_warn "[lindex [info level 0] 0]: error from scontrol: $output"
	} else {
		log_debug "[lindex [info level 0] 0]: success from scontrol: $output"
	}

	return $ret_code
}

################################################################
#
# NAME
#	delete_res - delete reservation from system
#
# SYNOPSIS
#	delete_res res_name
#
# RETURN VALUE
#	the exit code of the scontrol command run
#
################################################################
proc delete_res { res_name } {
	global scontrol

	set result [run_command "$scontrol delete ReservationName=$res_name"]
	set output [dict get $result output]
	set ret_code [dict get $result exit_code]

	# [lindex [info level 0] 0] is the name of this procedure
	if {$ret_code} {
		log_warn "[lindex [info level 0] 0]: error from scontrol: $output"
	} else {
		log_debug "[lindex [info level 0] 0]: success from scontrol: $output"
	}

	return $ret_code
}

################################################################
#
# NAME
#	create_part - creates a partition
#
# SYNOPSIS
#	create_part partition num_nodes
#
# ARGUMENTS
#	partition
#		Name of partition to create
#	num_nodes
#		Number of nodes of partition to create. A value of 0
#		requests up to the number of currently idle nodes.
#
# RETURN VALUE
#	0 if the partition was created, 1 otherwise
#
################################################################
proc create_part { part_name num_nodes_in } {
	# exit_code has to be declared global, otherwise the timeout
	# branches would only set a local variable
	global scontrol srun bin_printenv number re_word_str exit_code

	set nodes ""
	set num_nodes_out 0

	# Refuse to touch a partition that already exists
	set found 0
	spawn $scontrol show partitionname=$part_name
	expect {
		-re "PartitionName=$part_name" {
			set found 1
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$found == 1} {
		log_error "There is already a partition $part_name"
		return 1
	}

	if {[string length [default_partition]] == 0} {
		log_warn "create_part does not work without a default partition"
		return 1
	}

	if {$num_nodes_in} {
		set num_nodes $num_nodes_in
	} else {
		set num_nodes [available_nodes idle]
	}

	log_user 0
	# Get a list of nodes
	spawn $srun -t1 -N1-$num_nodes $bin_printenv
	expect {
		-re "SLURM_JOB_NUM_NODES=($number)" {
			set num_nodes_out $expect_out(1,string)
			exp_continue
		}
		-re "SLURM_NODELIST=($re_word_str)" {
			set nodes $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "srun is not responding getting number of nodes creating part"
			log_user 1
			return 1
		}
		eof {
			wait
		}
	}

	# Compare with the number of nodes actually requested (num_nodes),
	# which differs from num_nodes_in when the caller passed 0
	if {[string length $nodes] == 0} {
		log_error "Did not get a valid node list"
		log_user 1
		return 1
	} elseif {$num_nodes_out != $num_nodes} {
		log_error "Did not get enough nodes ($num_nodes_out != $num_nodes) to run test"
		log_user 1
		return 1
	}

	spawn $scontrol create partitionname=$part_name nodes=$nodes
	expect {
		timeout {
			log_error "scontrol is not responding creating partition"
			log_user 1
			return 1
		}
		eof {
			wait
		}
	}

	# Verify that the partition now exists
	set found 0
	spawn $scontrol show partitionname=$part_name
	expect {
		-re "PartitionName=$part_name" {
			set found 1
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	log_user 1

	if {$found == 0} {
		log_error "scontrol did not create partition $part_name"
		return 1
	}

	log_info "Created partition $part_name with $num_nodes_out nodes"
	return 0
}

################################################################
#
# NAME
#	get_nodes - returns a dictionary of dictionaries of node parameters
#
# SYNOPSIS
#	get_nodes ?hostlist_expression?
#
# DESCRIPTION
#	Uses `scontrol show node` to query node parameters, returning a
#	dictionary of dictionaries with the node names as keys of the first
#	level dictionary and with the parameters as keys of the second level
#	dictionary. Specifying an invalid node name will result in a failure.
#
# RETURN VALUE
#	If the optional node expression argument is specified, the result will
#	be constrained by the specified hostlist expression.
#	Otherwise, the
#	results for all nodes will be returned.
#	The dictionary is empty when the output describes no node.
#
################################################################
proc get_nodes { {hostlist_expression ""} } {
	global scontrol

	set command "$scontrol show node -o"
	if {$hostlist_expression ne ""} {
		append command " $hostlist_expression"
	}

	set output [run_command_output -fail -nolog "$command"]

	# Start from an empty result so that output without any node
	# record yields an empty dictionary instead of a "no such
	# variable" error
	set nodes_dict [dict create]

	# Iterate over each node parameter line
	foreach line [split $output "\n"] {
		if {$line eq ""} {
			continue
		}

		set node_dict [dict create]

		# Peel off the node parameters one at a time
		# The first quantifier sets the greediness for the whole RE
		while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
			# Remove the consumed parameter from the line
			set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]

			# Add it to the temporary node dictionary
			dict set node_dict $param_name $param_value
		}

		# Lines that do not describe a node are ignored
		if {![dict exists $node_dict "NodeName"]} {
			continue
		}

		# Add the node dictionary to nodes dictionary
		dict set nodes_dict [dict get $node_dict "NodeName"] $node_dict
	}

	return $nodes_dict
}

################################################################
#
# NAME
#	get_node_param - returns a specific parameter value for a specific node
#
# SYNOPSIS
#	get_node_param node_name parameter_name
#
# DESCRIPTION
#	Returns a specific parameter value for a specified node if the
#	parameter exists for the node, or MISSING if it does not exist.
#	Specifying an invalid node name will result in a failure.
#
################################################################
proc get_node_param { node_name parameter_name } {
	set all_nodes [get_nodes $node_name]

	# Guard clause: warn and report MISSING when the node does not
	# have the requested parameter
	if {![dict exists $all_nodes $node_name $parameter_name]} {
		log_warn "Parameter $parameter_name not found on node $node_name"
		return "MISSING"
	}

	return [dict get $all_nodes $node_name $parameter_name]
}

################################################################
#
# NAME
#	get_nodes_by_request - get a list of nodes satisfying requested resources
#
# SYNOPSIS
#	get_nodes_by_request ?options? ?request_args?
#
# DESCRIPTION
#	Using srun (optionally with the specified arguments), returns a list
#	of nodes having the requested resources.
#	If an error occurs, the invoking test will fail.
#
# OPTIONS
#	-fail
#		fail the test if the execution of srun results in an error or timeout
#
# ARGUMENTS
#	request_args
#		Desired resources of a node in form of srun arguments,
#		e.g. "--gres=gpu:1 -n1 -t1"
#
# RETURN VALUE
#	A list of nodes with at least the requested resources, or an empty
#	list otherwise.
#
################################################################
proc get_nodes_by_request args {
	global srun

	# Leading option words are collected and handed to run_command
	set options [list]
	while {[llength $args]} {
		switch -glob -- [lindex $args 0] {
			-fail {
				lappend options [lindex $args 0]
				set args [lrange $args 1 end]
			}
			default break
		}
	}

	if {[llength $args] == 1} {
		lassign $args request_args
	} elseif {[llength $args] == 0} {
		set request_args "-n1 -t1"
	} else {
		fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args"
	}

	set command "$srun $request_args printenv SLURMD_NODENAME"
	set result [run_command {*}$options $command]
	if {[dict get $result exit_code]} {
		return {}
	}

	# One line is printed per task. Use the node names as dictionary
	# keys to keep each node once. The dictionary starts empty so that
	# a run without any output returns an empty list.
	set allocated_nodes [dict create]
	foreach line [split [dict get $result output] "\n"] {
		if {$line eq ""} {
			continue
		}
		dict incr allocated_nodes $line
	}

	return [lsort [dict keys $allocated_nodes]]
}

################################################################
#
# NAME
#	get_jobs - returns a dictionary of dictionaries of job parameters
#
# SYNOPSIS
#	get_jobs ?job_id?
#
# DESCRIPTION
#	Uses `scontrol show job` to return a dictionary of dictionaries of job
#	parameters. Specifying an invalid job id will result in a failure.
#	The dictionary is keyed by job id and is empty when the output
#	describes no job.
#
################################################################
proc get_jobs { {job_id_in ""} } {
	global scontrol

	set command "$scontrol show job -o"
	if {$job_id_in ne ""} {
		append command " $job_id_in"
	}

	set output [run_command_output -fail "$command"]

	# Start from an empty result so that output without any job record
	# yields an empty dictionary instead of a "no such variable" error
	set jobs_dict [dict create]

	# Iterate over each job's parameter list
	foreach line [split $output "\n"] {
		if {$line eq ""} {
			continue
		}

		set job_dict [dict create]

		# Peel off the job parameters one at a time
		# The first quantifier sets the greediness for the whole RE
		while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} {
			# Remove the consumed parameter from the line
			set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}]

			# Add it to the temporary job dictionary
			dict set job_dict $param_name $param_value
		}

		# Lines that do not describe a job (for instance a purely
		# informational message) are ignored
		if {![dict exists $job_dict "JobId"]} {
			continue
		}

		# Add the job dictionary to jobs dictionary
		dict set jobs_dict [dict get $job_dict "JobId"] $job_dict
	}

	return $jobs_dict
}

################################################################
#
# NAME
#	get_job_param - returns a specific parameter value for a specific job
#
# SYNOPSIS
#	get_job_param job_id parameter_name
#
# DESCRIPTION
#	Returns a specific parameter value for a specified job if the
#	parameter exists for the job, or MISSING if it does not exist.
#	Specifying an invalid job id will result in a failure.
#
################################################################
proc get_job_param { job_id parameter_name } {
	set jobs_dict [get_jobs $job_id]

	if {[dict exists $jobs_dict $job_id $parameter_name]} {
		return [dict get $jobs_dict $job_id $parameter_name]
	} else {
		return "MISSING"
	}
}

################################################################
#
# NAME
#	get_job_gpu_cnt - counts the GPUs listed in the GRES of a job
#
# SYNOPSIS
#	get_job_gpu_cnt job_id
#
# DESCRIPTION
#	Parses the output of `scontrol show job -d` and adds up the counts
#	of the GRES entries named "gpu". An entry without an explicit type
#	is accounted under the pseudo type "notype". Each type is counted
#	only once, even if it shows up in several GRES lines.
#	The entries are recognised with the global gres_regex.
#	NOTE(review): gres_regex is defined elsewhere; this code relies on
#	it providing the name, type and count as capture groups - confirm.
#
# RETURN VALUE
#	The number of GPUs found, 0 if there are none
#
################################################################
proc get_job_gpu_cnt { job_id } {
	# exit_code has to be declared global, otherwise the timeout
	# branch would only set a local variable
	global scontrol gres_regex exit_code

	set job_gpu_cnt 0
	array set gpu_type_found {}

	spawn $scontrol show job -d $job_id
	expect {
		-re "(JOB_GRES=|,)$gres_regex" {
			# Group 1 is the prefix above, groups 2 to 4 come
			# from gres_regex
			set name $expect_out(2,string)
			# Assume typed GRES of format gpu:1080:5 to start out
			set type $expect_out(3,string)
			set count $expect_out(4,string)

			# Skip all GRES of different names
			if {$name != "gpu"} {
				exp_continue
			}

			if {$count == ""} {
				# Now assume GRES format gpu:5
				set count $type
				set type "notype"
			}

			# Skip if type already found (in another GRES line)
			if {[info exists gpu_type_found($type)]} {
				exp_continue
			}

			if {$count} {
				incr job_gpu_cnt $count
				set gpu_type_found($type) true
			}
			exp_continue
		}
		timeout {
			log_error "scontrol not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	return $job_gpu_cnt
}

################################################################
#
# NAME
#	check_reason - checks the wait reason of a job
#
# SYNOPSIS
#	check_reason job_id reason
#
# DESCRIPTION
#	Queries the reason of the job with `squeue -o "%r"` and matches
#	the output against reason, which is used as a regular expression.
#	An error is logged if the reason is not found.
#
# RETURN VALUE
#	1 if the reason was found, 0 otherwise
#
################################################################
proc check_reason { job_id reason } {
	global squeue

	set found 0

	spawn $squeue -j $job_id --noheader -o "%r"
	expect {
		-re "$reason" {
			set found 1
			exp_continue
		}
		timeout {
			log_error "squeue not responding"
		}
		eof {
			wait
		}
	}

	if {$found == 0} {
		log_error "Job $job_id should have a wait reason of $reason"
	}

	return $found
}

################################################################################
#
# NAME
#	compile_against_libslurm - compiles a test program against either libslurm.so or libslurmfull.so
#
# SYNOPSIS
#	compile_against_libslurm test_prog ?use_full? ?build_args?
#
# DESCRIPTION
#	Compile a test program against either libslurm.so or libslurmfull.so.
#
# ARGUMENTS
#	test_prog
#		The name of the test program (and .c file)
#	use_full
#		If 1, use libslurmfull.so instead of libslurm.so. Defaults to 0.
#	build_args
#		Additional string to be appended to the build command.
#		E.g.
#		"-DUSING_VALGRIND -lm ${build_dir}/src/slurmctld/locks.o"
#		(initial space will be added automatically).
#
################################################################################
proc compile_against_libslurm { test_prog {use_full 0} {build_args ""} } {
	global slurm_dir bin_cc src_dir build_dir exit_code bin_chmod

	# Shared object whose presence decides between lib64 and lib
	set so_name [expr {$use_full ? "libslurmfull.so" : "libslurm.so"}]
	set libdir [expr {[file exists $slurm_dir/lib64/slurm/$so_name] ? "lib64" : "lib"}]

	# Library search directory and the name handed to the -l switch
	if {$use_full} {
		set link_dir "$slurm_dir/$libdir/slurm"
		set link_name "slurmfull"
	} else {
		set link_dir "$slurm_dir/$libdir"
		set link_name "slurm"
	}

	set build_cmd "$bin_cc ${test_prog}.c -g -pthread -o $test_prog -I$src_dir -I$build_dir -I$slurm_dir/include -Wl,-rpath=$link_dir -L$link_dir -l$link_name -lresolv"

	# Caller supplied extras go at the end of the command line
	if {$build_args ne ""} {
		append build_cmd " $build_args"
	}

	log_info "Build command: $build_cmd"

	# A non-zero catch code means that the build command failed
	if {[catch {exec {*}$build_cmd} build_out]} {
		log_error $build_out
		set exit_code 1
		return
	}

	exec $bin_chmod 700 $test_prog
}

################################################################
#
# NAME
#	assert - tests a boolean expression
#
# SYNOPSIS
#	assert condition message ?action?
#
# DESCRIPTION
#	Tests a boolean expression. If the condition is false, the
#	function will take the designated action (error, fail, skip,
#	or warn) using the specified message.
#
# ARGUMENTS
#	condition
#		The boolean expression to test
#	message
#		The message to print with the action
#	action
#		Specifies the action to take when false.
#		The default is "error"
#		The action will be one of the following:
#
#		error - Prints an error if the boolean_expression is false
#		fail  - Fails the test if the boolean_expression is false
#		skip  - Skips the test if the boolean_expression is false
#		warn  - Prints a warning if the boolean_expression is false
#
################################################################
proc assert { condition message { action "error" } } {
	# The condition is evaluated in the scope of the code under test:
	# one level up, or two when we were reached through one of the
	# assert_* helpers
	set caller_depth 1
	if {[info level] > 1 && [string match "assert_*" [lindex [info level -1] 0] ]} {
		incr caller_depth
	}

	# Nothing to do when the condition holds
	if {[uplevel $caller_depth expr [format "{%s}" $condition]]} {
		return
	}

	switch -- $action {
		fail {
			fail "$message"
		}
		error {
			log_error "$message"
		}
		warn {
			log_warn "$message"
		}
		skip {
			skip "$message"
		}
		default {
			fail "Invalid action ($action) with assert"
		}
	}
}

################################################################
#
# NAME
#	assert_or_fail - tests a boolean condition, failing if false
#
# SYNOPSIS
#	assert_or_fail condition message
#
# DESCRIPTION
#	Shorthand for assert with the "fail" action: when the condition
#	is false, fail is called with the specified message.
#
# ARGUMENTS
#	condition
#		The boolean expression to test
#	message
#		The message to print with the action
#
################################################################
proc assert_or_fail { condition message } {
	assert $condition $message fail
}

################################################################
#
# NAME
#	assert_or_error - tests a boolean expression, logging an error if false
#
# SYNOPSIS
#	assert_or_error condition message
#
# DESCRIPTION
#	Tests a boolean condition. If the condition is false, the
#	function will log_error with the specified message.
#
# ARGUMENTS
#	condition
#		The boolean expression to test
#	message
#		The message to print with the action
#
################################################################
proc assert_or_error { condition message } {
	assert $condition $message error
}

################################################################
#
# NAME
#	assert_or_warn - tests a boolean expression, logging a warning if false
#
# SYNOPSIS
#	assert_or_warn condition message
#
# DESCRIPTION
#	Shorthand for assert with the "warn" action: when the condition
#	is false, log_warn is called with the specified message.
#
# ARGUMENTS
#	condition
#		The boolean expression to test
#	message
#		The message to print with the action
#
################################################################
proc assert_or_warn { condition message } {
	assert $condition $message warn
}

################################################################
#
# NAME
#	assert_or_skip - tests a boolean expression, skipping the test if false
#
# SYNOPSIS
#	assert_or_skip condition message
#
# DESCRIPTION
#	Shorthand for assert with the "skip" action: when the condition
#	is false, skip is called with the specified message.
#
# ARGUMENTS
#	condition
#		The boolean expression to test
#	message
#		The message to print with the action
#
################################################################
proc assert_or_skip { condition message } {
	assert $condition $message skip
}

################################################################
#
# NAME
#	_log_format - prints a log message with colorization and formatting
#
# SYNOPSIS
#	_log_format log_level message
#
# DESCRIPTION
#	This procedure is called by the log_<level> procedures and
#	derives the relevant log level from the caller's procedure name.
#
# ARGUMENTS
#	log_level
#		The logging threshold that triggered the log statement
#	message
#		The message to print with colorization and formatting
#
# ENVIRONMENT
#	testsuite_log_format
#		Used as the template for the fields to be output.
#		Fields must be expressed in the form:
#		%{<field>}<conversion>
#		where <conversion> is the rest of a format conversion
#		specifier, for example %{message}s
#		Supported fields include the following:
#		message
#			The log message
#		filename
#			The file name where the log_* procedure
#			was called from
#		lineno
#			The line number where the log_* procedure
#			was called from
#		timestamp
#			The date and time when the log_* procedure
#			was called at
#		msecs (or milliseconds)
#			The milliseconds when the log_* procedure
#			was called at
#		loglevel (or levelname)
#			The log level that triggers the log_* procedure
#			to be called
#	testsuite_time_format
#		Used as a template for the timestamp. See the format groups
#		for the tcl clock format command.
#	testsuite_colorize
#		Boolean that turns colorization on or off
#	testsuite_color_<level>
#		Can be set to define the color used for each log level
#		(fatal, error, warn, info, debug, trace)
#
################################################################

proc _log_format { log_level message } {
	global testsuite_colorize testsuite_log_format testsuite_time_format
	global COLOR_NONE
	global testsuite_color_fatal testsuite_color_error testsuite_color_warn
	global testsuite_color_info testsuite_color_debug testsuite_color_trace

	set format_string $testsuite_log_format

	# Read the clock once so that the timestamp and the milliseconds
	# fields describe the same instant
	set milliseconds_since_epoch [clock milliseconds]
	set date_time [clock format [expr {$milliseconds_since_epoch / 1000}] -format $testsuite_time_format]
	set milliseconds [expr {$milliseconds_since_epoch % 1000}]

	# Start two frames up and keep walking outwards until a frame of
	# type "source" is found, i.e. a command read from a sourced file.
	# The frame lookups stay inline in this procedure on purpose:
	# relative frame levels depend on where info frame is called from.
	set frame_level -2
	while {[dict get [info frame $frame_level] type] ne "source"} {
		incr frame_level -1
	}
	set caller_frame [info frame $frame_level]

	# Replace the %{field} placeholders one at a time, left to right,
	# with a bare "%" and queue the matching value, so that what is left
	# is an ordinary format string plus its argument list. The match
	# positions are used for the replacement so the field text is never
	# interpreted as a regular expression.
	set format_args {}
	while {[regexp -indices {%\{([a-z]+)\}} $format_string field_span name_span]} {
		set format_field [string range $format_string {*}$field_span]
		set field_name [string range $format_string {*}$name_span]
		switch -- $field_name {
			message {
				lappend format_args $message
			}
			filename {
				lappend format_args [file tail [dict get $caller_frame file]]
			}
			lineno {
				lappend format_args [dict get $caller_frame line]
			}
			timestamp {
				lappend format_args $date_time
			}
			msecs - milliseconds {
				lappend format_args $milliseconds
			}
			loglevel - levelname {
				lappend format_args [string totitle $log_level]
			}
			default {
				fail "Invalid field ($format_field) specified in testsuite_log_format"
			}
		}
		set format_string [string replace $format_string {*}$field_span "%"]
	}

	set output ""
	if {$testsuite_colorize} {
		# Note that the "warning" level uses testsuite_color_warn
		switch -- $log_level {
			fatal   { append output $testsuite_color_fatal }
			error   { append output $testsuite_color_error }
			warning { append output $testsuite_color_warn }
			info    { append output $testsuite_color_info }
			debug   { append output $testsuite_color_debug }
			trace   { append output $testsuite_color_trace }
		}
	}
	append output [format $format_string {*}$format_args]
	if {$testsuite_colorize} {
		append output $COLOR_NONE
	}
	puts $output
}

################################################################
#
# NAME
#	_print_status_banner - prints a three line banner with the test name
#
# SYNOPSIS
#	_print_status_banner label color_var
#
# DESCRIPTION
#	Shared implementation of _print_header, _print_failure,
#	_print_skipped and _print_success. Prints a line of 78 "="
#	characters, the label padded to 9 columns followed by the test
#	name, and another line of 78 "=" characters.
#
# ARGUMENTS
#	label
#		The text printed in front of the test name, e.g. "TEST:"
#	color_var
#		The NAME of the global variable holding the color to use.
#		It is only read when testsuite_colorize is true.
#
# ENVIRONMENT
#	testsuite_colorize
#		Boolean that turns colorization on or off
#
################################################################

proc _print_status_banner { label color_var } {
	global test_name testsuite_colorize COLOR_NONE

	set rule [string repeat = 78]
	set banner ""
	if {$testsuite_colorize} {
		# Resolve the color by name only here, so the color variable
		# may stay undefined while colorization is turned off
		upvar #0 $color_var color
		append banner $color
	}
	append banner $rule \n [format "%-9s" $label] $test_name \n $rule
	if {$testsuite_colorize} {
		append banner $COLOR_NONE
	}
	puts $banner
}

################################################################
#
# NAME
#	_print_header - prints a test header
#
# SYNOPSIS
#	_print_header
#
# ENVIRONMENT
#	testsuite_colorize
#		Boolean that turns colorization on or off
#	testsuite_color_header
#		Can be set to define the color used for the header
#
################################################################

proc _print_header { } {
	_print_status_banner "TEST:" testsuite_color_header
}

################################################################
#
# NAME
#	_print_failure - prints FAILURE status with test name
#
# SYNOPSIS
#	_print_failure
#
# ENVIRONMENT
#	testsuite_colorize
#		Boolean that turns colorization on or off
#	testsuite_color_failure
#		Can be set to define the color used for this test status
#
################################################################

proc _print_failure { } {
	_print_status_banner "FAILURE:" testsuite_color_failure
}

################################################################
#
# NAME
#	_print_skipped - prints SKIPPED status with test name
#
# SYNOPSIS
#	_print_skipped
#
# ENVIRONMENT
#	testsuite_colorize
#		Boolean that turns colorization on or off
#	testsuite_color_skipped
#		Can be set to define the color used for this test status
#
################################################################

proc _print_skipped { } {
	_print_status_banner "SKIPPED:" testsuite_color_skipped
}

################################################################
#
# NAME
#	_print_success - prints SUCCESS status with test name
#
# SYNOPSIS
#	_print_success
#
# ENVIRONMENT
#	testsuite_colorize
#		Boolean that turns colorization on or off
#	testsuite_color_success
#		Can be set to define the color used for this test status
#
################################################################

proc _print_success { } {
	_print_status_banner "SUCCESS:" testsuite_color_success
}

################################################################
#
# NAME
#	_get_test_name - gets the name of the invoking source script
#
# SYNOPSIS
#	_get_test_name
#
# RETURN VALUE
#	The name of the originally called script, or "unknown" when no
#	frame of type "source" is found
#
################################################################

proc _get_test_name { } {
	set test_name unknown

	# Scan the frames from level 1 upwards and stop at the first
	# command that was read from a sourced file
	set frame_level 1
	while {$frame_level <= [info frame]} {
		set frame [info frame $frame_level]
		if {[dict get $frame type] eq "source"} {
			set test_name [file tail [dict get $frame file]]
			break
		}
		incr frame_level
	}

	return $test_name
}

################################################################
#
# NAME
#	_test_cleanup - performs the test cleanup
#
# SYNOPSIS
#	_test_cleanup
#
# DESCRIPTION
#	This procedure calls the test-defined cleanup procedure (if any)
#	and removes the temporary test_dir.
#
# RETURN VALUE
#	0 on success, 1 if the test-defined cleanup procedure raised an error
#
# NOTES
#	This function should be called only from _test_init and _exit.
#
################################################################

proc _test_cleanup {} {
	global test_dir

	set rc 0

	# Call global cleanup procedure if it is defined by the test
	if {[info procs cleanup] eq "cleanup"} {
		if {[catch {cleanup} cleanup_error]} {
			log_error "Cleanup had errors: $cleanup_error"
			set rc 1
		}
	}

	# Remove the temporary test directory and everything below it.
	# A directory that does not exist is not an error.
	file delete -force -- $test_dir

	return $rc
}

################################################################
#
# NAME
#	_test_init - performs test initialization
#
# SYNOPSIS
#	_test_init
#
# DESCRIPTION
#	This procedure is called automatically at the beginning of each test.
#	It sets test_name, test_id and test_dir, prints the header, runs
#	the cleanup and creates the temporary test dir.
#
################################################################

proc _test_init {} {
	global test_dir test_id test_name testsuite_shared_dir

	# Set test name to name of originally invoked test script, e.g. test1.1
	set test_name [_get_test_name]

	# Set test id to the test name with "test" removed, e.g. 1.1
	set test_id [string map {test ""} $test_name]

	# Temporary test directory used to stash saved configs, output files...
	set test_dir "$testsuite_shared_dir/${test_name}dir"

	# Print test header
	_print_header

	# Cleanup in case test was not cleaned up on last execution
	if {[_test_cleanup]} {
		fail "Error in the initial cleanup"
	}

	# Create temporary shared test directory (and any missing parents)
	file mkdir $test_dir
}

################################################################
#
# NAME
#	_test_fini - performs test finalization
#
# SYNOPSIS
#	_test_fini status
#
# DESCRIPTION
#	This procedure is called automatically from the ending functions
#	pass, skip and fail.
#	It cleans up based on the status and the testsuite_cleanup_on_failure
#	variable, prints the final test status/summary, and exits the test.
#
#	When status is zero, we print SUCCESS.
#	When status is negative, we print SKIPPED.
#	When status is positive, we print FAILURE.
#
#	We will always cleanup for SUCCESS or SKIPPED tests.
#	Whether or not the cleanup procedure is called for FAILURE tests
#	depends on the setting of the $testsuite_cleanup_on_failure variable
#	which can be set in the globals.local file or overridden with the
#	SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment variable.
#
################################################################

proc _test_fini { status } {
	global testsuite_cleanup_on_failure

	# Failed tests (status > 0) are only cleaned up if configured to
	# do so; the setting is not consulted for any other status
	if {$status <= 0 || $testsuite_cleanup_on_failure} {
		_test_cleanup
	}

	if {$status == 0} {
		_print_success
	} elseif {$status < 0} {
		_print_skipped
	} else {
		_print_failure
	}

	__exit $status
}

################################################################
#
# Overload the exit routine to ensure that no one is explicitly
# calling it, and to enforce _test_fini if exit is called when
# the test reaches its EOF.
#
# All tests should exit using pass, skip or fail.
#
################################################################
rename exit __exit

proc exit { {status 0} } {
	global test_name

	#
	# Minor sanity check: work out whether exit was explicitly called
	# (not allowed) or automatically executed when the test ends
	#
	set explicit_call 0
	if {[info level] > 1} {
		# exit was called from a function, and it shouldn't
		set explicit_call 1
	} else {
		# Scan the frames from level 1 upwards for the first command
		# read from a sourced file. If that file is the test script
		# itself, exit was called explicitly from the test.
		set frame_level 1
		while {$frame_level <= [info frame]} {
			set frame [info frame $frame_level]
			if {[dict get $frame type] eq "source"} {
				if {[file tail [dict get $frame file]] eq $test_name} {
					set explicit_call 1
				}
				break
			}
			incr frame_level
		}
	}

	if {$explicit_call} {
		log_error "Exit should not be directly called, use pass, skip or fail instead"
	}

	# Whether the call was explicit or implicit (the test reached its
	# end), finalization through _test_fini is always enforced
	_test_fini $status
}

# Call _test_init at the beginning of each test
_test_init