#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test of CPU affinity support for multi-core systems.
############################################################################
# Copyright (C) 2005-2007 The Regents of the University of California.
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

set exit_code   0
set file_prog   "test$test_id.prog"
set prompt      "PROMPT:"

#
# Return the number of bits set in a non-negative integer mask.
# Used to verify how many CPUs each reported affinity mask covers.
#
proc mask_bit_count { mask } {
	set bits 0
	while {$mask > 0} {
		if {$mask & 1} {
			incr bits
		}
		set mask [expr {$mask >> 1}]
	}
	return $bits
}

if {[test_select_type_params "CR_ONE_TASK_PER_CORE"]} {
	skip "This test is incompatible SelectTypeParameters=CR_ONE_TASK_PER_CORE"
}

#
# Test if CPU affinity support is supported.
#
if {![test_cpu_affinity]} {
	skip "CPU affinity not supported on this system"
}
log_debug "Task affinity plugin installed"

#
# Skip when the task plugin parameters mention boards, sockets or cores.
# NOTE: "string first" takes the needle first and the haystack second, so
# the keyword is searched for inside the (lower-cased) parameter string.
#
set task_params [string tolower [get_affinity_params]]
if {[string first "boards"  $task_params] != -1 ||
    [string first "sockets" $task_params] != -1 ||
    [string first "cores"   $task_params] != -1} {
	skip "TaskPluginParams=$task_params not supported"
}

#
# Skip when the default partition reports OverSubscribe=FORCE
#
set force 0
log_user 0
spawn $scontrol show partition [default_partition]
expect {
	-re "OverSubscribe=FORCE" {
		set force 1
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
if {$force == 1} {
	skip "This test is not compatible with OverSubscribe=FORCE"
}

# Identify a usable node
set timeout $max_job_delay
set node_name ""
set srun_pid [spawn $srun -N1 --exclusive --verbose $bin_printenv SLURMD_NODENAME]
expect {
	-re "on host ($re_word_str)" {
		set node_name $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "srun not responding"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {[string compare $node_name ""] == 0} {
	fail "Failed to get a usable node name"
}

# Determine how many cpus, sockets, cores, and threads the node has
set num_cputot  0
set num_sockets 0
set num_cores   0
set num_threads 0

log_user 0
spawn $scontrol show node $node_name
expect {
	-re "CoresPerSocket=($number)" {
		set num_cores $expect_out(1,string)
		exp_continue
	}
	-re "CPUTot=($number)" {
		set num_cputot $expect_out(1,string)
		exp_continue
	}
	-re "Sockets=($number)" {
		set num_sockets $expect_out(1,string)
		exp_continue
	}
	-re "ThreadsPerCore=($number)" {
		set num_threads $expect_out(1,string)
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1

if {$num_cputot == 0 || $num_sockets == 0 || $num_cores == 0 || $num_threads == 0} {
	skip "Could not determine number of CPUs:Sockets:Cores:Threads (saw $num_cputot:$num_sockets:$num_cores:$num_threads)"
}

# Intel KNL nodes with 272 threads generate huge numeric masks
# which are too large for expect integer values to manage
set total_thread_count [expr {$num_sockets * $num_cores * $num_threads}]
if {$total_thread_count > 64} {
	skip "Total thread count too large for Expect to process ($total_thread_count > 64). Expect unable to work with more than 32-bit numbers"
}

log_debug "Node config: CPUs=$num_cputot Sockets=$num_sockets Cores=$num_cores Threads=$num_threads"
if {$num_cputot < $total_thread_count} {
	skip "CPUs=$num_cputot < (Sockets($num_sockets)*Cores($num_cores)*Threads($num_threads)). Test incompatible with lowered CPUs count"
}

#
# Build a test program to report affinity by task
#
exec $bin_rm -f $file_prog
exec $bin_cc -I$build_dir $file_prog.c -o $file_prog
exec $bin_chmod 700 $file_prog

#
# Create an allocation
#
global env
set env(SLURM_CPU_BIND) "verbose"
set salloc_pid [spawn $salloc -w $node_name -N1 --exclusive --verbose -t2 $bin_bash]
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		send "export PS1=\"$prompt\"\r"
		exp_continue
	}
	-re "export PS1=\"$prompt\"\r" {
		exp_continue
	}
	timeout {
		slow_kill $salloc_pid
		fail "salloc not responding or failure to recognize prompt"
	}
	-re $prompt {
	}
}

# Drain any further prompt output before sending commands
set timeout 1
expect {
	-re $prompt {
		exp_continue
	}
	timeout {
	}
}
set timeout 30

#
# Reading a second prompt is required by some versions of Expect
#
set timeout 1
expect {
	-re $prompt {
		exp_continue
	}
	timeout {
	}
}
set timeout 30

#############################################################################
#
# Run a job step to get allocated processor count and affinity
#
set mask 0
set task_cnt 0
send "$srun -c1 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_cnt
		set mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		log_error "Some error occurred"
		set exit_code 1
		exp_continue
	}
	timeout {
		slow_kill $salloc_pid
		fail "salloc not responding or failure to recognize prompt"
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue;
	}
	-re $prompt
}

# Every later check is sized by task_cnt; with zero tasks the mask checks
# would compare 0 against 0 and the cpus-per-task loop would never run,
# so treat a silent first step as an error rather than a vacuous pass.
if {$task_cnt == 0} {
	log_error "No task affinity records were reported by the initial job step"
	set exit_code 1
}

#############################################################################
#
# Run a job step with affinity to verify unique masks with min -B 1:1:1
#
# The sum of the per-task masks must equal a mask with the low task_cnt
# bits set.
set expected_mask [expr {(1 << $task_cnt) - 1}]
set task_mask 0
send "$srun -c1 -n $task_cnt -B 1:1:1 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		log_error "Some error occurred"
		set exit_code 1
		exp_continue
	}
	timeout {
		log_error "srun (from --allocate) not responding or failure to recognize prompt"
		set exit_code 1
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue;
	}
	-re $prompt
}
if {$task_mask != $expected_mask} {
	log_error "Affinity mask inconsistency ($task_mask,$expected_mask)"
	set exit_code 1
}

#############################################################################
#
# Run varying number of sockets, verify task count and number of set bits
#
set this_cnt 1
while {$this_cnt <= $num_sockets} {
	set expected_tasks [expr {$this_cnt * $num_cores * $num_threads}]
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -B $this_cnt-$this_cnt:$num_cores:$num_threads ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1
			# count number of set bits
			incr num_bits [mask_bit_count $expect_out(2,string)]
			exp_continue
		}
		-re "error" {
			log_error "Some error occurred"
			set exit_code 1
			exp_continue
		}
		timeout {
			log_error "srun (from --allocate) not responding or failure to recognize prompt"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue;
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		log_error "Number of tasks inconsistent ($num_tasks,$expected_tasks)"
		set exit_code 1
	}
	if {$num_bits != $expected_tasks} {
		log_error "Number of set bits inconsistent ($num_bits,$expected_tasks)"
		set exit_code 1
	}
	incr this_cnt 1
}

#############################################################################
#
# Run varying number of cores, verify task count and number of set bits
#
set this_cnt 1
while {$this_cnt <= $num_cores} {
	set expected_tasks [expr {$num_sockets * $this_cnt * $num_threads}]
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -B $num_sockets:$this_cnt-$this_cnt:$num_threads ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1
			# count number of set bits
			incr num_bits [mask_bit_count $expect_out(2,string)]
			exp_continue
		}
		-re "error" {
			log_error "Some error occurred"
			set exit_code 1
			exp_continue
		}
		timeout {
			log_error "salloc not responding or failure to recognize prompt"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue;
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		log_error "Number of tasks inconsistent ($num_tasks,$expected_tasks)"
		set exit_code 1
	}
	if {$num_bits != $expected_tasks} {
		log_error "Number of set bits inconsistent ($num_bits,$expected_tasks)"
		set exit_code 1
	}
	incr this_cnt 1
}

#############################################################################
#
# Run varying number of threads, verify task count and number of set bits
#
set this_cnt 1
while {$this_cnt <= $num_threads} {
	set expected_tasks [expr {$num_sockets * $num_cores * $this_cnt}]
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -B $num_sockets:$num_cores:$this_cnt-$this_cnt ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1
			# count number of set bits
			incr num_bits [mask_bit_count $expect_out(2,string)]
			exp_continue
		}
		-re "error" {
			log_error "Some error occurred"
			set exit_code 1
			exp_continue
		}
		timeout {
			log_error "salloc not responding or failure to recognize prompt"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue;
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		log_error "Number of tasks inconsistent ($num_tasks,$expected_tasks)"
		set exit_code 1
	}
	if {$num_bits != $expected_tasks} {
		log_error "Number of set bits inconsistent ($num_bits,$expected_tasks)"
		set exit_code 1
	}
	incr this_cnt 1
}

#############################################################################
#
# Run varying cpus per task, verify task count and number of set bits
#
# A single task is expected per step, bound to this_cnt CPUs.
set this_cnt 1
while {$this_cnt <= $task_cnt} {
	set expected_tasks 1
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -c$this_cnt -B 1:1:1 ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks 1
			# count number of set bits
			incr num_bits [mask_bit_count $expect_out(2,string)]
			exp_continue
		}
		-re "error" {
			log_error "Some error occurred"
			set exit_code 1
			exp_continue
		}
		timeout {
			log_error "salloc not responding or failure to recognize prompt"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue;
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		log_error "Number of tasks inconsistent ($num_tasks,$expected_tasks)"
		set exit_code 1
	}
	if {$num_bits != $this_cnt} {
		log_error "Number of set bits inconsistent ($num_bits,$this_cnt)"
		set exit_code 1
	}
	incr this_cnt 1
}

#############################################################################
#
# Run a job step with plane distribution to exercise option
# Automatic binding in slurm version 2.0 will bind one task per core
#
set expected_mask [expr {(1 << $task_cnt) - 1}]
set task_mask 0
send "$srun -n $task_cnt -m plane=4 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		log_error "Some error occurred"
		set exit_code 1
		exp_continue
	}
	timeout {
		log_error "salloc not responding or failure to recognize prompt"
		set exit_code 1
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue;
	}
	-re $prompt
}
if {$task_mask != $expected_mask} {
	log_error "Affinity mask inconsistency ($task_mask,$expected_mask)"
	set exit_code 1
}

#############################################################################
#
# Terminate the job, free the allocation
#
send "exit\r"
expect {
	-re "error" {
		log_error "Some error occurred"
		set exit_code 1
	}
	timeout {
		log_error "salloc not responding or failure to recognize prompt"
		slow_kill $salloc_pid
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Remove the compiled program on success (and when configuration overrides
# are in effect); otherwise leave it in place and report the failure.
#
if {$exit_code == 0} {
	exec $bin_rm -f $file_prog
} elseif { [test_config_overrides] == 1 } {
	exec $bin_rm -f $file_prog
	fail "This test can fail if the node configuration in slurm.conf (sockets, cores, threads) differs from the actual configuration or if using task/cgroup without task/affinity"
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}