#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Validate proper GRES operation under heavy load (many jobs)
############################################################################
# Copyright (C) 2018 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
############################################################################
source ./globals

set exit_code     0
set file_in1      "test$test_id.input1"
set file_in2      "test$test_id.input2"
set file_out      "test$test_id.output"
set number_commas "\[0-9_,\]+"

if {[test_cons_tres]} {
	log_debug "Valid configuration, using select/cons_tres"
} else {
	skip "This test is only compatible with select/cons_tres"
}
if {[test_front_end]} {
	skip "This test is incompatible with front-end systems"
}

set nb_nodes [get_node_cnt_in_part]
if {$nb_nodes > 1} {
	set nb_nodes 2
}
set gpu_cnt [get_highest_gres_count $nb_nodes "gpu"]
if {$gpu_cnt < 0} {
	fail "Error getting GPU count"
}
if {$gpu_cnt < 1} {
	skip "This test requires 1 or more GPUs in the default partition"
}

set node_name [get_nodes_by_request "--gres=gpu:1 -n1 -t1"]
if {[llength $node_name] != 1} {
	skip "This test needs to be able to submit jobs with at least --gres=gpu:1"
}
set cpus_per_node    [get_node_param $node_name "CPUTot"]
set sockets_per_node [get_node_param $node_name "Sockets"]
set cpus_per_socket  [expr $cpus_per_node / $sockets_per_node]

log_debug "GPU count is $gpu_cnt"
log_debug "Sockets per node is $sockets_per_node"
log_debug "CPUs per socket is $cpus_per_socket"

set tot_cpus $cpus_per_node
set tot_gpus $gpu_cnt
if {$nb_nodes > 1} {
	incr tot_gpus $gpu_cnt
}
if {$tot_gpus > 32} {
	set tot_gpus 32
}

#
# Build input script files
#
make_bash_script $file_in1 "$srun -l -N \$SLURM_NNODES -n \$SLURM_NNODES $file_in2
$scontrol -dd show job \$SLURM_JOB_ID | grep gpu
sleep 5
exit 0"

make_bash_script $file_in2 "echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES"

#
# Submit jobs with various --gpus counts, up to 2 full nodes or 32 GPUs
#
for {set inx 1} {$inx <= $tot_gpus} {incr inx} {
	set job_id 0
	set ofile ${file_out}.${inx}
	exec $bin_rm -f $ofile
	if {$tot_gpus > $tot_cpus} {
		# Assumes no configured DefCPUsPerGPU
		spawn $sbatch --gpus=$inx -t1 -w $node_name \
			-J "test$test_id" --output=$ofile ./$file_in1
	} else {
		spawn $sbatch --cpus-per-gpu=1 --gpus=$inx -t1 \
			-w $node_name -J "test$test_id" \
			--output=$ofile ./$file_in1
	}
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sbatch not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$job_id == 0} {
		log_error "sbatch job submit failure"
		set exit_code 1
		set tot_gpus [expr $inx - 1]
		break
	}
}

#
# Check the output file contents to identify allocated GPUs
#
# Reset max_file_delay to avoid possibly huge delays here when tot_gpus is
# large. Perform a large delay at the start and small delays for each job's
# output.
#
set save_max_file_delay $max_file_delay
if {$tot_gpus > 4 && $max_file_delay > 30} {
	sleep [expr $max_file_delay - 20]
	set max_file_delay 20
}

for {set inx 1} {$inx <= $tot_gpus} {incr inx} {
	set ofile ${file_out}.${inx}
	if {[wait_for_file $ofile] != 0} {
		log_error "Output file $ofile not found"
		set exit_code 1
	} else {
		set match 0
		spawn $bin_cat $ofile
		expect {
			-re "CUDA_VISIBLE_DEVICES:($number_commas)" {
				incr match [cuda_count $expect_out(1,string)]
				exp_continue
			}
			eof {
				wait
			}
		}
		if {$match != $inx} {
			log_error "Bad GPU count in file $ofile ($match != $inx)"
			set exit_code 1
		} else {
			exec $bin_rm -f $ofile
		}
	}
}
set max_file_delay $save_max_file_delay

#
# Log any vestigial jobs and cancel them
#
if {$exit_code != 0} {
	spawn $squeue -tall -n "test$test_id"
	expect {
		timeout {
			log_error "squeue not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
}
exec $scancel -n "test$test_id"

if {$exit_code == 0} {
	exec $bin_rm -f $file_in1 $file_in2
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}