#!/bin/bash
# **************************************************************************
# Function: Wrapper that helps launching Intel MPI jobs within Slurm
#           using MICs in native mode.
#           mpiexec.hydra needs passwordless ssh access to all involved nodes
# Version: 0.4
#---------------------------------------------------------------------------
# 11.10.2013 Created by Chrysovalantis Paschoulas, Juelich Supercomputing Centre - Forschungszentrum Juelich
# Initial script by (C) Olli-Pekka Lehto - CSC IT Center for Science Ltd.
# **************************************************************************
#
# Overview of what the script does:
#   1. Parses the command line (-c/-m binaries, -x/-z ranks per node, -v,
#      --tv/--tvcli).
#   2. Fills in defaults for values that were neither given on the command
#      line nor inherited from the environment.
#   3. Asks Slurm (sinfo/scontrol) which nodes advertise a "mic" GRES and
#      expands the job's node list into host names and "<host>-micN" names.
#   4. Builds one mpiexec.hydra argument set per host / per MIC, separated
#      by ':'.
#   5. On the process with SLURM_PROCID 0 only, optionally prints a summary
#      and then runs the command.

# Usage message
USAGE="
USAGE
    $(basename "$0") [ [-h] | [-v] [-x <num> -c <binary>] [-z <num> -m <binary>] ]

OPTIONS
    -h          Print this message.
    -c <binary> Binary that will run on host nodes.
                If it is not set then only the MICs will be used.
    -m <binary> Binary that will run inside the MICs.
                If it is not set then only the hosts will be used.
    -x <num>    Number of tasks (MPI ranks) for the host nodes.
                Default value is 1.
    -z <num>    Number of tasks (MPI ranks) for the MICs.
                Default value is 1.
    -v          Show more info for this script.
    --tv        Run using TotalView (equivalent to export MPIEXEC_PREFIX=\"totalview -args\").
    --tvcli     Run using TotalView cli (equivalent to export MPIEXEC_PREFIX=\"totalviewcli -args\")

MORE INFO
    The user MUST export the following environment variables:
      MIC_NUM_PER_HOST      Number of MICs on each host that will be used by mpiexec.
                            Available options: 0, 1, 2. Default 2.
      OMP_NUM_THREADS       OpenMP threads number per task on hosts.
                            This MUST be exported when OpenMP is used!
      MIC_OMP_NUM_THREADS   OpenMP threads number per task on MICs.
                            If not defined then is set same as OMP_NUM_THREADS.

    Also the user MAY pass additional flags to mpiexec exporting the following env vars:
      MPIEXEC_PREFIX        Wrap the execution of mpiexec with another tool (e.g. totalview).
      MPIEXEC_FLAGS_HOST    Flags that will be passed to the hosts.
      MPIEXEC_FLAGS_MIC     Flags that will be passed to the MICs.
      --
      Examples:
        export MPIEXEC_PREFIX=\"totalview -args\"
        export MPIEXEC_PREFIX=\"totalviewcli -args\"
        export MPIEXEC_FLAGS_HOST=\"-env VAR VALUE\"
        export MPIEXEC_FLAGS_MIC=\"-envlist VAR1,VAR2\"

EXAMPLES
    Batch Script1 - Only hosts:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 4
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=0
    export OMP_NUM_THREADS=32

    mpirun-mic -x 1 -c ./impi_native_hybrid
    ---

    Batch Script2 - Only mics:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 4
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=2
    export MIC_OMP_NUM_THREADS=240

    mpirun-mic -z 1 -m ./impi_native_hybrid.mic
    ---

    Batch Script3 - Hosts and MICs:
    ---
    #!/bin/bash
    #SBATCH -J TestJobMICNativeHybrid
    #SBATCH -N 2
    #SBATCH -p q_mics
    #SBATCH -o TestJob-%j.out
    #SBATCH -e TestJob-%j.err
    #SBATCH --time=30

    module purge
    module load impi intel/13.1.3

    export MIC_NUM_PER_HOST=2
    export OMP_NUM_THREADS=2
    export MIC_OMP_NUM_THREADS=4

    mpirun-mic -v -x 16 -c ./impi_native_hybrid -z 60 -m ./impi_native_hybrid.mic
    ---
";

# Print an error message on stderr and abort with status 1.
die() {
  echo "$(basename "$0"): ERROR: $*" >&2
  exit 1
}

# check script arguments
if [ $# -lt 1 ] ; then
  echo "$USAGE" >&2
  exit 1
fi

# get script arguments
# The trailing "-:" in the option string makes getopts hand "--name" to the
# '-' branch with OPTARG=name, which is how the long options are emulated.
while getopts "vhc:m:x:z:-:" OPTION
do
  case "$OPTION" in
    c) HOST_BINARY=$OPTARG ;;
    h) echo "$USAGE"; exit 0 ;;
    m) MIC_BINARY=$OPTARG ;;
    v) MPIRUN_MIC_VERBOSE=1 ;;
    x) HOST_PPN=$OPTARG ;;
    z) MIC_PPN=$OPTARG ;;
    -)
      case "$OPTARG" in
        tv)    MPIEXEC_PREFIX="totalview -args" ;;
        tvcli) MPIEXEC_PREFIX="totalviewcli -args" ;;
        *)
          # Any other long option is an error (the previous '\?' pattern only
          # matched a literal '?', so typos were silently accepted).
          echo "$(basename "$0"): unknown option --$OPTARG" >&2
          echo "$USAGE" >&2
          exit 1
          ;;
      esac
      ;;
    *)
      # getopts has already printed its own diagnostic for the bad option.
      echo "$USAGE" >&2
      exit 1
      ;;
  esac
done

### prepare the environment

# If not under Slurm just run on the local system, but still we must be on a
# compute node (sinfo/scontrol are used below).
: "${SLURM_PROCID:=0}"
: "${SLURM_NODELIST:=$(hostname)}"

# give default values
: "${MIC_PPN:=1}"
: "${HOST_PPN:=1}"
: "${MIC_NUM_PER_HOST:=2}"

# We will use OMP_NUM_THREADS to decide if the user will run a Hybrid
# MPI+OpenMP job. Here set default value for MIC_OMP_NUM_THREADS.
if [[ -n "$OMP_NUM_THREADS" && -z "$MIC_OMP_NUM_THREADS" ]] ; then
  MIC_OMP_NUM_THREADS=$OMP_NUM_THREADS
fi

# check the important values
if [[ -z "$HOST_BINARY" && -z "$MIC_BINARY" ]] ; then
  echo "$USAGE" >&2
  exit 1
fi

# Reject values that would otherwise be silently ignored or produce a broken
# mpiexec command line.
case "$MIC_NUM_PER_HOST" in
  0|1|2) ;;
  *) die "MIC_NUM_PER_HOST must be 0, 1 or 2 (got '$MIC_NUM_PER_HOST')" ;;
esac
[[ "$HOST_PPN" =~ ^[1-9][0-9]*$ ]] \
  || die "-x expects a positive integer (got '$HOST_PPN')"
[[ "$MIC_PPN" =~ ^[1-9][0-9]*$ ]] \
  || die "-z expects a positive integer (got '$MIC_PPN')"

# create the command line
#MPI_EXEC=mpirun
MPI_EXEC=mpiexec.hydra
EXEC_ARGS=""

# create the list of the nodes that are configured to have MICs
# Only the GRES column (%G) is tested for "mic", so node names that happen to
# contain "mic" do not count. Multiple sinfo lines are joined with ',' into a
# single hostlist expression.
SLIST_HOSTS_WITH_MICS=$(sinfo -h -o "%N %G" \
  | awk '$2 ~ /mic/ { printf "%s%s", sep, $1; sep = "," }')

# Space-padded list of expanded host names, so that membership can be tested
# with an exact " name " match.
LLIST_HOSTS_WITH_MICS=" "
# Skip scontrol when nothing was found: without a hostlist argument it would
# fall back to the job's own node list and mark every node as a MIC host.
if [[ -n "$SLIST_HOSTS_WITH_MICS" ]] ; then
  for host in $(scontrol show hostname "$SLIST_HOSTS_WITH_MICS") ; do
    LLIST_HOSTS_WITH_MICS="${LLIST_HOSTS_WITH_MICS}${host} "
  done
fi

# create the lists of HOSTS AND MICS!
HOST_NODELIST=""
MIC_NODELIST=""
for host in $(scontrol show hostname "$SLURM_NODELIST") ; do
  # Exact-name match: "node1" must not match "node10".
  if [[ "$LLIST_HOSTS_WITH_MICS" == *" ${host} "* ]] ; then
    # MIC_NUM_PER_HOST was validated above to be 0, 1 or 2.
    for (( mic = 0; mic < MIC_NUM_PER_HOST; mic++ )) ; do
      MIC_NODELIST="${MIC_NODELIST} ${host}-mic${mic}"
    done
  fi
  HOST_NODELIST="${HOST_NODELIST} ${host}"
done

# create the arguments
# NOTE(review): every argument set, including the first one, is prefixed with
# ' : ' exactly as in the original script - confirm before changing this.

# args for hosts here
# run job on hosts if host binary is not null
if [[ -n "$HOST_BINARY" ]] ; then
  HOST_OMP_ENV=""
  if [[ -n "$OMP_NUM_THREADS" ]] ; then
    # with OpenMP
    HOST_OMP_ENV="-env OMP_NUM_THREADS $OMP_NUM_THREADS "
  fi
  for n in $HOST_NODELIST ; do
    EXEC_ARGS="${EXEC_ARGS} : ${HOST_OMP_ENV}$MPIEXEC_FLAGS_HOST -n $HOST_PPN -host $n $HOST_BINARY"
  done
fi

# args for mics here
# run job on mics if mic binary is not null and MIC_NUM_PER_HOST is 1 or 2
# (previously only the MIC node list was checked, so a host-only run with the
# default MIC_NUM_PER_HOST produced MIC argument sets without an executable)
if [[ -n "$MIC_BINARY" && -n "$MIC_NODELIST" ]] ; then
  MIC_OMP_ENV=""
  if [[ -n "$MIC_OMP_NUM_THREADS" ]] ; then
    # with OpenMP
    MIC_OMP_ENV="-env OMP_NUM_THREADS $MIC_OMP_NUM_THREADS "
  fi
  for n in $MIC_NODELIST ; do
    # LD_LIBRARY_PATH for the MIC ranks is MIC_LD_LIBRARY_PATH followed by the
    # caller's LD_LIBRARY_PATH.
    EXEC_ARGS="${EXEC_ARGS} : ${MIC_OMP_ENV}-env LD_LIBRARY_PATH $MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH $MPIEXEC_FLAGS_MIC -n $MIC_PPN -host $n $MIC_BINARY"
  done
fi

# Refuse to start mpiexec with nothing to run (e.g. only -m was given but no
# allocated node has MICs, or MIC_NUM_PER_HOST is 0).
if [[ -z "$EXEC_ARGS" ]] ; then
  die "nothing to launch: no host or MIC ranks could be configured"
fi

RUNCMD="$MPI_EXEC $EXEC_ARGS"
if [[ -n "$MPIEXEC_PREFIX" ]] ; then
  RUNCMD="$MPIEXEC_PREFIX $RUNCMD"
fi

# extra important env (local-system dependent)
#export LD_LIBRARY_PATH="$MIC_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
export I_MPI_MIC=1
export I_MPI_DAPL_PROVIDER_LIST=ofa-v2-mlx4_0-1
unset I_MPI_DEVICE
unset I_MPI_PMI_LIBRARY

# start the job
# Only the process with SLURM_PROCID 0 launches mpiexec; any other process
# falls through and exits without doing anything.
if [ "$SLURM_PROCID" -eq 0 ] ; then
  if [[ -n "$MPIRUN_MIC_VERBOSE" ]] ; then
    echo
    echo "########################################################################"
    echo "MPI Tasks per host: $HOST_PPN"
    echo "Threads per host MPI task: $OMP_NUM_THREADS"
    echo "Binary for the hosts: $HOST_BINARY"
    echo "MPI Tasks per MIC: $MIC_PPN"
    echo "Threads per MIC MPI task: $MIC_OMP_NUM_THREADS"
    echo "Binary for the mics: $MIC_BINARY"
    echo "MIC_NUM_PER_HOST: $MIC_NUM_PER_HOST"
    echo
    echo "MPIEXEC_PREFIX: $MPIEXEC_PREFIX"
    echo "MPIEXEC_FLAGS_HOST: $MPIEXEC_FLAGS_HOST"
    echo "MPIEXEC_FLAGS_MIC: $MPIEXEC_FLAGS_MIC"
    echo ""
    echo "Run command: "
    echo "$RUNCMD"
    echo "########################################################################"
    echo
  fi

  # Intentionally unquoted: the prefix, the user-supplied flag strings and the
  # binaries (which may carry their own arguments) rely on word splitting.
  # shellcheck disable=SC2086
  $RUNCMD
fi