/*****************************************************************************\
 *  slurm_mpi.c - Generic mpi selector for slurm
 *****************************************************************************
 *  Copyright (C) 2002 The Regents of the University of California.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Mark Grondona.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include "config.h"

#include <stdlib.h>
#include <unistd.h>

#include "src/common/env.h"
#include "src/common/macros.h"
#include "src/common/plugin.h"
#include "src/common/plugrack.h"
#include "src/common/slurm_mpi.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

/* Set to 1 for hook tracing, >1 to additionally dump the environment. */
#define _DEBUG 0

/*
 * Table of entry points resolved from the loaded MPI plugin.
 * Each member is filled from the symbol at the same index in syms[].
 */
typedef struct slurm_mpi_ops {
	int (*slurmstepd_prefork)(const stepd_step_rec_t *job, char ***env);
	int (*slurmstepd_init)(const mpi_plugin_task_info_t *job,
			       char ***env);
	mpi_plugin_client_state_t *
	    (*client_prelaunch)(const mpi_plugin_client_info_t *job,
				char ***env);
	int (*client_fini)(mpi_plugin_client_state_t *);
} slurm_mpi_ops_t;

/*
 * These strings must be kept in the same order as the fields
 * declared for slurm_mpi_ops_t.
 */
static const char *syms[] = {
	"p_mpi_hook_slurmstepd_prefork",
	"p_mpi_hook_slurmstepd_task",
	"p_mpi_hook_client_prelaunch",
	"p_mpi_hook_client_fini"
};

static slurm_mpi_ops_t ops;
static plugin_context_t *g_context = NULL;
/* Serializes creation and destruction of g_context. */
static pthread_mutex_t context_lock = PTHREAD_MUTEX_INITIALIZER;
static bool init_run = false;

#if _DEBUG
/* Debugging information is invaluable to debug heterogeneous step support */

/* Log every entry of a NULL-terminated environment (only if _DEBUG > 1). */
static inline void _log_env(char **env)
{
#if _DEBUG > 1
	int i;

	if (!env)
		return;

	for (i = 0; env[i]; i++)
		info("%s", env[i]);
#endif
}

/* Log the fields of a slurmstepd step record that the MPI hooks receive. */
static void _log_step_rec(const stepd_step_rec_t *job)
{
	int i;

	info("STEPD_STEP_REC");
	info("job_id:%u step_id:%u", job->jobid, job->stepid);
	info("ntasks:%u nnodes:%u node_id:%u", job->ntasks, job->nnodes,
	     job->nodeid);
	info("node_tasks:%u", job->node_tasks);
	for (i = 0; i < job->node_tasks; i++)
		info("gtid[%d]:%u", i, job->task[i]->gtid);
	for (i = 0; i < job->nnodes; i++)
		info("task_cnts[%d]:%u", i, job->task_cnts[i]);

	if ((job->het_job_id != 0) && (job->het_job_id != NO_VAL)) {
		/* Same field as the one tested in the condition above. */
		info("het_job_id:%u step_id:%u", job->het_job_id, job->stepid);
		info("het_job_ntasks:%u het_job_nnodes:%u", job->het_job_ntasks,
		     job->het_job_nnodes);
		/*
		 * NOTE(review): the label says "node_offset" but the value
		 * logged is het_job_offset -- confirm the intended field.
		 */
		info("het_job_node_offset:%u het_job_task_offset:%u",
		     job->het_job_offset, job->het_job_task_offset);
		for (i = 0; i < job->het_job_nnodes; i++)
			info("het_job_task_cnts[%d]:%u", i,
			     job->het_job_task_cnts[i]);
		info("het_job_node_list:%s", job->het_job_node_list);
	}
}

/* Log the client-side job info, including the step layout when present. */
static void _log_mpi_rec(const mpi_plugin_client_info_t *job)
{
	slurm_step_layout_t *layout = job->step_layout;
	int i, j;

	info("MPI_PLUGIN_CLIENT_INFO");
	info("job_id:%u step_id:%u", job->jobid, job->stepid);
	if ((job->het_job_id != 0) && (job->het_job_id != NO_VAL)) {
		info("het_job_id:%u step_id:%u", job->het_job_id, job->stepid);
	}
	if (layout) {
		info("node_cnt:%u task_cnt:%u", layout->node_cnt,
		     layout->task_cnt);
		info("node_list:%s", layout->node_list);
		info("plane_size:%u task_dist:%u", layout->plane_size,
		     layout->task_dist);
		for (i = 0; i < layout->node_cnt; i++) {
			info("tasks[%d]:%u", i, layout->tasks[i]);
			for (j = 0; j < layout->tasks[i]; j++) {
				info("tids[%d][%d]:%u", i, j,
				     layout->tids[i][j]);
			}
		}
	}
}

/* Log the per-task info handed to the slurmstepd task hook. */
static void _log_task_rec(const mpi_plugin_task_info_t *job)
{
	info("MPI_PLUGIN_TASK_INFO");
	info("job_id:%u step_id:%u", job->jobid, job->stepid);
	info("nnodes:%u node_id:%u", job->nnodes, job->nodeid);
	info("ntasks:%u local_tasks:%u", job->ntasks, job->ltasks);
	info("global_task_id:%u local_task_id:%u", job->gtaskid,
	     job->ltaskid);
}
#endif

/*
 * Create the MPI plugin context if it does not exist yet.
 *
 * IN mpi_type - plugin name without the "mpi/" prefix, or NULL to use the
 *	value returned by slurm_get_mpi_default(). "openmpi" is translated to
 *	"none". "list" prints the available plugins and exits the process.
 * RET SLURM_SUCCESS if a context exists on return, SLURM_ERROR otherwise.
 *
 * If a context already exists, mpi_type is ignored and SLURM_SUCCESS is
 * returned. On first successful creation SLURM_MPI_TYPE is exported into
 * the process environment.
 */
int _mpi_init (char *mpi_type)
{
	int retval = SLURM_SUCCESS;
	char *plugin_type = "mpi";
	char *type = NULL;
	int got_default = 0;

	/* Fast path taken without the lock once initialization completed. */
	if (init_run && g_context)
		return retval;

	slurm_mutex_lock(&context_lock);

	if (g_context)
		goto done;

	if (mpi_type == NULL) {
		mpi_type = slurm_get_mpi_default();
		/* Remember that mpi_type is ours to xfree() at "done". */
		got_default = 1;
	} else if (!xstrcmp(mpi_type, "openmpi")) {
		/*
		 * The openmpi plugin has been equivalent to none for a while.
		 * Translate so we can discard that duplicated no-op plugin.
		 */
		mpi_type = "none";
	}

	if (mpi_type == NULL) {
		error("No MPI default set.");
		retval = SLURM_ERROR;
		goto done;
	}

	if (!xstrcmp(mpi_type, "list")) {
		char *plugin_dir;
		plugrack_t *mpi_rack = plugrack_create("mpi");
		plugin_dir = slurm_get_plugin_dir();
		plugrack_read_dir(mpi_rack, plugin_dir);
		plugrack_print_all_plugin(mpi_rack);
		/* Listing is terminal: nothing is released before exiting. */
		exit(0);
	}

	setenvf(NULL, "SLURM_MPI_TYPE", "%s", mpi_type);

	type = xstrdup_printf("mpi/%s", mpi_type);

	g_context = plugin_context_create(
		plugin_type, type, (void **)&ops, syms, sizeof(syms));

	if (!g_context) {
		error("cannot create %s context for %s", plugin_type, type);
		retval = SLURM_ERROR;
		goto done;
	}
	init_run = true;

done:
	xfree(type);
	if (got_default)
		xfree(mpi_type);
	slurm_mutex_unlock(&context_lock);
	return retval;
}

/*
 * Initialize the MPI plugin named by SLURM_MPI_TYPE in the given environment.
 *
 * IN/OUT env - environment to read SLURM_MPI_TYPE from; the variable is
 *	removed from it when its value is "none".
 * RET SLURM_SUCCESS or SLURM_ERROR if the plugin could not be initialized.
 */
int mpi_hook_slurmstepd_init (char ***env)
{
	char *mpi_type = getenvp(*env, "SLURM_MPI_TYPE");

#if _DEBUG
	info("IN %s mpi_type:%s", __func__, mpi_type);
	_log_env(*env);
#else
	debug("mpi type = %s", mpi_type);
#endif

	if (_mpi_init(mpi_type) == SLURM_ERROR)
		return SLURM_ERROR;

	/*
	 * Unset env var so that "none" doesn't exist in salloc'ed env, but
	 * still keep it in srun if not none.
	 */
	if (!xstrcmp(mpi_type, "none"))
		unsetenvp(*env, "SLURM_MPI_TYPE");

	return SLURM_SUCCESS;
}

/*
 * Initialize the plugin from env, then run its slurmstepd prefork hook.
 *
 * RET SLURM_ERROR if initialization fails, otherwise the plugin hook's
 *	return value.
 */
int mpi_hook_slurmstepd_prefork (const stepd_step_rec_t *job, char ***env)
{
#if _DEBUG
	info("IN %s", __func__);
	_log_env(*env);
	_log_step_rec(job);
#endif

	if (mpi_hook_slurmstepd_init(env) == SLURM_ERROR)
		return SLURM_ERROR;

	return (*(ops.slurmstepd_prefork))(job, env);
}

/*
 * Initialize the plugin from env, then run its slurmstepd per-task hook.
 *
 * RET SLURM_ERROR if initialization fails, otherwise the plugin hook's
 *	return value.
 */
int mpi_hook_slurmstepd_task (const mpi_plugin_task_info_t *job, char ***env)
{
#if _DEBUG
	info("IN %s", __func__);
	_log_task_rec(job);
	_log_env(*env);
#endif

	if (mpi_hook_slurmstepd_init(env) == SLURM_ERROR)
		return SLURM_ERROR;

	return (*(ops.slurmstepd_init))(job, env);
}

/*
 * Client-side initialization of the MPI plugin.
 *
 * IN mpi_type - plugin name, or NULL for the configured default
 *	(see _mpi_init()).
 * RET SLURM_SUCCESS or SLURM_ERROR.
 */
int mpi_hook_client_init (char *mpi_type)
{
#if _DEBUG
	info("IN %s mpi_type:%s", __func__, mpi_type);
#else
	debug("mpi type = %s", mpi_type);
#endif

	if (_mpi_init(mpi_type) == SLURM_ERROR)
		return SLURM_ERROR;

	return SLURM_SUCCESS;
}

/*
 * Run the plugin's client prelaunch hook, initializing the plugin with the
 * default type first if no context exists yet.
 *
 * RET the state returned by the plugin hook, or NULL if initialization
 *	failed.
 */
mpi_plugin_client_state_t *
mpi_hook_client_prelaunch(const mpi_plugin_client_info_t *job, char ***env)
{
	mpi_plugin_client_state_t *rc;
#if _DEBUG
	info("IN %s", __func__);
	_log_env(*env);
	_log_mpi_rec(job);
#endif

	if (_mpi_init(NULL) < 0)
		return NULL;

	rc = (*(ops.client_prelaunch))(job, env);
#if _DEBUG
	_log_env(*env);
#endif
	return rc;
}

/*
 * Run the plugin's client fini hook on the state returned by
 * mpi_hook_client_prelaunch().
 *
 * RET SLURM_ERROR if the plugin could not be initialized, otherwise the
 *	plugin hook's return value.
 */
int mpi_hook_client_fini (mpi_plugin_client_state_t *state)
{
#if _DEBUG
	info("IN %s", __func__);
#endif

	if (_mpi_init(NULL) < 0)
		return SLURM_ERROR;

	return (*(ops.client_fini))(state);
}

/*
 * Destroy the MPI plugin context, if any.
 *
 * RET SLURM_SUCCESS when there is no context, otherwise the return value of
 *	plugin_context_destroy().
 */
int mpi_fini (void)
{
	int rc = SLURM_SUCCESS;

	/* Same lock as _mpi_init() so teardown cannot race with creation. */
	slurm_mutex_lock(&context_lock);
	if (g_context) {
		init_run = false;
		rc = plugin_context_destroy(g_context);
		/*
		 * Forget the destroyed context: otherwise _mpi_init() would
		 * see a non-NULL g_context and report success without
		 * reloading, and a second mpi_fini() would destroy it again.
		 */
		g_context = NULL;
	}
	slurm_mutex_unlock(&context_lock);

	return rc;
}