/*****************************************************************************\
 *  proctrack_cray_aries.c - process tracking via Cray's API with Aries.
 *****************************************************************************
 *  Copyright (C) 2013 SchedMD LLC
 *  Written by Danny Auble who borrowed heavily from
 *  the proctrack/sgi_job plugin
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/ #include "config.h" #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include /* Cray's job module component */ #include "slurm/slurm.h" #include "slurm/slurm_errno.h" #include "src/common/log.h" #include "src/common/macros.h" #include "src/common/timers.h" #include "src/slurmd/common/proctrack.h" #include "src/slurmd/slurmstepd/slurmstepd_job.h" const char plugin_name[] = "Process tracking via Cray/Aries job module"; const char plugin_type[] = "proctrack/cray_aries"; const uint32_t plugin_version = SLURM_VERSION_NUMBER; /* * Handle to libjob.so */ static void *libjob_handle = NULL; static pthread_t threadid = 0; static pthread_cond_t notify = PTHREAD_COND_INITIALIZER; static pthread_mutex_t notify_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t thread_mutex = PTHREAD_MUTEX_INITIALIZER; static uint64_t debug_flags = 0; extern bool proctrack_p_has_pid (uint64_t cont_id, pid_t pid); static void *_create_container_thread(void *args) { stepd_step_rec_t *job = (stepd_step_rec_t *)args; job->cont_id = (uint64_t)job_create(0, job->uid, 0); /* Signal the container_create we are done */ slurm_mutex_lock(¬ify_mutex); /* We need to signal failure or not */ slurm_cond_signal(¬ify); /* * Don't unlock the notify_mutex here, wait, it is not needed * and can cause deadlock if done. */ if (job->cont_id == (jid_t)-1) { error("Failed to create job container: %m"); } else { /* * Wait around for something else to be added and then exit * when that takes place. 
*/ slurm_cond_wait(¬ify, ¬ify_mutex); } slurm_mutex_unlock(¬ify_mutex); return NULL; } static void _end_container_thread(void) { if (threadid) { /* This will end the thread and remove it from the container */ slurm_mutex_lock(&thread_mutex); slurm_mutex_lock(¬ify_mutex); slurm_cond_signal(¬ify); slurm_mutex_unlock(¬ify_mutex); pthread_join(threadid, NULL); threadid = 0; slurm_mutex_unlock(&thread_mutex); } } /* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ extern int init(void) { debug_flags = slurm_get_debug_flags(); debug("%s loaded", plugin_name); return SLURM_SUCCESS; } extern int fini(void) { _end_container_thread(); /* free up some memory */ slurm_mutex_destroy(¬ify_mutex); slurm_cond_destroy(¬ify); slurm_mutex_destroy(&thread_mutex); return SLURM_SUCCESS; } extern int proctrack_p_create(stepd_step_rec_t *job) { DEF_TIMERS; START_TIMER; if (!libjob_handle) init(); if (!job->cont_id) { /* * If we are forked then we can just use the pid from the fork * instead of using the thread method below. */ if (proctrack_forked) { job->cont_id = (uint64_t)job_create(0, job->uid, 0); goto endit; } /* * Since the cray job lib will create the container off the * process calling job_create we don't want to call it from * the main process since it will include all the threads * the main process spawns and there is no way to safely track * which pids need to be removed when removing the parent. * It turns out spawning a thread will make the job_create * create the container off that process instead of the main * process. Once we have added a process we can end the * thread which will remove the pid from the container * automatically. Empty containers are not valid. 
*/ slurm_mutex_lock(&thread_mutex); if (threadid) { debug("Had a thread already 0x%08lx", threadid); slurm_mutex_lock(¬ify_mutex); slurm_cond_wait(¬ify, ¬ify_mutex); slurm_mutex_unlock(¬ify_mutex); debug("Last thread done 0x%08lx", threadid); } /* * We have to lock the notify_mutex here since the * thread could possibly signal things before we * started waiting for it. */ slurm_mutex_lock(¬ify_mutex); slurm_thread_create(&threadid, _create_container_thread, job); slurm_cond_wait(¬ify, ¬ify_mutex); slurm_mutex_unlock(¬ify_mutex); slurm_mutex_unlock(&thread_mutex); if (job->cont_id != (jid_t)-1) debug("proctrack_p_create: created jid 0x%08lx thread 0x%08lx", job->cont_id, threadid); } else error("proctrack_p_create: already have a cont_id"); endit: END_TIMER; if (debug_flags & DEBUG_FLAG_TIME_CRAY) INFO_LINE("call took: %s", TIME_STR); return SLURM_SUCCESS; } /* * NOTE: This function is called after slurmstepd spawns all user tasks. * Since the slurmstepd was placed in the job container when the container * was created and all of it's spawned tasks are placed into the container * when forked, all we need to do is remove the slurmstepd from the container * (once) at this time. */ int proctrack_p_add(stepd_step_rec_t *job, pid_t pid) { #ifdef HAVE_NATIVE_CRAY char fname[64]; int fd; uint32_t jobid; #endif int count = 0; DEF_TIMERS; START_TIMER; try_again: /* * If we aren't forked (pid was added in the job_create() call) this is * the time to add the pid to the job container. */ if (!proctrack_forked && job_attachpid(pid, job->cont_id) == (jid_t) -1) { if (errno == EINVAL && (count < 1)) { jid_t jid; if (proctrack_p_has_pid(job->cont_id, pid)) { debug("%s: Trying to add pid (%d) again to the same container, ignoring.", __func__, pid); return SLURM_SUCCESS; } if ((jid = job_detachpid(pid)) != (jid_t) -1) { error("%s: Pid %d was attached to container %"PRIu64" incorrectly. 
Moving to correct (%"PRIu64").", __func__, pid, jid, job->cont_id); count++; goto try_again; } else { error("%s: Couldn't detach pid %d from container: %m", __func__, pid); return SLURM_ERROR; } } else { error("Failed to attach pid %d to job container: %m", pid); return SLURM_ERROR; } } _end_container_thread(); #ifdef HAVE_NATIVE_CRAY // Set apid for this pid if (job->het_job_id && (job->het_job_id != NO_VAL)) jobid = job->het_job_id; else jobid = job->jobid; if (job_setapid(pid, SLURM_ID_HASH(jobid, job->stepid)) == -1) { error("Failed to set pid %d apid: %m", pid); return SLURM_ERROR; } // Explicitly mark pid as an application (/proc//task_is_app) snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid); fd = open(fname, O_WRONLY); if (fd == -1) { error("Failed to open %s: %m", fname); return SLURM_ERROR; } if (write(fd, "1", 1) < 1) { error("Failed to write to %s: %m", fname); TEMP_FAILURE_RETRY(close(fd)); return SLURM_ERROR; } TEMP_FAILURE_RETRY(close(fd)); #endif END_TIMER; if (debug_flags & DEBUG_FLAG_TIME_CRAY) INFO_LINE("call took: %s", TIME_STR); return SLURM_SUCCESS; } int proctrack_p_signal(uint64_t id, int sig) { DEF_TIMERS; START_TIMER; if (!threadid) { if ((job_killjid((jid_t) id, sig) < 0) && (errno != ENODATA) && (errno != EBADF) ) return (SLURM_ERROR); } else if (sig == SIGKILL) { /* job ended before it started */ _end_container_thread(); } else error("Trying to send signal %d a container 0x%08lx " "that hasn't had anything added to it yet", sig, id); END_TIMER; if (debug_flags & DEBUG_FLAG_TIME_CRAY) INFO_LINE("call took: %s", TIME_STR); return (SLURM_SUCCESS); } int proctrack_p_destroy(uint64_t id) { int status; DEF_TIMERS; START_TIMER; debug("destroying 0x%08lx 0x%08lx", id, threadid); if (!threadid) job_waitjid((jid_t) id, &status, 0); /* * Assume any error means job doesn't exist. 
Therefore, * return SUCCESS to slurmd so it doesn't retry continuously */ END_TIMER; if (debug_flags & DEBUG_FLAG_TIME_CRAY) INFO_LINE("call took: %s", TIME_STR); return SLURM_SUCCESS; } uint64_t proctrack_p_find(pid_t pid) { jid_t jid; DEF_TIMERS; START_TIMER; if ((jid = job_getjid(pid)) == (jid_t) -1) return ((uint64_t) 0); END_TIMER; if (debug_flags & DEBUG_FLAG_TIME_CRAY) INFO_LINE("call took: %s", TIME_STR); return ((uint64_t) jid); } bool proctrack_p_has_pid (uint64_t cont_id, pid_t pid) { jid_t jid; if ((jid = job_getjid(pid)) == (jid_t) -1) return false; if ((uint64_t)jid != cont_id) return false; return true; } int proctrack_p_wait(uint64_t id) { int status; if (!threadid && job_waitjid((jid_t) id, &status, 0) == (jid_t)-1) return SLURM_ERROR; return SLURM_SUCCESS; } int proctrack_p_get_pids(uint64_t cont_id, pid_t **pids, int *npids) { int pidcnt, bufsize; pid_t *p; DEF_TIMERS; START_TIMER; pidcnt = job_getpidcnt((jid_t)cont_id); if (pidcnt > 0) { /* * FIXME - The "+ 128" is a rough attempt to allow for * the fact that _job_getpidcnt() followed by _job_get_pidlist * is not atomic. */ bufsize = sizeof(pid_t) * (pidcnt + 128); p = (pid_t *)xmalloc(bufsize); pidcnt = job_getpidlist((jid_t)cont_id, p, bufsize); if (pidcnt == -1) { int rc = SLURM_SUCCESS; /* * There is a possiblity for a race condition * where if the last task in the job exits * between job_getpidcnt and job_getpidlist. * That is ok, so just return SUCCESS; */ if (errno != ENODATA) { rc = SLURM_ERROR; error("job_getpidlist() failed: %m"); } *pids = NULL; *npids = 0; xfree(p); return rc; } *pids = p; *npids = pidcnt; } else { *pids = NULL; *npids = 0; } END_TIMER; if (debug_flags & DEBUG_FLAG_TIME_CRAY) INFO_LINE("call took: %s", TIME_STR); return SLURM_SUCCESS; }