/*****************************************************************************\ * proctrack_cgroup.c - process tracking via linux cgroup containers ***************************************************************************** * Copyright (C) 2009 CEA/DAM/DIF * Written by Matthieu Hautreux * * This file is part of Slurm, a resource management program. * For details, see . * Please also read the included file: DISCLAIMER. * * Slurm is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * In addition, as a special exception, the copyright holders give permission * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than * OpenSSL. If you modify file(s) with this exception, you may extend this * exception to your version of the file(s), but you are not obligated to do * so. If you do not wish to do so, delete this exception statement from your * version. If you delete this exception statement from all source files in * the program, then also delete it here. * * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along * with Slurm; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ #include "config.h" #include #include #include #include #include #include #include #include "slurm/slurm.h" #include "slurm/slurm_errno.h" #include "src/common/log.h" #include "src/common/xcgroup_read_config.h" #include "src/common/xstring.h" #include "src/slurmd/common/xcpuinfo.h" #include "src/slurmd/common/xcgroup.h" #include "src/slurmd/slurmd/slurmd.h" #include "src/slurmd/slurmstepd/slurmstepd_job.h" /* * These variables are required by the generic plugin interface. If they * are not found in the plugin, the plugin loader will ignore it. * * plugin_name - a string giving a human-readable description of the * plugin. There is no maximum length, but the symbol must refer to * a valid string. * * plugin_type - a string suggesting the type of the plugin or its * applicability to a particular form of data or method of data handling. * If the low-level plugin API is used, the contents of this string are * unimportant and may be anything. Slurm uses the higher-level plugin * interface which requires this string to be of the form * * / * * where is a description of the intended application of * the plugin (e.g., "jobcomp" for Slurm job completion logging) and * is a description of how this plugin satisfies that application. Slurm will * only load job completion logging plugins if the plugin_type string has a * prefix of "jobcomp/". * * plugin_version - an unsigned 32-bit integer containing the Slurm version * (major.minor.micro combined into a single number). */ const char plugin_name[] = "Process tracking via linux cgroup freezer subsystem"; const char plugin_type[] = "proctrack/cgroup"; const uint32_t plugin_version = SLURM_VERSION_NUMBER; static char user_cgroup_path[PATH_MAX]; static char job_cgroup_path[PATH_MAX]; static char jobstep_cgroup_path[PATH_MAX]; static xcgroup_ns_t freezer_ns; static bool slurm_freezer_init = false; static xcgroup_t freezer_cg; static xcgroup_t slurm_freezer_cg; static xcgroup_t user_freezer_cg; static xcgroup_t job_freezer_cg; static xcgroup_t step_freezer_cg; int _slurm_cgroup_init(void) { /* initialize user/job/jobstep cgroup relative paths * and release agent path */ user_cgroup_path[0]='\0'; job_cgroup_path[0]='\0'; jobstep_cgroup_path[0]='\0'; /* initialize freezer cgroup namespace */ if (xcgroup_ns_create(&freezer_ns, "", "freezer") != XCGROUP_SUCCESS) { error("unable to create freezer cgroup namespace"); return SLURM_ERROR; } /* initialize the root freezer cg */ if (xcgroup_create(&freezer_ns, &freezer_cg, "", 0, 0) != XCGROUP_SUCCESS) { error("proctrack/cgroup unable to create root freezer xcgroup"); xcgroup_ns_destroy(&freezer_ns); return SLURM_ERROR; } return SLURM_SUCCESS; } int _slurm_cgroup_create(stepd_step_rec_t *job, uint64_t id, uid_t uid, gid_t gid) { /* * we do it here as we do not have access to the conf structure * in libslurm (src/common/xcgroup.c) */ char *pre; slurm_cgroup_conf_t *cg_conf; uint32_t jobid; /* read cgroup configuration */ slurm_mutex_lock(&xcgroup_config_read_mutex); cg_conf = xcgroup_get_slurm_cgroup_conf(); pre = xstrdup(cg_conf->cgroup_prepend); slurm_mutex_unlock(&xcgroup_config_read_mutex); #ifdef MULTIPLE_SLURMD if ( conf->node_name != NULL ) xstrsubstitute(pre,"%n", conf->node_name); else { xfree(pre); pre = (char*) xstrdup("/slurm"); } #endif if (xcgroup_create(&freezer_ns, &slurm_freezer_cg, pre, getuid(), getgid()) != XCGROUP_SUCCESS) { xfree(pre); return SLURM_ERROR; } /* * While creating the cgroup hierarchy of the step, lock the root * cgroup directory. The same lock is hold during removal of the * hierarchies of other jobs/steps. This helps to avoid the race * condition with concurrent creation/removal of the intermediate * shared directories that could result in the failure of the * hierarchy setup */ if (xcgroup_lock(&freezer_cg) != XCGROUP_SUCCESS) { error("%s: xcgroup_lock error", __func__); goto bail; } /* create slurm cgroup in the freezer ns (it could already exist) */ if (xcgroup_instantiate(&slurm_freezer_cg) != XCGROUP_SUCCESS) goto bail; /* build user cgroup relative path if not set (should not be) */ if (*user_cgroup_path == '\0') { if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u", pre, uid) >= PATH_MAX) { error("unable to build uid %u cgroup relative path : %m", uid); goto bail; } } xfree(pre); /* build job cgroup relative path if no set (should not be) */ if (job->het_job_id && (job->het_job_id != NO_VAL)) jobid = job->het_job_id; else jobid = job->jobid; if (*job_cgroup_path == '\0') { if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u", user_cgroup_path, jobid) >= PATH_MAX) { error("unable to build job %u cgroup relative path : %m", jobid); goto bail; } } /* build job step cgroup relative path (should not be) */ if (*jobstep_cgroup_path == '\0') { int cc; if (job->stepid == SLURM_BATCH_SCRIPT) { cc = snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_batch", job_cgroup_path); } else if (job->stepid == SLURM_EXTERN_CONT) { cc = snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_extern", job_cgroup_path); } else { cc = snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u", job_cgroup_path, job->stepid); } if (cc >= PATH_MAX) { error("proctrack/cgroup unable to build job step %u.%u " "freezer cg relative path: %m", jobid, job->stepid); goto bail; } } /* create user cgroup in the freezer ns (it could already exist) */ if (xcgroup_create(&freezer_ns, &user_freezer_cg, user_cgroup_path, getuid(), getgid()) != XCGROUP_SUCCESS) { xcgroup_destroy(&slurm_freezer_cg); goto bail; } /* create job cgroup in the freezer ns (it could already exist) */ if (xcgroup_create(&freezer_ns, &job_freezer_cg, job_cgroup_path, getuid(), getgid()) != XCGROUP_SUCCESS) { xcgroup_destroy(&slurm_freezer_cg); xcgroup_destroy(&user_freezer_cg); goto bail; } /* create step cgroup in the freezer ns (it should not exists) */ if (xcgroup_create(&freezer_ns, &step_freezer_cg, jobstep_cgroup_path, getuid(), getgid()) != XCGROUP_SUCCESS) { xcgroup_destroy(&slurm_freezer_cg); xcgroup_destroy(&user_freezer_cg); xcgroup_destroy(&job_freezer_cg); goto bail; } if ((xcgroup_instantiate(&user_freezer_cg) != XCGROUP_SUCCESS) || (xcgroup_instantiate(&job_freezer_cg) != XCGROUP_SUCCESS) || (xcgroup_instantiate(&step_freezer_cg) != XCGROUP_SUCCESS)) { xcgroup_destroy(&user_freezer_cg); xcgroup_destroy(&job_freezer_cg); xcgroup_destroy(&step_freezer_cg); goto bail; } /* inhibit release agent for the step cgroup thus letting * slurmstepd being able to add new pids to the container * when the job ends (TaskEpilog,...) */ xcgroup_set_param(&step_freezer_cg, "notify_on_release", "0"); slurm_freezer_init = true; xcgroup_unlock(&freezer_cg); return SLURM_SUCCESS; bail: xfree(pre); xcgroup_destroy(&slurm_freezer_cg); xcgroup_unlock(&freezer_cg); xcgroup_destroy(&freezer_cg); return SLURM_ERROR; } static int _move_current_to_root_cgroup(xcgroup_ns_t *ns) { xcgroup_t cg; int rc; if (xcgroup_create(ns, &cg, "", 0, 0) != XCGROUP_SUCCESS) return SLURM_ERROR; rc = xcgroup_move_process(&cg, getpid()); xcgroup_destroy(&cg); return rc; } int _slurm_cgroup_destroy(void) { if (xcgroup_lock(&freezer_cg) != XCGROUP_SUCCESS) { error("%s: xcgroup_lock error", __func__); return SLURM_ERROR; } /* * First move slurmstepd process to the root cgroup, otherwise * the rmdir(2) triggered by the calls below will always fail, * because slurmstepd is still in the cgroup! */ if (_move_current_to_root_cgroup(&freezer_ns) != SLURM_SUCCESS) { error("%s: Unable to move pid %d to root cgroup", __func__, getpid()); xcgroup_unlock(&freezer_cg); return SLURM_ERROR; } xcgroup_wait_pid_moved(&job_freezer_cg, "freezer job"); if (jobstep_cgroup_path[0] != '\0') { if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) { debug("_slurm_cgroup_destroy: problem deleting step cgroup path %s: %m", step_freezer_cg.path); xcgroup_unlock(&freezer_cg); return SLURM_ERROR; } xcgroup_destroy(&step_freezer_cg); } if (job_cgroup_path[0] != '\0') { (void)xcgroup_delete(&job_freezer_cg); xcgroup_destroy(&job_freezer_cg); } if (user_cgroup_path[0] != '\0') { (void)xcgroup_delete(&user_freezer_cg); xcgroup_destroy(&user_freezer_cg); } if (slurm_freezer_init) { xcgroup_destroy(&slurm_freezer_cg); } xcgroup_unlock(&freezer_cg); xcgroup_destroy(&freezer_cg); xcgroup_ns_destroy(&freezer_ns); return SLURM_SUCCESS; } int _slurm_cgroup_add_pids(uint64_t id, pid_t* pids, int npids) { if (*jobstep_cgroup_path == '\0') return SLURM_ERROR; return xcgroup_add_pids(&step_freezer_cg, pids, npids); } int _slurm_cgroup_stick_stepd(uint64_t id, pid_t pid) { if (*job_cgroup_path == '\0') return SLURM_ERROR; return xcgroup_add_pids(&job_freezer_cg, &pid, 1); } int _slurm_cgroup_get_pids(uint64_t id, pid_t **pids, int *npids) { if (*jobstep_cgroup_path == '\0') return SLURM_ERROR; return xcgroup_get_pids(&step_freezer_cg, pids, npids); } int _slurm_cgroup_suspend(uint64_t id) { if (*jobstep_cgroup_path == '\0') return SLURM_ERROR; return xcgroup_set_param(&step_freezer_cg, "freezer.state", "FROZEN"); } int _slurm_cgroup_resume(uint64_t id) { if (*jobstep_cgroup_path == '\0') return SLURM_ERROR; return xcgroup_set_param(&step_freezer_cg, "freezer.state", "THAWED"); } bool _slurm_cgroup_has_pid(pid_t pid) { bool fstatus; xcgroup_t cg; fstatus = xcgroup_ns_find_by_pid(&freezer_ns, &cg, pid); if ( fstatus != XCGROUP_SUCCESS) return false; if (xstrcmp(cg.path, step_freezer_cg.path)) { fstatus = false; } else { fstatus = true; } xcgroup_destroy(&cg); return fstatus; } int _slurm_cgroup_is_pid_a_slurm_task(uint64_t id, pid_t pid) { int fstatus = -1; int fd; pid_t ppid; char file_path[PATH_MAX], buf[2048]; if (snprintf(file_path, PATH_MAX, "/proc/%ld/stat", (long)pid) >= PATH_MAX) { debug2("unable to build pid '%d' stat file: %m ", pid); return fstatus; } if ((fd = open(file_path, O_RDONLY)) < 0) { debug2("unable to open '%s' : %m ", file_path); return fstatus; } if (read(fd, buf, 2048) <= 0) { debug2("unable to read '%s' : %m ", file_path); close(fd); return fstatus; } close(fd); if (sscanf(buf, "%*d %*s %*s %d", &ppid) != 1) { debug2("unable to get ppid of pid '%d', %m", pid); return fstatus; } /* * assume that any child of slurmstepd is a slurm task * they will get all signals, inherited processes will * only get SIGKILL */ if (ppid == (pid_t) id) fstatus = 1; else fstatus = 0; return fstatus; } /* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ extern int init (void) { /* initialize cpuinfo internal data */ if (xcpuinfo_init() != XCPUINFO_SUCCESS) { return SLURM_ERROR; } /* initialize cgroup internal data */ if (_slurm_cgroup_init() != SLURM_SUCCESS) { xcpuinfo_fini(); return SLURM_ERROR; } return SLURM_SUCCESS; } extern int fini (void) { _slurm_cgroup_destroy(); xcpuinfo_fini(); return SLURM_SUCCESS; } /* * Uses slurmd job-step manager's pid as the unique container id. */ extern int proctrack_p_create (stepd_step_rec_t *job) { int fstatus; /* create a new cgroup for that container */ fstatus = _slurm_cgroup_create(job, (uint64_t)job->jmgr_pid, job->uid, job->gid); if (fstatus) return SLURM_ERROR; /* stick slurmstepd pid to the newly created job container * (Note: we do not put it in the step container because this * container could be used to suspend/resume tasks using freezer * properties so we need to let the slurmstepd outside of * this one) */ fstatus = _slurm_cgroup_stick_stepd((uint64_t)job->jmgr_pid, job->jmgr_pid); if (fstatus) { _slurm_cgroup_destroy(); return SLURM_ERROR; } /* we use slurmstepd pid as the identifier of the container * the corresponding cgroup could be found using * _slurm_cgroup_find_by_pid */ job->cont_id = (uint64_t)job->jmgr_pid; return SLURM_SUCCESS; } extern int proctrack_p_add (stepd_step_rec_t *job, pid_t pid) { return _slurm_cgroup_add_pids(job->cont_id, &pid, 1); } extern int proctrack_p_signal (uint64_t id, int signal) { pid_t* pids = NULL; int npids; int i; int slurm_task; /* get all the pids associated with the step */ if (_slurm_cgroup_get_pids(id, &pids, &npids) != SLURM_SUCCESS) { debug3("unable to get pids list for cont_id=%"PRIu64"", id); /* that could mean that all the processes already exit */ /* the container so return success */ return SLURM_SUCCESS; } /* directly manage SIGSTOP using cgroup freezer subsystem */ if (signal == SIGSTOP) { xfree(pids); return _slurm_cgroup_suspend(id); } /* start by resuming in case of SIGKILL */ if (signal == SIGKILL) { _slurm_cgroup_resume(id); } for (i = 0 ; i