/*****************************************************************************\
* util.c - Library for managing a switch on a Cray system.
*****************************************************************************
* Copyright (C) 2013 SchedMD LLC
* Copyright 2013 Cray Inc. All Rights Reserved.
* Written by David Gloe
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include
#include
#include
#include
#include "switch_cray_aries.h"
#include "slurm/slurm.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_step_layout.h"
#include "src/common/xstring.h"
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
/*
 * Forward declaration: the helper is defined later in this file but is
 * called earlier, from remove_spool_files().
 */
static void _recursive_rmdir(const char *dirnm);
/*
 * Create the spool directory for an application and give it to uid/gid.
 *
 * The directory path is LEGACY_SPOOL_DIR immediately followed by the decimal
 * apid; it is created with mode 0700 and then chown()ed to uid/gid.
 * When SLURM_ID_HASH_LEGACY(apid) differs from apid, a symlink named after
 * the legacy id is additionally created, pointing at the new directory.
 *
 * IN apid -- application id used to build the directory name
 * IN uid  -- owner to assign to the directory
 * IN gid  -- group to assign to the directory
 *
 * RETURNS
 * SLURM_SUCCESS, or SLURM_ERROR (after logging the failing call) if mkdir(),
 * chown() or symlink() fails. Anything created before the failure is left
 * in place.
 */
int create_apid_dir(uint64_t apid, uid_t uid, gid_t gid)
{
	int status = SLURM_ERROR;
	uint64_t legacy_apid = SLURM_ID_HASH_LEGACY(apid);
	char *dir_path = xstrdup_printf(LEGACY_SPOOL_DIR "%" PRIu64, apid);
	char *link_path = NULL;

	if (mkdir(dir_path, 0700) != 0) {
		CRAY_ERR("mkdir %s failed: %m", dir_path);
		goto done;
	}

	if (chown(dir_path, uid, gid) != 0) {
		CRAY_ERR("chown %s, %d, %d failed: %m",
			 dir_path, (int)uid, (int)gid);
		goto done;
	}

	/* Backwards compatibility link, only needed when the ids differ */
	if (apid != legacy_apid) {
		link_path = xstrdup_printf(LEGACY_SPOOL_DIR "%" PRIu64,
					   legacy_apid);
		if (symlink(dir_path, link_path) != 0) {
			CRAY_ERR("symlink %s, %s failed: %m",
				 dir_path, link_path);
			goto done;
		}
	}

	status = SLURM_SUCCESS;

done:
	/* Single exit point: both path strings are released here */
	xfree(link_path);
	xfree(dir_path);
	return status;
}
/*
 * Clean up spool directory files, directories, and links for an application.
 *
 * The following are removed, in this order:
 *   1. the backwards compatibility symlink LEGACY_SPOOL_DIR<legacy apid>
 *   2. the apid directory LEGACY_SPOOL_DIR<apid>, recursively
 *   3. the backwards compatibility placement file
 *      LEGACY_SPOOL_DIR "places"<legacy apid>
 *   4. the ALPS placement file LEGACY_SPOOL_DIR "places"<apid>
 * Steps 1 and 3 are skipped when SLURM_ID_HASH_LEGACY(apid) equals apid.
 *
 * IN apid -- application id whose spool entries are removed
 *
 * RETURNS
 * SLURM_SUCCESS, or SLURM_ERROR as soon as one of the remove() calls fails
 * (the remaining steps are then skipped). Problems inside the recursive
 * directory removal are only logged by _recursive_rmdir() and do not affect
 * the return value.
 */
int remove_spool_files(uint64_t apid)
{
	char *path_name = NULL;
	uint64_t oldapid = SLURM_ID_HASH_LEGACY(apid);

	// Remove the backwards compatibility apid directory symlink
	if (apid != oldapid) {
		path_name = xstrdup_printf(
			LEGACY_SPOOL_DIR "%" PRIu64, oldapid);
		if (remove(path_name)) {
			CRAY_ERR("remove %s failed: %m", path_name);
			xfree(path_name);
			return SLURM_ERROR;
		}
		xfree(path_name);
	}

	// Remove the apid directory LEGACY_SPOOL_DIR/<APID>
	path_name = xstrdup_printf(LEGACY_SPOOL_DIR "%" PRIu64, apid);
	_recursive_rmdir(path_name);
	xfree(path_name);

	// Remove the backwards compatibility ALPS placement file
	if (apid != oldapid) {
		path_name = xstrdup_printf(LEGACY_SPOOL_DIR "places%" PRIu64,
					   oldapid);
		if (remove(path_name)) {
			CRAY_ERR("remove %s failed: %m", path_name);
			xfree(path_name);
			return SLURM_ERROR;
		}
		/*
		 * Release the legacy path before path_name is reused below;
		 * without this the string is leaked on the success path.
		 */
		xfree(path_name);
	}

	// Remove the ALPS placement file LEGACY_SPOOL_DIR/places<APID>
	path_name = xstrdup_printf(LEGACY_SPOOL_DIR "places%" PRIu64, apid);
	if (remove(path_name)) {
		CRAY_ERR("remove %s failed: %m", path_name);
		xfree(path_name);
		return SLURM_ERROR;
	}
	xfree(path_name);

	return SLURM_SUCCESS;
}
/*
 * Set job environment variables used by LLI and PMI.
 *
 * Variables written into job->env:
 *   CRAY_NUM_COOKIES_ENV -- sw_job->num_cookies
 *   CRAY_COOKIES_ENV     -- sw_job->cookies[] joined with commas
 *   PMI_CONTROL_PORT_ENV -- only when SLURM_STEP_RESV_PORTS is present in
 *                           job->env: its value up to (not including) the
 *                           first '-'
 *   PMI_CRAY_NO_SMP_ENV  -- job->non_smp
 *
 * IN/OUT job -- step record whose environment array is updated
 * IN sw_job  -- switch job info supplying the cookies
 *
 * RETURNS
 * SLURM_SUCCESS, or SLURM_ERROR (after logging) at the first variable that
 * cannot be set; variables set before the failure remain in job->env.
 */
int set_job_env(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job)
{
	char *cookie_list = NULL;
	char *port_range = NULL, *first_port = NULL, *dash = NULL;
	int ok, idx;

	/* Number of cookies handed to the application */
	ok = env_array_overwrite_fmt(&job->env, CRAY_NUM_COOKIES_ENV,
				     "%"PRIu32, sw_job->num_cookies);
	if (!ok) {
		CRAY_ERR("Failed to set env var " CRAY_NUM_COOKIES_ENV);
		return SLURM_ERROR;
	}

	/*
	 * The cookies themselves, as one comma separated string: the first
	 * cookie is appended as is, every later one with a leading comma.
	 */
	for (idx = 0; idx < sw_job->num_cookies; idx++) {
		if (idx == 0)
			xstrcat(cookie_list, sw_job->cookies[idx]);
		else
			xstrfmtcat(cookie_list, ",%s", sw_job->cookies[idx]);
	}
	ok = env_array_overwrite(&job->env, CRAY_COOKIES_ENV, cookie_list);
	xfree(cookie_list);
	if (!ok) {
		CRAY_ERR("Failed to set env var " CRAY_COOKIES_ENV);
		return SLURM_ERROR;
	}

	/*
	 * PMI_CONTROL_PORT: Cray's PMI uses this as the port to communicate
	 * its control tree information. Only the part of the reserved port
	 * string before the first '-' is exported.
	 */
	port_range = getenvp(job->env, "SLURM_STEP_RESV_PORTS");
	if (port_range) {
		first_port = xstrdup(port_range);
		dash = strchr(first_port, '-');
		if (dash)
			*dash = '\0';
		ok = env_array_overwrite(&job->env, PMI_CONTROL_PORT_ENV,
					 first_port);
		xfree(first_port);
		if (!ok) {
			CRAY_ERR("Failed to set env var "PMI_CONTROL_PORT_ENV);
			return SLURM_ERROR;
		}
	}

	/* Set if task IDs are not monotonically increasing across all nodes */
	ok = env_array_overwrite_fmt(&job->env, PMI_CRAY_NO_SMP_ENV,
				     "%d", job->non_smp);
	if (!ok) {
		CRAY_ERR("Failed to set env var "PMI_CRAY_NO_SMP_ENV);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
/*
 * Log the outcome of an alpscomm call.
 *
 * When rc differs from expected_rc an error is logged, using *err_msg if one
 * was supplied and a fixed placeholder text otherwise. When rc matches and a
 * message is present, the message is logged at info level. Nothing is logged
 * when rc matches and there is no message.
 *
 * IN file, line, func -- call site, included in the log prefix
 * IN rc               -- value returned by the alpscomm call
 * IN expected_rc      -- value that indicates success
 * IN alpsc_func       -- name of the alpscomm function, for the log text
 * IN/OUT err_msg      -- must point to a valid char pointer; *err_msg is
 *                        released with free() and set to NULL on return
 */
void alpsc_debug(const char *file, int line, const char *func,
		 int rc, int expected_rc, const char *alpsc_func,
		 char **err_msg)
{
	char *msg = *err_msg;

	if (rc == expected_rc) {
		if (msg) {
			info("(%s: %d: %s) %s: %s", file, line, func,
			     alpsc_func, msg);
		}
	} else {
		error("(%s: %d: %s) %s failed: %s", file, line, func,
		      alpsc_func,
		      msg ? msg : "No error message present");
	}

	free(msg);
	*err_msg = NULL;
}
/*
 * Function: list_str_to_array
 * Description:
 *	Convert the list string into an array of integers.
 *
 *	The string is expanded with hostlist_create(); for every entry the
 *	number starting at its first decimal digit is parsed with atoll() and
 *	stored, in list order.
 *
 * IN list -- The list string
 * OUT cnt -- The number of numbers in the list string
 * OUT numbers -- Array of integers (NULL when the list is empty);
 *		  Caller is responsible to xfree() this.
 *
 * N.B. Caller is responsible to xfree() numbers.
 *
 * RETURNS
 *	Returns 0 on success and -1 on failure.
 *	If hostlist_create() fails, cnt and numbers are left untouched.
 *	If an entry contains no digit, the array is freed and *numbers is set
 *	to NULL (*cnt still holds the entry count).
 */
int list_str_to_array(char *list, int *cnt, int32_t **numbers)
{
	int32_t *item_ptr = NULL;
	hostlist_t hl;
	int i = 0;
	char *str, *cptr = NULL;

	/*
	 * Create a hostlist
	 */
	if (!(hl = hostlist_create(list))) {
		CRAY_ERR("hostlist_create error on %s", list);
		error("hostlist_create error on %s", list);
		return -1;
	}

	*cnt = hostlist_count(hl);
	if (!*cnt) {
		*numbers = NULL;
		/* The hostlist must be released on this early exit as well */
		hostlist_destroy(hl);
		return 0;
	}

	/*
	 * Create an integer array of item_ptr in the same order as in the list.
	 */
	item_ptr = *numbers = xmalloc((*cnt) * sizeof(int32_t));
	while ((str = hostlist_shift(hl))) {
		/* Skip any non-numeric prefix of the entry */
		if (!(cptr = strpbrk(str, "0123456789"))) {
			CRAY_ERR("Error: Node was not recognizable: %s", str);
			free(str);
			xfree(item_ptr);
			*numbers = NULL;
			hostlist_destroy(hl);
			return -1;
		}
		item_ptr[i] = atoll(cptr);
		i++;
		/* Strings returned by hostlist_shift() are released with free() */
		free(str);
	}

	// Clean up
	hostlist_destroy(hl);

	return 0;
}
/*
 * Recursive directory delete
 *
 * Call with a directory name and this function will delete
 * all files and directories rooted in this name. Finally
 * the named directory will be deleted.
 * If called with a file name, only that file will be deleted.
 *
 * Entries are examined with lstat(), so a symbolic link found inside the
 * tree is removed itself and never followed; content outside the tree is
 * therefore not touched through links, and dangling links are removed too.
 *
 * Failures are logged and otherwise ignored (best effort); a path that has
 * already disappeared (ENOENT on removal) is not reported.
 *
 * IN dirnm -- directory (or file) to remove; NULL or "" is a no-op
 */
static void _recursive_rmdir(const char *dirnm)
{
	int st;
	char *fnm = NULL;
	DIR *dirp;
	struct dirent *dir;
	struct stat st_buf;

	/*
	 * Don't do anything if there is no directory name. An empty name is
	 * rejected up front: it would turn the "<dir>/<entry>" paths built
	 * below into paths directly under "/".
	 */
	if (!dirnm || (dirnm[0] == '\0'))
		return;

	dirp = opendir(dirnm);
	if (!dirp) {
		/* Not a directory: just remove the file itself */
		if (errno == ENOTDIR)
			goto fileDel;
		CRAY_ERR("Error opening directory %s", dirnm);
		return;
	}

	while ((dir = readdir(dirp))) {
		/* Never descend into the directory itself or its parent */
		if (!xstrcmp(dir->d_name, ".") || !xstrcmp(dir->d_name, ".."))
			continue;

		xfree(fnm);
		fnm = xstrdup_printf("%s/%s", dirnm, dir->d_name);

		/* lstat(): classify the entry itself, not a link's target */
		st = lstat(fnm, &st_buf);
		if (st < 0) {
			CRAY_ERR("stat of %s", fnm);
			continue;
		}

		if (S_ISDIR(st_buf.st_mode)) {
			_recursive_rmdir(fnm);
		} else {
			st = unlink(fnm);
			if (st < 0 && errno == EISDIR)
				st = rmdir(fnm);
			if (st < 0 && errno != ENOENT) {
				CRAY_ERR("Error removing %s", fnm);
			}
		}
	}
	xfree(fnm);
	closedir(dirp);

fileDel:
	/* Remove the (now hopefully empty) directory, or the plain file */
	st = unlink(dirnm);
	if (st < 0 && errno == EISDIR)
		st = rmdir(dirnm);
	if (st < 0 && errno != ENOENT) {
		CRAY_ERR("Error removing %s", dirnm);
	}
}
/*
 * Log the contents of a switch jobinfo structure at info level.
 *
 * The cookies and cookie ids are each rendered as a comma separated list
 * and printed together with the magic value, apid and cookie count.
 *
 * IN job -- jobinfo to print. If it is NULL or carries
 *	     CRAY_NULL_JOBINFO_MAGIC an error is logged and nothing else
 *	     is printed.
 */
void print_jobinfo(slurm_cray_jobinfo_t *job)
{
	char *cookies_csv = NULL, *ids_csv = NULL;
	const char *sep = "";
	int idx;

	if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) {
		CRAY_ERR("job pointer was NULL");
		return;
	}

	xassert(job->magic == CRAY_JOBINFO_MAGIC);

	/* Build both lists in one pass; the separator is empty for item 0 */
	for (idx = 0; idx < job->num_cookies; idx++) {
		xstrfmtcat(cookies_csv, "%s%s", sep, job->cookies[idx]);
		xstrfmtcat(ids_csv, "%s%"PRIu32, sep, job->cookie_ids[idx]);
		sep = ",";
	}

	info("jobinfo magic=%"PRIx32" apid=%"PRIu64
	     " num_cookies=%"PRIu32" cookies=%s cookie_ids=%s",
	     job->magic, job->apid,
	     job->num_cookies, cookies_csv, ids_csv);

	xfree(cookies_csv);
	xfree(ids_csv);
}
#endif /* HAVE_NATIVE_CRAY || HAVE_CRAY_NETWORK */