/*****************************************************************************\ * capmc_resume.c - Power up identified nodes with (optional) features. * Once complete, modify the node's active features as needed. * * Usage: "capmc_resume [features]" ***************************************************************************** * Copyright (C) 2016-2017 SchedMD LLC. * * This file is part of Slurm, a resource management program. * For details, see . * Please also read the included file: DISCLAIMER. * * Slurm is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * In addition, as a special exception, the copyright holders give permission * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than * OpenSSL. If you modify file(s) with this exception, you may extend this * exception to your version of the file(s), but you are not obligated to do * so. If you do not wish to do so, delete this exception statement from your * version. If you delete this exception statement from all source files in * the program, then also delete it here. * * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along * with Slurm; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ #if HAVE_CONFIG_H # include "config.h" #endif #define _GNU_SOURCE /* For POLLRDHUP */ #if HAVE_JSON_C_INC # include #elif HAVE_JSON_INC # include #endif #include #include #include #include #include #include "slurm/slurm.h" #include "slurm/slurm_errno.h" #include "src/common/log.h" #include "src/common/macros.h" #include "src/common/parse_config.h" #include "src/common/read_config.h" #include "src/common/slurm_protocol_api.h" #include "src/common/xassert.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" #define MAX_THREADS 256 /* Maximum poll wait time for child processes, in milliseconds */ #define MAX_POLL_WAIT 500 /* Default and minimum timeout parameters for the capmc command */ #define DEFAULT_CAPMC_RETRIES 4 #define DEFAULT_CAPMC_TIMEOUT 60000 /* 60 seconds */ #define MIN_CAPMC_TIMEOUT 1000 /* 1 second */ /* Number of times to try performing "node_reinit" operation */ #define NODE_REINIT_RETRIES 10 /* Number of times to try performing node state change operation */ #define NODE_STATE_RETRIES 10 /* Static variables */ static char *capmc_path = NULL; static uint32_t capmc_poll_freq = 45; static uint32_t capmc_retries = DEFAULT_CAPMC_RETRIES; static uint32_t capmc_timeout = DEFAULT_CAPMC_TIMEOUT; static char *log_file = NULL; static bitstr_t *node_bitmap = NULL; static char *prog_name = NULL; static char *mcdram_mode = NULL, *numa_mode = NULL; static char *syscfg_path = NULL; /* NOTE: Keep this table synchronized with the table in * src/plugins/node_features/knl_cray/node_features_knl_cray.c */ static s_p_options_t knl_conf_file_options[] = { {"AllowMCDRAM", S_P_STRING}, {"AllowNUMA", S_P_STRING}, {"AllowUserBoot", S_P_STRING}, {"BootTime", S_P_UINT32}, {"CapmcPath", S_P_STRING}, {"CapmcPollFreq", S_P_UINT32}, {"CapmcRetries", S_P_UINT32}, {"CapmcTimeout", S_P_UINT32}, {"CnselectPath", S_P_STRING}, {"DefaultMCDRAM", S_P_STRING}, {"DefaultNUMA", S_P_STRING}, {"LogFile", S_P_STRING}, {"McPath", S_P_STRING}, {"SyscfgPath", S_P_STRING}, {"UmeCheckInterval", S_P_UINT32}, {NULL} }; /* Static functions */ static s_p_hashtbl_t *_config_make_tbl(char *filename); static uint32_t *_json_parse_nids(json_object *jobj, char *key, int *num); static void _read_config(void); static char *_run_script(char **script_argv, int *status); static int _tot_wait(struct timeval *start_time); static s_p_hashtbl_t *_config_make_tbl(char *filename) { s_p_hashtbl_t *tbl = NULL; xassert(filename); if (!(tbl = s_p_hashtbl_create(knl_conf_file_options))) { error("%s: s_p_hashtbl_create error: %s", prog_name, slurm_strerror(slurm_get_errno())); return tbl; } if (s_p_parse_file(tbl, NULL, filename, false) == SLURM_ERROR) { error("%s: s_p_parse_file error: %s", prog_name, slurm_strerror(slurm_get_errno())); s_p_hashtbl_destroy(tbl); tbl = NULL; } return tbl; } static void _read_config(void) { char *knl_conf_file; s_p_hashtbl_t *tbl; capmc_timeout = DEFAULT_CAPMC_TIMEOUT; knl_conf_file = get_extra_conf_path("knl_cray.conf"); if ((tbl = _config_make_tbl(knl_conf_file))) { (void) s_p_get_string(&capmc_path, "CapmcPath", tbl); (void) s_p_get_uint32(&capmc_poll_freq, "CapmcPollFreq", tbl); (void) s_p_get_uint32(&capmc_retries, "CapmcRetries", tbl); (void) s_p_get_uint32(&capmc_timeout, "CapmcTimeout", tbl); (void) s_p_get_string(&log_file, "LogFile", tbl); (void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl); } xfree(knl_conf_file); s_p_hashtbl_destroy(tbl); if (!capmc_path) capmc_path = xstrdup("/opt/cray/capmc/default/bin/capmc"); capmc_timeout = MAX(capmc_timeout, MIN_CAPMC_TIMEOUT); if (!log_file) log_file = slurm_get_job_slurmctld_logfile(); } /* * Return time in msec since "start time" */ static int _tot_wait(struct timeval *start_time) { struct timeval end_time; int msec_delay; gettimeofday(&end_time, NULL); msec_delay = (end_time.tv_sec - start_time->tv_sec ) * 1000; msec_delay += ((end_time.tv_usec - start_time->tv_usec + 500) / 1000); return msec_delay; } /* Run a script and return its stdout plus exit status */ static char *_run_script(char **script_argv, int *status) { int cc, i, new_wait, resp_size = 0, resp_offset = 0; pid_t cpid; char *resp = NULL; int pfd[2] = { -1, -1 }; if (access(capmc_path, R_OK | X_OK) < 0) { error("%s: Can not execute: %s", prog_name, capmc_path); *status = 127; resp = xstrdup("Slurm node_features/knl_cray configuration error"); return resp; } if (pipe(pfd) != 0) { error("%s: pipe(): %s", prog_name, slurm_strerror(slurm_get_errno())); *status = 127; resp = xstrdup("System error"); return resp; } if ((cpid = fork()) == 0) { cc = sysconf(_SC_OPEN_MAX); dup2(pfd[1], STDERR_FILENO); dup2(pfd[1], STDOUT_FILENO); for (i = 0; i < cc; i++) { if ((i != STDERR_FILENO) && (i != STDOUT_FILENO)) close(i); } setpgid(0, 0); execv(capmc_path, script_argv); error("%s: execv(): %s", prog_name, slurm_strerror(slurm_get_errno())); exit(127); } else if (cpid < 0) { close(pfd[0]); close(pfd[1]); error("%s: fork(): %s", prog_name, slurm_strerror(slurm_get_errno())); *status = 127; resp = xstrdup("System error"); return resp; } else { struct pollfd fds; struct timeval tstart; resp_size = 1024; resp = xmalloc(resp_size); close(pfd[1]); gettimeofday(&tstart, NULL); while (1) { fds.fd = pfd[0]; fds.events = POLLIN | POLLHUP | POLLRDHUP; fds.revents = 0; new_wait = capmc_timeout - _tot_wait(&tstart); if (new_wait <= 0) { error("%s: poll() timeout @ %d msec", prog_name, capmc_timeout); break; } new_wait = MIN(new_wait, MAX_POLL_WAIT); i = poll(&fds, 1, new_wait); if (i == 0) { continue; } else if (i < 0) { error("%s: poll(): %s", prog_name, slurm_strerror(slurm_get_errno())); break; } if ((fds.revents & POLLIN) == 0) break; i = read(pfd[0], resp + resp_offset, resp_size - resp_offset); if (i == 0) { break; } else if (i < 0) { if (errno == EAGAIN) continue; error("%s: read(): %s", prog_name, slurm_strerror(slurm_get_errno())); break; } else { resp_offset += i; if (resp_offset + 1024 >= resp_size) { resp_size *= 2; resp = xrealloc(resp, resp_size); } } } killpg(cpid, SIGTERM); usleep(10000); killpg(cpid, SIGKILL); waitpid(cpid, status, 0); close(pfd[0]); } return resp; } /* Convert node name string to equivalent nid string */ static char *_node_names_2_nid_list(char *node_names) { char *nid_list = NULL; int i, last_nid_index = -1; bool is_dash = false; for (i = 0; node_names[i]; i++) { int nid_index = 0; /* skip "nid[" */ if ((node_names[i] < '0') || (node_names[i] > '9')) continue; /* skip leading zeros */ while (node_names[i] == '0') i++; if (node_names[i] == '[') i++; while ((node_names[i] >= '0') && (node_names[i] <= '9')) { nid_index *= 10; nid_index += (node_names[i++] - '0'); } if (is_dash && (nid_index >= last_nid_index)) { bit_nset(node_bitmap, last_nid_index, nid_index); } else { bit_set(node_bitmap, nid_index); } if ((is_dash = (node_names[i] == '-'))) last_nid_index = nid_index; else if (node_names[i] == '\0') break; } i = strlen(node_names) + 1; nid_list = xmalloc(i); bit_fmt(nid_list, i, node_bitmap); return nid_list; } /* Attempt to modify modes and reboot nodes in a single capmc call. * RET 0 on success, -1 on failure */ static int _update_all_nodes(char *host_list) { char *argv[10], *nid_list, *resp_msg; int rc = 0, retry, status = 0; nid_list = _node_names_2_nid_list(host_list); if (mcdram_mode) { /* Update MCDRAM mode. * Example: "capmc set_mcdram_cfg –n 43 –m cache" */ argv[0] = "capmc"; argv[1] = "set_mcdram_cfg"; argv[2] = "-m"; argv[3] = mcdram_mode; argv[4] = "-n"; argv[5] = nid_list; argv[6] = NULL; for (retry = 0; ; retry++) { resp_msg = _run_script(argv, &status); if ((status == 0) || (resp_msg && strcasestr(resp_msg, "Success"))) { debug("%s: set_mcdram_cfg sent to %s", prog_name, argv[5]); xfree(resp_msg); break; } error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], argv[4], argv[5], status, resp_msg); if (resp_msg && strstr(resp_msg, "Could not lookup") && (retry <= capmc_retries)) { /* State Manager is down. Sleep and retry */ sleep(1); xfree(resp_msg); } else { /* Non-recoverable error */ rc = -1; xfree(resp_msg); break; } } } if (numa_mode && (rc == 0)) { /* Update NUMA mode. * Example: "capmc set_numa_cfg –m a2a –n 43" */ argv[0] = "capmc"; argv[1] = "set_numa_cfg"; argv[2] = "-m"; argv[3] = numa_mode; argv[4] = "-n"; argv[5] = nid_list; argv[6] = NULL; for (retry = 0; ; retry++) { resp_msg = _run_script(argv, &status); if ((status == 0) || (resp_msg && strcasestr(resp_msg, "Success"))) { debug("%s: set_numa_cfg sent to %s", prog_name, argv[5]); xfree(resp_msg); break; } error("%s: capmc(%s,%s,%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], argv[4], argv[5], status, resp_msg); if (resp_msg && strstr(resp_msg, "Could not lookup") && (retry <= capmc_retries)) { /* State Manager is down. Sleep and retry */ sleep(1); xfree(resp_msg); } else { /* Non-recoverable error */ rc = -1; xfree(resp_msg); break; } } } /* Request node restart. * Example: "capmc node_reinit –n 43" */ if (rc == 0) { argv[0] = "capmc"; argv[1] = "node_reinit"; argv[2] = "-n"; argv[3] = nid_list; argv[4] = NULL; // argv[4] = "-r"; /* Future option: Reason */ // argv[5] = "Change KNL mode"; for (retry = 0; ; retry++) { resp_msg = _run_script(argv, &status); if ((status == 0) || (resp_msg && strcasestr(resp_msg, "Success"))) { debug("%s: node_reinit sent to %s", prog_name, argv[3]); xfree(resp_msg); break; } error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); if (resp_msg && (strstr(resp_msg, "Could not lookup") || strstr(resp_msg, "Internal server error")) && (retry <= capmc_retries)) { /* State Manager is down. Sleep and retry */ sleep(1); xfree(resp_msg); } else { /* Non-recoverable error */ rc = -1; xfree(resp_msg); break; } } } xfree(nid_list); return rc; } static uint32_t *_json_parse_nids(json_object *jobj, char *key, int *num) { json_object *j_array = NULL; json_object *j_value = NULL; enum json_type j_type; uint32_t *ents; int i, cnt; *num = 0; json_object_object_get_ex(jobj, key, &j_array); if (!j_array) { debug("%s: key=%s not found in nid specification", prog_name, key); return NULL; } cnt = json_object_array_length(j_array); ents = xmalloc(sizeof(uint32_t) * cnt); for (i = 0; i < cnt; i++) { j_value = json_object_array_get_idx(j_array, i); j_type = json_object_get_type(j_value); if (j_type != json_type_int) { error("%s: Unable to parse nid specification", prog_name); break; } else { ents[i] = (uint32_t) json_object_get_int64(j_value); *num = i + 1; } } return ents; } /* Wait for all identified computed nodes to enter "on" state */ static void _wait_all_nodes_on(void) { char *argv[10], *resp_msg; int i, nid_cnt = 0, status = 0; json_object *j; uint32_t *nid_array; time_t start_time = time(NULL); while ((difftime(time(NULL), start_time) < (30 * 60)) && (bit_set_count(node_bitmap) > 0)) { sleep(capmc_poll_freq); argv[0] = "capmc"; argv[1] = "node_status"; argv[2] = NULL; resp_msg = _run_script(argv, &status); if (status != 0) { error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); break; } j = json_tokener_parse(resp_msg); if (j == NULL) { error("%s: json parser failed on %s", prog_name, resp_msg); xfree(resp_msg); break; } xfree(resp_msg); nid_cnt = 0; nid_array = _json_parse_nids(j, "on", &nid_cnt); json_object_put(j); /* Frees json memory */ for (i = 0; i < nid_cnt; i++) { bit_clear(node_bitmap, nid_array[i]); } xfree(nid_array); } } int main(int argc, char *argv[]) { log_options_t log_opts = LOG_OPTS_INITIALIZER; char *features, *save_ptr = NULL, *tok; update_node_msg_t node_msg; int rc = SLURM_SUCCESS; xstrfmtcat(prog_name, "%s[%u]", argv[0], (uint32_t) getpid()); _read_config(); log_opts.stderr_level = LOG_LEVEL_QUIET; log_opts.syslog_level = LOG_LEVEL_QUIET; if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES) log_opts.logfile_level = LOG_LEVEL_DEBUG; else log_opts.logfile_level = LOG_LEVEL_ERROR; (void) log_init(argv[0], log_opts, LOG_DAEMON, log_file); if ((argc < 2) || (argc > 3)) { fprintf(stderr, "Usage: hostlist [mode]\n"); error("Usage: hostlist [mode]"); exit(2); } /* Parse the MCDRAM and NUMA boot options */ if (argc == 3) { features = xstrdup(argv[2]); tok = strtok_r(features, ",", &save_ptr); while (tok) { printf("%s\n", tok); if (!strcasecmp(tok, "a2a") || !strcasecmp(tok, "hemi") || !strcasecmp(tok, "quad") || !strcasecmp(tok, "snc2") || !strcasecmp(tok, "snc4")) { xfree(numa_mode); numa_mode = xstrdup(tok); } else if (!strcasecmp(tok, "cache") || !strcasecmp(tok, "split") || !strcasecmp(tok, "equal") || !strcasecmp(tok, "flat")) { xfree(mcdram_mode); mcdram_mode = xstrdup(tok); } tok = strtok_r(NULL, ",", &save_ptr); } xfree(features); } /* Attempt to update modes and restart nodes in a single capmc call */ node_bitmap = bit_alloc(100000); if (_update_all_nodes(argv[1]) != 0) { /* Could not reboot nodes. * Requeue the job we were trying to start */ uint32_t job_id = 0; char *job_id_str = getenv("SLURM_JOB_ID"); if (job_id_str) job_id = strtol(job_id_str, NULL, 10); if (job_id) (void) slurm_requeue(job_id, JOB_RECONFIG_FAIL); /* Return the nodes to service */ slurm_init_update_node_msg(&node_msg); node_msg.node_names = argv[1]; node_msg.node_state = NODE_STATE_POWER_SAVE | NODE_STATE_POWER_UP; rc = slurm_update_node(&node_msg); if (rc != SLURM_SUCCESS) { error("%s: slurm_update_node(\'%s\', \'IDLE\'): %s\n", prog_name, argv[1], slurm_strerror(slurm_get_errno())); } exit(1); } xfree(mcdram_mode); xfree(numa_mode); if (argc == 3) { slurm_init_update_node_msg(&node_msg); node_msg.node_names = argv[1]; node_msg.features_act = argv[2]; rc = slurm_update_node(&node_msg); if (rc != SLURM_SUCCESS) { error("%s: slurm_update_node(\'%s\', \'%s\'): %s\n", prog_name, argv[1], argv[2], slurm_strerror(slurm_get_errno())); } } /* Wait for all nodes to change state to "on" */ _wait_all_nodes_on(); bit_free(node_bitmap); xfree(prog_name); if (rc == SLURM_SUCCESS) exit(0); exit(1); }