/*****************************************************************************\
 *  Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P.
 *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
 *  Written by Susanne M. Balle.
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#define _GNU_SOURCE

#include "affinity.h"
#include "dist_tasks.h"
#include "src/common/bitstring.h"
#include "src/common/log.h"
#include "src/common/slurm_cred.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_resource_info.h"
#include "src/common/strlcpy.h"
#include "src/common/xmalloc.h"
#include "src/slurmd/slurmd/slurmd.h"

#ifdef HAVE_NUMA
#include <numa.h>
#endif

static char *_alloc_mask(launch_tasks_request_msg_t *req,
			 int *whole_node_cnt, int *whole_socket_cnt,
			 int *whole_core_cnt, int *whole_thread_cnt,
			 int *part_socket_cnt, int *part_core_cnt);
static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
				uint16_t *hw_sockets, uint16_t *hw_cores,
				uint16_t *hw_threads);
static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
				uint16_t *sockets, uint16_t *cores);

static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p);
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
				    uint32_t node_id, bitstr_t ***masks_p);

static void _lllp_map_abstract_masks(const uint32_t maxtasks,
				     bitstr_t **masks);
static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
				    const uint32_t maxtasks,
				    bitstr_t **masks);

/* BLOCK_MAP     physical machine LLLP index to abstract block LLLP index
 * BLOCK_MAP_INV abstract block LLLP index to physical machine LLLP index
 */
#define BLOCK_MAP(index)	_block_map(index, conf->block_map)
#define BLOCK_MAP_INV(index)	_block_map(index, conf->block_map_inv)

/* _block_map
 *
 * Safely returns a mapped index using a provided block map.
 *
 * IN - index to map
 * IN - map to use
 */
static uint16_t _block_map(uint16_t index, uint16_t *map)
{
	if (map == NULL) {
		return index;
	}
	/* make sure bit falls in map */
	if (index >= conf->block_map_size) {
		debug3("wrapping index %u into block_map_size of %u",
		       index, conf->block_map_size);
		index = index % conf->block_map_size;
	}
	index = map[index];
	return(index);
}
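/*
 * Illustration (hypothetical values): on a node whose abstract block order
 * interleaves two hyperthreaded cores, conf->block_map might be {0, 2, 1, 3}
 * with block_map_size = 4. Then BLOCK_MAP(1) == 2 and BLOCK_MAP_INV(2) == 1,
 * while an out-of-range index such as 5 first wraps to 5 % 4 == 1, so
 * BLOCK_MAP(5) == 2 as well.
 */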
static void _task_layout_display_masks(launch_tasks_request_msg_t *req,
				       const uint32_t *gtid,
				       const uint32_t maxtasks,
				       bitstr_t **masks)
{
	int i;
	char *str = NULL;

	for (i = 0; i < maxtasks; i++) {
		str = (char *)bit_fmt_hexmask(masks[i]);
		debug3("_task_layout_display_masks jobid [%u:%d] %s",
		       req->job_id, gtid[i], str);
		xfree(str);
	}
}

static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks)
{
	int i;
	bitstr_t *bitmask;

	for (i = 0; i < maxtasks; i++) {
		bitmask = masks[i];
		FREE_NULL_BITMAP(bitmask);
	}
	xfree(masks);
}

#ifdef HAVE_NUMA
/* _match_masks_to_ldom
 *
 * Expand each mask to encompass the whole locality domain
 * within which it currently exists.
 * NOTE: this assumes that the masks are already in logical
 * (and not abstract) CPU order.
 */
static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks)
{
	uint32_t i, b, size;

	if (!masks || !masks[0])
		return;
	size = bit_size(masks[0]);
	for (i = 0; i < maxtasks; i++) {
		for (b = 0; b < size; b++) {
			if (bit_test(masks[i], b)) {
				/* get the NUMA node for this CPU, and then
				 * set all CPUs in the mask that exist in
				 * the same locality domain */
				int c;
				uint16_t nnid = slurm_get_numa_node(b);
				for (c = 0; c < size; c++) {
					if (slurm_get_numa_node(c) == nnid)
						bit_set(masks[i], c);
				}
			}
		}
	}
}
#endif
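/*
 * Illustration (hypothetical): with two locality domains of four CPUs each
 * (CPUs 0-3 on NUMA node 0, CPUs 4-7 on NUMA node 1), a task mask of 0x02
 * (CPU 1 only) expands to 0x0F, since every CPU on NUMA node 0 is added.
 */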
/*
 * batch_bind - Set the batch request message so as to bind the shell to the
 *	proper resources
 */
void batch_bind(batch_job_launch_msg_t *req)
{
	bitstr_t *req_map, *hw_map;
	slurm_cred_arg_t arg;
	uint16_t sockets = 0, cores = 0, num_cpus;
	int start, task_cnt = 0;

	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
		error("task/affinity: job lacks a credential");
		return;
	}
	start = _get_local_node_info(&arg, 0, &sockets, &cores);
	if (start != 0) {
		error("task/affinity: missing node 0 in job credential");
		slurm_cred_free_args(&arg);
		return;
	}
	if ((sockets * cores) == 0) {
		error("task/affinity: socket and core count both zero");
		slurm_cred_free_args(&arg);
		return;
	}

	num_cpus = MIN((sockets * cores), (conf->sockets * conf->cores));
	req_map = (bitstr_t *) bit_alloc(num_cpus);
	hw_map  = (bitstr_t *) bit_alloc(conf->block_map_size);

#ifdef HAVE_FRONT_END
{
	/* Since the front-end nodes are a shared resource, we limit each job
	 * to one CPU based upon monotonically increasing sequence number */
	static int last_id = 0;
	bit_set(hw_map, ((last_id++) % conf->block_map_size));
	task_cnt = 1;
}
#else
{
	char *str;
	int t, p;

	/* Transfer core_bitmap data to local req_map.
	 * The MOD function handles the case where fewer processors
	 * physically exist than are configured (slurmd is out of
	 * sync with the slurmctld daemon). */
	for (p = 0; p < (sockets * cores); p++) {
		if (bit_test(arg.job_core_bitmap, p))
			bit_set(req_map, (p % num_cpus));
	}

	str = (char *)bit_fmt_hexmask(req_map);
	debug3("task/affinity: job %u core mask from slurmctld: %s",
	       req->job_id, str);
	xfree(str);

	for (p = 0; p < num_cpus; p++) {
		if (bit_test(req_map, p) == 0)
			continue;
		/* core_bitmap does not include threads, so we
		 * add them here but limit them to what the job
		 * requested */
		for (t = 0; t < conf->threads; t++) {
			uint16_t pos = p * conf->threads + t;
			if (pos >= conf->block_map_size) {
				info("more resources configured than exist");
				p = num_cpus;
				break;
			}
			bit_set(hw_map, pos);
			task_cnt++;
		}
	}
}
#endif
	if (task_cnt) {
		req->cpu_bind_type = CPU_BIND_MASK;
		if (conf->task_plugin_param & CPU_BIND_VERBOSE)
			req->cpu_bind_type |= CPU_BIND_VERBOSE;
		xfree(req->cpu_bind);
		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
		info("task/affinity: job %u CPU input mask for node: %s",
		     req->job_id, req->cpu_bind);
		/* translate abstract masks to actual hardware layout */
		_lllp_map_abstract_masks(1, &hw_map);
#ifdef HAVE_NUMA
		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
			_match_masks_to_ldom(1, &hw_map);
		}
#endif
		xfree(req->cpu_bind);
		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
		info("task/affinity: job %u CPU final HW mask for node: %s",
		     req->job_id, req->cpu_bind);
	} else {
		error("task/affinity: job %u allocated no CPUs",
		      req->job_id);
	}
	FREE_NULL_BITMAP(hw_map);
	FREE_NULL_BITMAP(req_map);
	slurm_cred_free_args(&arg);
}

/* The job has specialized cores, synchronize user map with available cores */
static void _validate_map(launch_tasks_request_msg_t *req, char *avail_mask)
{
	char *tmp_map, *save_ptr = NULL, *tok;
	cpu_set_t avail_cpus;
	bool superset = true;

	CPU_ZERO(&avail_cpus);
	(void) task_str_to_cpuset(&avail_cpus, avail_mask);
	tmp_map = xstrdup(req->cpu_bind);
	tok = strtok_r(tmp_map, ",", &save_ptr);
	while (tok) {
		int i = atoi(tok);
		if (!CPU_ISSET(i, &avail_cpus)) {
			/* The task's CPU map names a CPU outside of the
			 * job step allocation.  Disable the user CPU map. */
			superset = false;
			break;
		}
		tok = strtok_r(NULL, ",", &save_ptr);
	}
	xfree(tmp_map);

	if (!superset) {
		info("task/affinity: Ignoring user CPU binding outside of job "
		     "step allocation");
		req->cpu_bind_type &= (~CPU_BIND_MAP);
		req->cpu_bind_type |=   CPU_BIND_MASK;
		xfree(req->cpu_bind);
		req->cpu_bind = xstrdup(avail_mask);
	}
}
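/*
 * Illustration (hypothetical): if avail_mask is "0xF" (CPUs 0-3) and the
 * user requested --cpu-bind=map_cpu:0,9, CPU 9 falls outside the
 * allocation, so the whole map is discarded and the request falls back to
 * CPU_BIND_MASK over avail_mask.
 */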
/* The job has specialized cores, synchronize user mask with available cores */
static void _validate_mask(launch_tasks_request_msg_t *req, char *avail_mask)
{
	char *new_mask = NULL, *save_ptr = NULL, *tok;
	cpu_set_t avail_cpus, task_cpus;
	bool superset = true;

	CPU_ZERO(&avail_cpus);
	(void) task_str_to_cpuset(&avail_cpus, avail_mask);
	tok = strtok_r(req->cpu_bind, ",", &save_ptr);
	while (tok) {
		int i, overlaps = 0;
		char mask_str[1 + CPU_SETSIZE / 4];

		CPU_ZERO(&task_cpus);
		(void) task_str_to_cpuset(&task_cpus, tok);
		for (i = 0; i < CPU_SETSIZE; i++) {
			if (!CPU_ISSET(i, &task_cpus))
				continue;
			if (CPU_ISSET(i, &avail_cpus)) {
				overlaps++;
			} else {
				CPU_CLR(i, &task_cpus);
				superset = false;
			}
		}
		if (overlaps == 0) {
			/* The task's CPU mask is completely invalid.
			 * Give it all allowed CPUs. */
			for (i = 0; i < CPU_SETSIZE; i++) {
				if (CPU_ISSET(i, &avail_cpus))
					CPU_SET(i, &task_cpus);
			}
		}
		task_cpuset_to_str(&task_cpus, mask_str);
		if (new_mask)
			xstrcat(new_mask, ",");
		xstrcat(new_mask, mask_str);
		tok = strtok_r(NULL, ",", &save_ptr);
	}

	if (!superset) {
		info("task/affinity: Ignoring user CPU binding outside of job "
		     "step allocation");
	}

	xfree(req->cpu_bind);
	req->cpu_bind = new_mask;
}
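/*
 * Illustration (hypothetical): with avail_mask "0xF" and a user mask list
 * "0x3,0xF0", the first entry is kept as 0x3, while the second has no
 * overlap with the allocation after clipping; since no bits survive, that
 * task instead receives the full allowed set 0xF, and a warning is logged
 * because bits had to be dropped.
 */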
/*
 * lllp_distribution
 *
 * Note: lllp stands for Lowest Level of Logical Processors.
 *
 * When automatic binding is enabled:
 *	- no binding flags set >= CPU_BIND_NONE, and
 *	- an auto binding level selected CPU_BIND_TO_{SOCKETS,CORES,THREADS}
 * Otherwise limit the job step to the allocated CPUs.
 *
 * Generate the appropriate cpu_bind type and string which results in
 * the specified lllp distribution.
 *
 * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
 * IN- global task id array
 */
void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
{
	int rc = SLURM_SUCCESS;
	bitstr_t **masks = NULL;
	char buf_type[100];
	int maxtasks = req->tasks_to_launch[(int)node_id];
	int whole_nodes, whole_sockets, whole_cores, whole_threads;
	int part_sockets, part_cores;
	const uint32_t *gtid = req->global_task_ids[(int)node_id];
	static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
				      CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
	static uint16_t bind_mode = CPU_BIND_NONE | CPU_BIND_MASK |
				    CPU_BIND_RANK | CPU_BIND_MAP |
				    CPU_BIND_LDMASK | CPU_BIND_LDRANK |
				    CPU_BIND_LDMAP;
	static int only_one_thread_per_core = -1;

	if (only_one_thread_per_core == -1) {
		if (conf->cpus == (conf->sockets * conf->cores))
			only_one_thread_per_core = 1;
		else
			only_one_thread_per_core = 0;
	}

	/*
	 * If we are telling the system we only want to use 1 thread
	 * per core with the CPUs node option this is the easiest way
	 * to portray that to the affinity plugin.
	 */
	if (only_one_thread_per_core)
		req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE;

	if (req->cpu_bind_type & bind_mode) {
		/* Explicit step binding specified by user */
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes,  &whole_sockets,
					       &whole_cores,  &whole_threads,
					       &part_sockets, &part_cores);
		if (!avail_mask) {
			error("task/affinity: Could not determine allocated CPUs");
		} else if ((whole_nodes == 0) &&
			   (req->job_core_spec == NO_VAL16)) {
			info("task/affinity: entire node must be allocated, "
			     "disabling affinity");
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type &= (~bind_mode);
			req->cpu_bind_type |=   CPU_BIND_MASK;
		} else {
			if (req->job_core_spec == NO_VAL16) {
				if (req->cpu_bind_type & CPU_BIND_MASK)
					_validate_mask(req, avail_mask);
				else if (req->cpu_bind_type & CPU_BIND_MAP)
					_validate_map(req, avail_mask);
			}
			xfree(avail_mask);
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] manual binding: %s",
		     req->job_id, buf_type);
		return;
	}

	if (!(req->cpu_bind_type & bind_entity)) {
		/*
		 * No bind unit (sockets, cores) specified by user,
		 * pick something reasonable
		 */
		uint32_t task_plugin_param = slurm_get_task_plugin_param();
		bool auto_def_set = false;
		int spec_thread_cnt = 0;
		int max_tasks = req->tasks_to_launch[(int)node_id] *
				req->cpus_per_task;
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes,  &whole_sockets,
					       &whole_cores,  &whole_threads,
					       &part_sockets, &part_cores);
		debug("binding tasks:%d to "
		      "nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
		      max_tasks, whole_nodes, whole_sockets, part_sockets,
		      whole_cores, part_cores, whole_threads);
		if ((req->job_core_spec != NO_VAL16) &&
		    (req->job_core_spec &  CORE_SPEC_THREAD) &&
		    (req->job_core_spec != CORE_SPEC_THREAD)) {
			spec_thread_cnt = req->job_core_spec &
					  (~CORE_SPEC_THREAD);
		}
		if (((max_tasks == whole_sockets) && (part_sockets == 0)) ||
		    (spec_thread_cnt &&
		     (max_tasks == (whole_sockets + part_sockets)))) {
			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
			goto make_auto;
		}
		if (((max_tasks == whole_cores) && (part_cores == 0)) ||
		    (spec_thread_cnt &&
		     (max_tasks == (whole_cores + part_cores)))) {
			req->cpu_bind_type |= CPU_BIND_TO_CORES;
			goto make_auto;
		}
		if (max_tasks == whole_threads) {
			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
			goto make_auto;
		}

		if (task_plugin_param & CPU_AUTO_BIND_TO_THREADS) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
			goto make_auto;
		} else if (task_plugin_param & CPU_AUTO_BIND_TO_CORES) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_CORES;
			goto make_auto;
		} else if (task_plugin_param & CPU_AUTO_BIND_TO_SOCKETS) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
			goto make_auto;
		}

		if (avail_mask) {
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type |= CPU_BIND_MASK;
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] auto binding off: %s",
		     req->job_id, buf_type);
		return;

make_auto:
		xfree(avail_mask);
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] %s auto binding: "
		     "%s, dist %d", req->job_id,
		     (auto_def_set) ? "default" : "implicit",
		     buf_type, req->task_dist);
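		/*
		 * Illustration (hypothetical): a step launching 2 tasks on a
		 * node contributing 2 whole sockets (no partial sockets)
		 * selects CPU_BIND_TO_SOCKETS above; 16 tasks on 16 whole
		 * cores would select CPU_BIND_TO_CORES instead.
		 */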
"default" : "implicit", buf_type, req->task_dist); } else { /* Explicit bind unit (sockets, cores) specified by user */ slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); info("lllp_distribution jobid [%u] binding: %s, dist %d", req->job_id, buf_type, req->task_dist); } switch (req->task_dist & SLURM_DIST_NODESOCKMASK) { case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_PLANE: /* tasks are distributed in blocks within a plane */ rc = _task_layout_lllp_block(req, node_id, &masks); break; case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_UNKNOWN: if (slurm_get_select_type_param() & CR_CORE_DEFAULT_DIST_BLOCK) { rc = _task_layout_lllp_block(req, node_id, &masks); break; } /* * We want to fall through here if we aren't doing a * default dist block. */ default: rc = _task_layout_lllp_cyclic(req, node_id, &masks); break; } /* * FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS & * max_cores - does select/cons_res plugin allocate whole * socket??? Maybe not. Check srun man page. */ if (rc == SLURM_SUCCESS) { _task_layout_display_masks(req, gtid, maxtasks, masks); /* translate abstract masks to actual hardware layout */ _lllp_map_abstract_masks(maxtasks, masks); _task_layout_display_masks(req, gtid, maxtasks, masks); #ifdef HAVE_NUMA if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { _match_masks_to_ldom(maxtasks, masks); _task_layout_display_masks(req, gtid, maxtasks, masks); } #endif /* convert masks into cpu_bind mask string */ _lllp_generate_cpu_bind(req, maxtasks, masks); } else { char *avail_mask = _alloc_mask(req, &whole_nodes, &whole_sockets, &whole_cores, &whole_threads, &part_sockets, &part_cores); if (avail_mask) { xfree(req->cpu_bind); req->cpu_bind = avail_mask; req->cpu_bind_type &= (~bind_mode); req->cpu_bind_type |= CPU_BIND_MASK; } slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); error("lllp_distribution jobid [%u] overriding binding: %s", req->job_id, buf_type); error("Verify socket/core/thread counts in configuration"); } if (masks) _lllp_free_masks(maxtasks, masks); } /* * _get_local_node_info - get job allocation details for this node * IN: req - launch request structure * IN: job_node_id - index of the local node in the job allocation * IN/OUT: sockets - pointer to socket count variable * IN/OUT: cores - pointer to cores_per_socket count variable * OUT: returns the core_bitmap index of the first core for this node */ static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id, uint16_t *sockets, uint16_t *cores) { int bit_start = 0, bit_finish = 0; int i, index = -1, cur_node_id = -1; do { index++; for (i = 0; i < arg->sock_core_rep_count[index] && cur_node_id < job_node_id; i++) { bit_start = bit_finish; bit_finish += arg->sockets_per_node[index] * arg->cores_per_socket[index]; cur_node_id++; } } while (cur_node_id < job_node_id); *sockets = arg->sockets_per_node[index]; *cores = arg->cores_per_socket[index]; return bit_start; } /* * Determine which CPUs a job step can use. * OUT whole__count - returns count of whole in this * allocation for this node * OUT part___count - returns count of partial in this * allocation for this node * RET - a string representation of the available mask or NULL on error * NOTE: Caller must xfree() the return value. 
/*
 * Determine which CPUs a job step can use.
 * OUT whole_*_cnt - returns the count of whole nodes, sockets, cores and
 *		     threads in this allocation for this node
 * OUT part_*_cnt  - returns the count of partial sockets and cores in this
 *		     allocation for this node
 * RET - a string representation of the available mask or NULL on error
 * NOTE: Caller must xfree() the return value.
 */
static char *_alloc_mask(launch_tasks_request_msg_t *req,
			 int *whole_node_cnt,  int *whole_socket_cnt,
			 int *whole_core_cnt,  int *whole_thread_cnt,
			 int *part_socket_cnt, int *part_core_cnt)
{
	uint16_t sockets, cores, threads;
	int c, s, t, i;
	int c_miss, s_miss, t_miss, c_hit, t_hit;
	bitstr_t *alloc_bitmap;
	char *str_mask;
	bitstr_t *alloc_mask;

	*whole_node_cnt   = 0;
	*whole_socket_cnt = 0;
	*whole_core_cnt   = 0;
	*whole_thread_cnt = 0;
	*part_socket_cnt  = 0;
	*part_core_cnt    = 0;

	alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
	if (!alloc_bitmap)
		return NULL;

	alloc_mask = bit_alloc(bit_size(alloc_bitmap));

	i = 0;
	for (s = 0, s_miss = false; s < sockets; s++) {
		for (c = 0, c_hit = c_miss = false; c < cores; c++) {
			for (t = 0, t_hit = t_miss = false; t < threads; t++) {
				/*
				 * If we are pretending we have a larger
				 * system than we really have this is needed
				 * to make sure we don't bust the bank.
				 */
				if (i >= bit_size(alloc_bitmap))
					i = 0;
				if (bit_test(alloc_bitmap, i)) {
					bit_set(alloc_mask, i);
					(*whole_thread_cnt)++;
					t_hit = true;
					c_hit = true;
				} else
					t_miss = true;
				i++;
			}
			if (!t_miss)
				(*whole_core_cnt)++;
			else {
				if (t_hit)
					(*part_core_cnt)++;
				c_miss = true;
			}
		}
		if (!c_miss)
			(*whole_socket_cnt)++;
		else {
			if (c_hit)
				(*part_socket_cnt)++;
			s_miss = true;
		}
	}
	if (!s_miss)
		(*whole_node_cnt)++;
	FREE_NULL_BITMAP(alloc_bitmap);

	if ((req->job_core_spec != NO_VAL16) &&
	    (req->job_core_spec &  CORE_SPEC_THREAD) &&
	    (req->job_core_spec != CORE_SPEC_THREAD)) {
		int spec_thread_cnt;
		spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
		for (t = threads - 1;
		     ((t > 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c > 0) && (spec_thread_cnt > 0)); c--) {
				for (s = sockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_clear(alloc_mask, i);
					spec_thread_cnt--;
				}
			}
		}
	}

	/* translate abstract masks to actual hardware layout */
	_lllp_map_abstract_masks(1, &alloc_mask);

#ifdef HAVE_NUMA
	if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
		_match_masks_to_ldom(1, &alloc_mask);
	}
#endif

	str_mask = bit_fmt_hexmask(alloc_mask);
	FREE_NULL_BITMAP(alloc_mask);
	return str_mask;
}
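/*
 * Illustration (hypothetical): on a 2-socket node with 2 cores per socket
 * and 2 threads per core, an allocation covering all of socket 0 plus one
 * thread of socket 1 yields whole_socket_cnt = 1, whole_core_cnt = 2,
 * whole_thread_cnt = 5, part_socket_cnt = 1, part_core_cnt = 1 and
 * whole_node_cnt = 0.
 */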
/*
 * Given a job step request, return an equivalent local bitmap for this node
 * IN req - The job step launch request
 * OUT hw_sockets - number of actual sockets on this node
 * OUT hw_cores   - number of actual cores per socket on this node
 * OUT hw_threads - number of actual threads per core on this node
 * RET: bitmap of processors available to this job step on this node
 *	or NULL on error
 */
static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
				uint16_t *hw_sockets, uint16_t *hw_cores,
				uint16_t *hw_threads)
{
	bitstr_t *req_map, *hw_map;
	slurm_cred_arg_t arg;
	uint16_t p, t, new_p, num_cpus, sockets, cores;
	int job_node_id;
	int start;
	char *str;
	int spec_thread_cnt = 0;

	*hw_sockets = conf->sockets;
	*hw_cores   = conf->cores;
	*hw_threads = conf->threads;

	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
		error("task/affinity: job lacks a credential");
		return NULL;
	}

	/* we need this node's ID in relation to the whole
	 * job allocation, not just this jobstep */
	job_node_id = nodelist_find(arg.job_hostlist, conf->node_name);
	start = _get_local_node_info(&arg, job_node_id, &sockets, &cores);
	if (start < 0) {
		error("task/affinity: missing node %d in job credential",
		      job_node_id);
		slurm_cred_free_args(&arg);
		return NULL;
	}
	debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u",
	       sockets, cores, *hw_sockets, *hw_cores, *hw_threads);

	num_cpus = MIN((sockets * cores), ((*hw_sockets) * (*hw_cores)));
	req_map = (bitstr_t *) bit_alloc(num_cpus);
	hw_map  = (bitstr_t *) bit_alloc(conf->block_map_size);

	/* Transfer core_bitmap data to local req_map.
	 * The MOD function handles the case where fewer processors
	 * physically exist than are configured (slurmd is out of
	 * sync with the slurmctld daemon). */
	for (p = 0; p < (sockets * cores); p++) {
		if (bit_test(arg.step_core_bitmap, start + p))
			bit_set(req_map, (p % num_cpus));
	}

	str = (char *)bit_fmt_hexmask(req_map);
	debug3("task/affinity: job %u.%u core mask from slurmctld: %s",
	       req->job_id, req->job_step_id, str);
	xfree(str);

	for (p = 0; p < num_cpus; p++) {
		if (bit_test(req_map, p) == 0)
			continue;
		/* If we are pretending we have a larger system than
		 * we really have this is needed to make sure we
		 * don't bust the bank. */
		new_p = p % conf->block_map_size;
		/* core_bitmap does not include threads, so we
		 * add them here but limit them to what the job
		 * requested */
		for (t = 0; t < (*hw_threads); t++) {
			uint16_t bit = new_p * (*hw_threads) + t;
			bit %= conf->block_map_size;
			bit_set(hw_map, bit);
		}
	}

	if ((req->job_core_spec != NO_VAL16) &&
	    (req->job_core_spec &  CORE_SPEC_THREAD) &&
	    (req->job_core_spec != CORE_SPEC_THREAD)) {
		spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
	}
	if (spec_thread_cnt) {
		/* Skip specialized threads as needed */
		int i, t, c, s;
		for (t = conf->threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = conf->cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = conf->sockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * conf->cores + c;
					i = (i * conf->threads) + t;
					bit_clear(hw_map, i);
					spec_thread_cnt--;
				}
			}
		}
	}

	str = (char *)bit_fmt_hexmask(hw_map);
	debug3("task/affinity: job %u.%u CPU final mask for local node: %s",
	       req->job_id, req->job_step_id, str);
	xfree(str);

	FREE_NULL_BITMAP(req_map);
	slurm_cred_free_args(&arg);
	return hw_map;
}
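/*
 * Illustration (hypothetical): with 2 threads per core, a core mask of 0x3
 * from the credential (cores 0 and 1) expands to the hardware map 0xF
 * (threads 0-3), since each allocated core contributes all of its threads.
 */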
/* helper function for _expand_masks() */
static void _blot_mask(bitstr_t *mask, bitstr_t *avail_map, uint16_t blot)
{
	uint16_t i, j, size = 0;
	int prev = -1;

	if (!mask)
		return;
	size = bit_size(mask);
	for (i = 0; i < size; i++) {
		if (bit_test(mask, i)) {
			/* fill in this blot */
			uint16_t start = (i / blot) * blot;
			if (start != prev) {
				for (j = start; j < start + blot; j++) {
					if (bit_test(avail_map, j))
						bit_set(mask, j);
				}
				prev = start;
			}
		}
	}
}

/* helper function for _expand_masks()
 * for each task, consider which other bits are set in avail_map
 * on the same socket */
static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task,
			       bitstr_t **masks, uint16_t hw_sockets,
			       uint16_t hw_cores, uint16_t hw_threads,
			       bitstr_t *avail_map)
{
	uint16_t i, j, size = 0;
	int blot;

	if (!masks[task])
		return;
	blot = bit_size(avail_map) / hw_sockets;
	if (blot <= 0)
		blot = 1;
	size = bit_size(masks[task]);
	for (i = 0; i < size; i++) {
		if (bit_test(masks[task], i)) {
			/* check if other bits are set in avail_map on this
			 * socket and set each corresponding bit in masks */
			uint16_t start = (i / blot) * blot;
			for (j = start; j < start + blot; j++) {
				if (bit_test(avail_map, j))
					bit_set(masks[task], j);
			}
		}
	}
}

/* for each mask, expand the mask around the set bits to include the
 * complete resource to which the set bits are to be bound */
static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks,
			  bitstr_t **masks, uint16_t hw_sockets,
			  uint16_t hw_cores, uint16_t hw_threads,
			  bitstr_t *avail_map)
{
	uint32_t i;

	if (cpu_bind_type & CPU_BIND_TO_THREADS)
		return;
	if (cpu_bind_type & CPU_BIND_TO_CORES) {
		if (hw_threads < 2)
			return;
		for (i = 0; i < maxtasks; i++) {
			_blot_mask(masks[i], avail_map, hw_threads);
		}
		return;
	}
	if (cpu_bind_type & CPU_BIND_TO_SOCKETS) {
		if (hw_threads * hw_cores < 2)
			return;
		for (i = 0; i < maxtasks; i++) {
			_blot_mask_sockets(maxtasks, i, masks, hw_sockets,
					   hw_cores, hw_threads, avail_map);
		}
		return;
	}
}
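/*
 * Illustration (hypothetical): binding to cores on hardware with 2 threads
 * per core, a task mask of 0x4 (thread 0 of core 1) is blotted to 0xC
 * (both threads of core 1), provided both bits are set in avail_map.
 */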
*/ debug("allocation is full, " "oversubscribing"); memset(core_tasks, 0, (sizeof(int) * hw_sockets * hw_cores)); memset(socket_last_pu, 0, (sizeof(int) * hw_sockets)); } } bit = socket_last_pu[s] + (s * offset); /* In case hardware and config differ */ bit %= size; /* set up for the next one */ socket_last_pu[s]++; /* skip unrequested threads */ if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) socket_last_pu[s] += hw_threads - 1; if (!bit_test(avail_map, bit)) continue; core_inx = bit / pu_per_core; if ((req->ntasks_per_core != 0) && (core_tasks[core_inx] >= req->ntasks_per_core)) continue; if (!masks[taskcount]) masks[taskcount] = bit_alloc(conf->block_map_size); //info("setting %d %d", taskcount, bit); bit_set(masks[taskcount], bit); if (!already_switched && (((req->task_dist & SLURM_DIST_NODESOCKMASK) == SLURM_DIST_CYCLIC_CFULL) || ((req->task_dist & SLURM_DIST_NODESOCKMASK) == SLURM_DIST_BLOCK_CFULL))) { /* This means we are laying out cpus * within a task cyclically as well. */ s = (s + 1) % hw_sockets; already_switched = true; } if (++p < req->cpus_per_task) continue; core_tasks[core_inx]++; /* Binding to cores, skip remaining of the threads */ if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) && ((req->cpu_bind_type & CPU_BIND_TO_CORES) || (req->ntasks_per_core == 1))) { int threads_not_used; if (req->cpus_per_task < hw_threads) threads_not_used = hw_threads - req->cpus_per_task; else threads_not_used = req->cpus_per_task % hw_threads; socket_last_pu[s] += threads_not_used; } p = 0; if (!already_switched) { /* Now that we have finished a task, switch to * the next socket. */ s = (s + 1) % hw_sockets; } if (++taskcount >= max_tasks) break; } } /* last step: expand the masks to bind each task * to the requested resource */ _expand_masks(req->cpu_bind_type, max_tasks, masks, hw_sockets, hw_cores, hw_threads, avail_map); FREE_NULL_BITMAP(avail_map); xfree(core_tasks); xfree(socket_last_pu); return SLURM_SUCCESS; } /* * _task_layout_lllp_block * * task_layout_lllp_block will create a block distribution at the * lowest level of logical processor which is either socket, core or * thread depending on the system architecture. The Block algorithm * is the same as the Block distribution performed in srun. * * Distribution at the lllp: * -m hostfile|plane|block|cyclic:block|cyclic * * The first distribution "hostfile|plane|block|cyclic" is computed * in srun. The second distribution "plane|block|cyclic" is computed * locally by each slurmd. * * The input to the lllp distribution algorithms is the gids (tasks * ids) generated for the local node. * * The output is a mapping of the gids onto logical processors * (thread/core/socket) with is expressed cpu_bind masks. 
/*
 * _task_layout_lllp_block
 *
 * task_layout_lllp_block will create a block distribution at the
 * lowest level of logical processor, which is either socket, core or
 * thread depending on the system architecture. The Block algorithm
 * is the same as the Block distribution performed in srun.
 *
 * Distribution at the lllp:
 * -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (task
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket), which is expressed as cpu_bind masks.
 */
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p)
{
	int c, i, size, last_taskcount = -1, taskcount = 0;
	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;
	int core_inx, pu_per_core, *core_tasks = NULL;
	int sock_inx, pu_per_socket, *socket_tasks = NULL;

	info("_task_layout_lllp_block ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map) {
		return SLURM_ERROR;
	}

	size = bit_set_count(avail_map);
	if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
	    (size < (req->cpus_per_task * hw_threads))) {
		error("task/affinity: only %d bits in avail_map, CPU_BIND_ONE_THREAD_PER_CORE requires %d!",
		      size, (req->cpus_per_task * hw_threads));
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}
	size = bit_size(avail_map);

	if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
	    (max_cpus > (hw_sockets * hw_cores))) {
		/* More CPUs requested than available cores,
		 * disable core-level binding */
		req->cpu_bind_type &= (~CPU_BIND_ONE_THREAD_PER_CORE);
	}

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t *));
	masks = *masks_p;

	pu_per_core = hw_threads;
	core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
	pu_per_socket = hw_cores * hw_threads;
	socket_tasks = xmalloc(sizeof(int) * hw_sockets);

	/* block distribution with oversubscription */
	c = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_block infinite loop");
		if (taskcount > 0) {
			/* Clear counters to over-subscribe, if necessary */
			memset(core_tasks, 0,
			       (sizeof(int) * hw_sockets * hw_cores));
			memset(socket_tasks, 0,
			       (sizeof(int) * hw_sockets));
		}
		last_taskcount = taskcount;
		/* the abstract map is already laid out in block order,
		 * so just iterate over it */
		for (i = 0; i < size; i++) {
			/* skip unavailable resources */
			if (bit_test(avail_map, i) == 0)
				continue;

			core_inx = i / pu_per_core;
			if ((req->ntasks_per_core != 0) &&
			    (core_tasks[core_inx] >= req->ntasks_per_core))
				continue;
			sock_inx = i / pu_per_socket;
			if ((req->ntasks_per_socket != 0) &&
			    (socket_tasks[sock_inx] >=
			     req->ntasks_per_socket))
				continue;

			if (!masks[taskcount])
				masks[taskcount] = bit_alloc(
					conf->block_map_size);
			//info("setting %d %d", taskcount, i);
			bit_set(masks[taskcount], i);

			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				i += hw_threads - 1;

			if (++c < req->cpus_per_task)
				continue;

			/* We found one!  Increment the count on each unit */
			core_tasks[core_inx]++;
			socket_tasks[sock_inx]++;

			/* Binding to cores: skip the remaining threads */
			if (!(req->cpu_bind_type &
			      CPU_BIND_ONE_THREAD_PER_CORE) &&
			    ((req->cpu_bind_type & CPU_BIND_TO_CORES) ||
			     (req->ntasks_per_core == 1))) {
				int threads_not_used;
				if (req->cpus_per_task < hw_threads)
					threads_not_used =
						hw_threads -
						req->cpus_per_task;
				else
					threads_not_used =
						req->cpus_per_task %
						hw_threads;
				i += threads_not_used;
			}
			c = 0;

			if (++taskcount >= max_tasks)
				break;
		}
	}
	xfree(core_tasks);
	xfree(socket_tasks);

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
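/*
 * Illustration (hypothetical): the same 4 single-CPU tasks on 2 sockets
 * with 2 cores each (1 thread per core) are laid out in block order as
 *	task 0 -> socket 0, core 0	task 1 -> socket 0, core 1
 *	task 2 -> socket 1, core 0	task 3 -> socket 1, core 1
 * i.e. socket 0 is filled completely before socket 1 receives a task.
 */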
/*
 * _lllp_map_abstract_mask
 *
 * Map one abstract block mask to a physical machine mask
 *
 * IN - mask to map
 * OUT - mapped mask (storage allocated in this routine)
 */
static bitstr_t *_lllp_map_abstract_mask(bitstr_t *bitmask)
{
	int i, bit;
	int num_bits = bit_size(bitmask);
	bitstr_t *newmask = NULL;

	newmask = (bitstr_t *) bit_alloc(num_bits);

	/* remap to physical machine */
	for (i = 0; i < num_bits; i++) {
		if (bit_test(bitmask, i)) {
			bit = BLOCK_MAP(i);
			if (bit < bit_size(newmask))
				bit_set(newmask, bit);
			else
				error("%s: can't go from %d -> %d since we "
				      "only have %"BITSTR_FMT" bits",
				      __func__, i, bit, bit_size(newmask));
		}
	}
	return newmask;
}

/*
 * _lllp_map_abstract_masks
 *
 * Map an array of abstract block masks to physical machine masks
 *
 * IN- maximum number of tasks
 * IN/OUT- array of masks
 */
static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks)
{
	int i;

	debug3("_lllp_map_abstract_masks");

	for (i = 0; i < maxtasks; i++) {
		bitstr_t *bitmask = masks[i];
		if (bitmask) {
			bitstr_t *newmask = _lllp_map_abstract_mask(bitmask);
			FREE_NULL_BITMAP(bitmask);
			masks[i] = newmask;
		}
	}
}
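/*
 * Illustration (hypothetical): with conf->block_map = {0, 2, 1, 3}, an
 * abstract mask 0x6 (bits 1 and 2) maps to the machine mask 0x6 as well
 * (bit 1 -> 2 and bit 2 -> 1), while an abstract mask 0x2 maps to 0x4.
 */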
info("_lllp_generate_cpu_bind jobid [%u]: %s, %s", req->job_id, buf_type, masks_str); }