/*****************************************************************************\
 *  job_test.c - Determine if job can be allocated resources.
 *****************************************************************************
 *  Copyright (C) 2018 SchedMD LLC
 *  Derived in large part from select/cons_res plugin
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include <string.h>

#include "select_cons_tres.h"
#include "dist_tasks.h"
#include "job_test.h"

#define _DEBUG 0	/* Enables module specific debugging */

typedef struct node_weight_struct {
    bitstr_t *node_bitmap;	/* bitmap of nodes with this weight */
    uint32_t weight;		/* priority of node for scheduling work on */
} node_weight_type;

typedef struct topo_weight_info {
    bitstr_t *node_bitmap;
    int node_cnt;
    uint64_t weight;
} topo_weight_info_t;

/* Local functions */
static List _build_node_weight_list(bitstr_t *node_bitmap);
static void _cpus_to_use(uint16_t *avail_cpus, int64_t rem_max_cpus,
                         int rem_nodes, struct job_details *details_ptr,
                         avail_res_t *avail_res, int node_inx,
                         uint16_t cr_type);
static bool _enough_nodes(int avail_nodes, int rem_nodes,
                          uint32_t min_nodes, uint32_t req_nodes);
static int _eval_nodes(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
                       bitstr_t *node_map, bitstr_t **avail_core,
                       uint32_t min_nodes, uint32_t max_nodes,
                       uint32_t req_nodes, avail_res_t **avail_res_array,
                       uint16_t cr_type, bool prefer_alloc_nodes,
                       bool first_pass);
static int _eval_nodes_busy(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
                            bitstr_t *node_map, bitstr_t **avail_core,
                            uint32_t min_nodes, uint32_t max_nodes,
                            uint32_t req_nodes,
                            avail_res_t **avail_res_array, uint16_t cr_type,
                            bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_dfly(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
                            bitstr_t *node_map, bitstr_t **avail_core,
                            uint32_t min_nodes, uint32_t max_nodes,
                            uint32_t req_nodes,
                            avail_res_t **avail_res_array, uint16_t cr_type,
                            bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_lln(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
                           bitstr_t *node_map, bitstr_t **avail_core,
                           uint32_t min_nodes, uint32_t max_nodes,
                           uint32_t req_nodes,
                           avail_res_t **avail_res_array, uint16_t cr_type,
                           bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_serial(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
                              bitstr_t *node_map, bitstr_t **avail_core,
                              uint32_t min_nodes, uint32_t max_nodes,
                              uint32_t req_nodes,
                              avail_res_t **avail_res_array, uint16_t cr_type,
                              bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_spread(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
                              bitstr_t *node_map, bitstr_t **avail_core,
                              uint32_t min_nodes, uint32_t max_nodes,
                              uint32_t req_nodes,
                              avail_res_t **avail_res_array, uint16_t cr_type,
                              bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_topo(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
                            bitstr_t *node_map, bitstr_t **avail_core,
                            uint32_t min_nodes, uint32_t max_nodes,
                            uint32_t req_nodes,
                            avail_res_t **avail_res_array, uint16_t cr_type,
                            bool prefer_alloc_nodes, bool first_pass);
static int _node_weight_find(void *x, void *key);
static void _node_weight_free(void *x);
static int _node_weight_sort(void *x, void *y);

/* Find node_weight_type element from list with same weight as node config */
static int _node_weight_find(void *x, void *key)
{
    node_weight_type *nwt = (node_weight_type *) x;
    config_record_t *config_ptr = (config_record_t *) key;
    if (nwt->weight == config_ptr->weight)
        return 1;
    return 0;
}

/* Free node_weight_type element from list */
static void _node_weight_free(void *x)
{
    node_weight_type *nwt = (node_weight_type *) x;
    bit_free(nwt->node_bitmap);
    xfree(nwt);
}

/* Sort list of node_weight_type records in order of increasing node weight */
static int _node_weight_sort(void *x, void *y)
{
    node_weight_type *nwt1 = *(node_weight_type **) x;
    node_weight_type
*nwt2 = *(node_weight_type **) y; return (int) (nwt1->weight - nwt2->weight); } /* * Given a bitmap of available nodes, return a list of node_weight_type * records in order of increasing "weight" (priority) */ static List _build_node_weight_list(bitstr_t *node_bitmap) { int i, i_first, i_last; List node_list; node_record_t *node_ptr; node_weight_type *nwt; xassert(node_bitmap); /* Build list of node_weight_type records, one per node weight */ node_list = list_create(_node_weight_free); i_first = bit_ffs(node_bitmap); if (i_first == -1) return node_list; i_last = bit_fls(node_bitmap); for (i = i_first; i <= i_last; i++) { if (!bit_test(node_bitmap, i)) continue; node_ptr = node_record_table_ptr + i; nwt = list_find_first(node_list, _node_weight_find, node_ptr->config_ptr); if (!nwt) { nwt = xmalloc(sizeof(node_weight_type)); nwt->node_bitmap = bit_alloc(select_node_cnt); nwt->weight = node_ptr->config_ptr->weight; list_append(node_list, nwt); } bit_set(nwt->node_bitmap, i); } /* Sort the list in order of increasing node weight */ list_sort(node_list, _node_weight_sort); return node_list; } /* Log avail_res_t information for a given node */ static void _avail_res_log(avail_res_t *avail_res, char *node_name) { #if _DEBUG int i; char *gres_info = ""; if (!avail_res) { info("Node:%s No resources", node_name); return; } info("Node:%s Sockets:%u SpecThreads:%u CPUs:Min-Max,Avail:%u-%u,%u VPUs:%u", node_name, avail_res->sock_cnt, avail_res->spec_threads, avail_res->min_cpus, avail_res->max_cpus, avail_res->avail_cpus, avail_res->vpus); gres_info = gres_plugin_sock_str(avail_res->sock_gres_list, -1); if (gres_info) { info(" AnySocket %s", gres_info); xfree(gres_info); } for (i = 0; i < avail_res->sock_cnt; i++) { gres_info = gres_plugin_sock_str(avail_res->sock_gres_list, i); if (gres_info) { info(" Socket[%d] Cores:%u GRES:%s", i, avail_res->avail_cores_per_sock[i], gres_info); xfree(gres_info); } else { info(" Socket[%d] Cores:%u", i, avail_res->avail_cores_per_sock[i]); } } #endif } /* * Determine how many CPUs on the node can be used based upon the resource * allocation unit (node, socket, core, etc.) and making sure that * resources will be available for nodes considered later in the * scheduling process * OUT avail_cpus - Count of CPUs to use on this node * IN rem_max_cpus - Maximum count of CPUs remaining to be allocated for job * IN rem_nodes - Count of nodes remaining to be allocated for job * IN details_ptr - Job details information * IN avail_res - Available resources for job on this node, contents updated * IN node_inx - Node index * IN cr_type - Resource allocation units (CR_CORE, CR_SOCKET, etc). */ static void _cpus_to_use(uint16_t *avail_cpus, int64_t rem_max_cpus, int rem_nodes, struct job_details *details_ptr, avail_res_t *avail_res, int node_inx, uint16_t cr_type) { int resv_cpus; /* CPUs to be allocated on other nodes */ if (details_ptr->whole_node == 1) /* Use all resources on node */ return; resv_cpus = MAX((rem_nodes - 1), 0); resv_cpus *= common_cpus_per_core(details_ptr, node_inx); if (cr_type & CR_SOCKET) resv_cpus *= select_node_record[node_inx].cores; rem_max_cpus -= resv_cpus; if (*avail_cpus > rem_max_cpus) { *avail_cpus = MAX(rem_max_cpus, (int)details_ptr->pn_min_cpus); *avail_cpus = MAX(*avail_cpus, details_ptr->min_gres_cpu); /* Round up CPU count to CPU in allocation unit (e.g. 
core) */ avail_res->avail_cpus = *avail_cpus; } avail_res->avail_res_cnt = avail_res->avail_cpus + avail_res->avail_gpus; } static bool _enough_nodes(int avail_nodes, int rem_nodes, uint32_t min_nodes, uint32_t req_nodes) { int needed_nodes; if (req_nodes > min_nodes) needed_nodes = rem_nodes + min_nodes - req_nodes; else needed_nodes = rem_nodes; return (avail_nodes >= needed_nodes); } /* * Identify the specific cores and GRES available to this job on this node. * The job's requirements for tasks-per-socket, cpus-per-task, etc. are * not considered at this point, but must be considered later. * IN job_ptr - job attempting to be scheduled * IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero * IN node_inx - zero-origin node index * IN max_nodes - maximum additional node count to allocate * IN rem_nodes - desired additional node count to allocate * IN avail_core - available core bitmap, UPDATED * IN avail_res_array - available resources on the node * IN first_pass - set if first scheduling attempt for this job, only use * co-located GRES and cores */ static void _select_cores(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bool enforce_binding, int node_inx, uint16_t *avail_cpus, uint32_t max_nodes, int rem_nodes, bitstr_t **avail_core, avail_res_t **avail_res_array, bool first_pass) { int alloc_tasks = 0; uint32_t min_tasks_this_node = 0, max_tasks_this_node = 0; struct job_details *details_ptr = job_ptr->details; rem_nodes = MIN(rem_nodes, 1); /* If range of node counts */ if (mc_ptr->ntasks_per_node) { min_tasks_this_node = mc_ptr->ntasks_per_node; max_tasks_this_node = mc_ptr->ntasks_per_node; } else if (mc_ptr->ntasks_per_board) { min_tasks_this_node = mc_ptr->ntasks_per_board; max_tasks_this_node = mc_ptr->ntasks_per_board * select_node_record[node_inx].boards; } else if (mc_ptr->ntasks_per_socket) { min_tasks_this_node = mc_ptr->ntasks_per_socket; max_tasks_this_node = mc_ptr->ntasks_per_socket * select_node_record[node_inx].tot_sockets; } else if (mc_ptr->ntasks_per_core) { min_tasks_this_node = mc_ptr->ntasks_per_core; max_tasks_this_node = mc_ptr->ntasks_per_core * select_node_record[node_inx].tot_cores; } else if (details_ptr && (details_ptr->max_nodes == 1)) { if ((details_ptr->num_tasks == NO_VAL) || (details_ptr->num_tasks == 0)) { min_tasks_this_node = 1; max_tasks_this_node = NO_VAL; } else { min_tasks_this_node = details_ptr->num_tasks; max_tasks_this_node = details_ptr->num_tasks; } } else if (details_ptr && ((details_ptr->num_tasks == 1) || ((details_ptr->num_tasks == details_ptr->min_nodes) && (details_ptr->num_tasks == details_ptr->max_nodes)))) { min_tasks_this_node = 1; max_tasks_this_node = 1; } else { min_tasks_this_node = 1; max_tasks_this_node = NO_VAL; } /* Determine how many tasks can be started on this node */ if (mc_ptr->cpus_per_task && (!details_ptr || !details_ptr->overcommit)) { alloc_tasks = avail_res_array[node_inx]->avail_cpus / mc_ptr->cpus_per_task; if (alloc_tasks < min_tasks_this_node) max_tasks_this_node = 0; } *avail_cpus = avail_res_array[node_inx]->avail_cpus; if (job_ptr->gres_list) { gres_plugin_job_core_filter3(mc_ptr, avail_res_array[node_inx]->sock_gres_list, avail_res_array[node_inx]->sock_cnt, select_node_record[node_inx].cores, select_node_record[node_inx].vpus, avail_cpus, &min_tasks_this_node, &max_tasks_this_node, rem_nodes, enforce_binding, first_pass, avail_core[node_inx]); } if (max_tasks_this_node == 0) { *avail_cpus = 0; } else if ((slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE) && 
((mc_ptr->ntasks_per_core == INFINITE16) || (mc_ptr->ntasks_per_core == 0)) && details_ptr && (details_ptr->min_gres_cpu == 0)) { *avail_cpus = bit_set_count(avail_core[node_inx]); } } /* * This is the heart of the selection process * IN job_ptr - job attempting to be scheduled * IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero * IN node_map - bitmap of available/selected nodes, UPDATED * IN avail_core - available core bitmap, UPDATED * IN min_nodes - minimum node allocation size in nodes * IN max_nodes - maximum node allocation size in nodes * IN: req_nodes - number of requested nodes * IN avail_res_array - available resources on the node * IN cr_type - allocation type (sockets, cores, etc.) * IN prefer_alloc_nodes - if set, prefer use of already allocated nodes * IN first_pass - set if first scheduling attempt for this job, be picky * RET SLURM_SUCCESS or an error code */ static int _eval_nodes(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, bool first_pass) { int i, j, error_code = SLURM_ERROR; int *consec_cpus; /* how many CPUs we can add from this * consecutive set of nodes */ List *consec_gres; /* how many GRES we can add from this * consecutive set of nodes */ int *consec_nodes; /* how many nodes we can add from this * consecutive set of nodes */ int *consec_start; /* where this consecutive set starts (index) */ int *consec_end; /* where this consecutive set ends (index) */ int *consec_req; /* are nodes from this set required * (in req_bitmap) */ uint64_t *consec_weight; /* node scheduling weight */ node_record_t *node_ptr = NULL; int consec_index, consec_size, sufficient; int rem_cpus, rem_nodes; /* remaining resources desired */ int min_rem_nodes; /* remaining resources desired */ int best_fit_nodes, best_fit_cpus, best_fit_req; int best_fit_sufficient, best_fit_index = 0; bool new_best; uint64_t best_weight = 0; uint16_t avail_cpus = 0; int64_t rem_max_cpus; int total_cpus = 0; /* #CPUs allocated to job */ bool gres_per_job, required_node; struct job_details *details_ptr = job_ptr->details; bitstr_t *req_map = details_ptr->req_node_bitmap; bool enforce_binding = false; uint16_t *avail_cpu_per_node = NULL; xassert(node_map); if (select_node_cnt != node_record_count) { error("%s: node count inconsistent with slurmctld (%u != %u)", plugin_type, select_node_cnt, node_record_count); return error_code; } if (bit_set_count(node_map) < min_nodes) return error_code; if ((details_ptr->req_node_bitmap) && (!bit_super_set(details_ptr->req_node_bitmap, node_map))) return error_code; if (job_ptr->bit_flags & SPREAD_JOB) { /* Spread the job out over many nodes */ return _eval_nodes_spread(job_ptr, mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, first_pass); } if (prefer_alloc_nodes && !details_ptr->contiguous) { /* * Select resource on busy nodes first in order to leave * idle resources free for as long as possible so that longer * running jobs can get more easily started by the backfill * scheduler plugin */ return _eval_nodes_busy(job_ptr, mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, first_pass); } if ((cr_type & CR_LLN) || (job_ptr->part_ptr && (job_ptr->part_ptr->flags & PART_FLAG_LLN))) { /* Select resource on the Least Loaded Node */ return _eval_nodes_lln(job_ptr, 
mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, first_pass); } if (pack_serial_at_end && (details_ptr->min_cpus == 1) && (req_nodes == 1)) { /* * Put serial jobs at the end of the available node list * rather than using a best-fit algorithm, which fragments * resources. */ return _eval_nodes_serial(job_ptr, mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, first_pass); } if (switch_record_cnt && switch_record_table && !details_ptr->contiguous && ((topo_optional == false) || job_ptr->req_switch)) { /* Perform optimized resource selection based upon topology */ if (have_dragonfly) { return _eval_nodes_dfly(job_ptr, mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, first_pass); } else { return _eval_nodes_topo(job_ptr, mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, first_pass); } } if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND)) enforce_binding = true; /* make allocation for 50 sets of consecutive nodes, expand as needed */ consec_size = 50; consec_cpus = xmalloc(sizeof(int) * consec_size); consec_nodes = xmalloc(sizeof(int) * consec_size); consec_start = xmalloc(sizeof(int) * consec_size); consec_end = xmalloc(sizeof(int) * consec_size); consec_req = xmalloc(sizeof(int) * consec_size); consec_weight = xmalloc(sizeof(uint64_t) * consec_size); /* Build table with information about sets of consecutive nodes */ consec_index = 0; consec_req[consec_index] = -1; /* no required nodes here by default */ consec_weight[consec_index] = NO_VAL64; avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt); rem_cpus = details_ptr->min_cpus; rem_max_cpus = details_ptr->max_cpus; min_rem_nodes = min_nodes; if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) { rem_nodes = MIN(min_nodes, req_nodes); consec_gres = xmalloc(sizeof(List) * consec_size); } else rem_nodes = MAX(min_nodes, req_nodes); /* * If there are required nodes, first determine the resources they * provide, then select additional resources as needed in next loop */ if (req_map) { int i_first, i_last; i_first = bit_ffs(req_map); if (i_first >= 0) { i_last = bit_fls(req_map); if (((i_last - i_first + 1) > max_nodes) && (bit_set_count(req_map) > max_nodes)) goto fini; } else i_last = i_first - 1; for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) { if (!bit_test(req_map, i)) continue; node_ptr = node_record_table_ptr + i; _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if (avail_cpus == 0) { debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } avail_cpu_per_node[i] = avail_cpus; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; max_nodes--; if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; bit_and(node_map, req_map); goto fini; } if (max_nodes <= 0) { error_code = SLURM_ERROR; goto fini; } } for (i = 0; i < select_node_cnt; i++) { /* For each node */ if ((consec_index + 1) >= 
            consec_size) {
            consec_size *= 2;
            xrealloc(consec_cpus, sizeof(int) * consec_size);
            xrealloc(consec_nodes, sizeof(int) * consec_size);
            xrealloc(consec_start, sizeof(int) * consec_size);
            xrealloc(consec_end, sizeof(int) * consec_size);
            xrealloc(consec_req, sizeof(int) * consec_size);
            xrealloc(consec_weight, sizeof(uint64_t) * consec_size);
            if (gres_per_job) {
                xrealloc(consec_gres, sizeof(List) * consec_size);
            }
        }
        if (req_map)
            required_node = bit_test(req_map, i);
        else
            required_node = false;
        if (!bit_test(node_map, i)) {
            node_ptr = NULL;	/* Use as flag, avoid second test */
        } else if (required_node) {
            node_ptr = node_record_table_ptr + i;
        } else {
            node_ptr = node_record_table_ptr + i;
            _select_cores(job_ptr, mc_ptr, enforce_binding, i,
                          &avail_cpus, max_nodes, min_rem_nodes,
                          avail_core, avail_res_array, first_pass);
            if (avail_cpus == 0) {
                bit_clear(node_map, i);
                node_ptr = NULL;
            }
            avail_cpu_per_node[i] = avail_cpus;
        }
        /*
         * If job requested contiguous nodes,
         * do not worry about matching node weights
         */
        if (node_ptr &&
            !details_ptr->contiguous &&
            (consec_weight[consec_index] != NO_VAL64) && /* Init value */
            (node_ptr->sched_weight != consec_weight[consec_index])) {
            /* End last consecutive set, setup start of next set */
            if (consec_nodes[consec_index] == 0) {
                /* Only required nodes, re-use consec record */
                consec_req[consec_index] = -1;
            } else {
                /* End last set, setup for start of next set */
                consec_end[consec_index] = i - 1;
                consec_req[++consec_index] = -1;
            }
        }
        if (node_ptr) {
            if (consec_nodes[consec_index] == 0)
                consec_start[consec_index] = i;
            if (required_node) {
                /*
                 * Required node, resources counters updated
                 * in above loop, leave bitmap set
                 */
                if (consec_req[consec_index] == -1) {
                    /* first required node in set */
                    consec_req[consec_index] = i;
                }
                continue;
            }
            /* node not selected (yet) */
            bit_clear(node_map, i);
            consec_cpus[consec_index] += avail_cpus;
            consec_nodes[consec_index]++;
            if (gres_per_job) {
                gres_plugin_job_sched_consec(
                    &consec_gres[consec_index],
                    job_ptr->gres_list,
                    avail_res_array[i]->sock_gres_list);
            }
            consec_weight[consec_index] = node_ptr->sched_weight;
        } else if (consec_nodes[consec_index] == 0) {
            /* Only required nodes, re-use consec record */
            consec_req[consec_index] = -1;
            consec_weight[consec_index] = NO_VAL64;
        } else {
            /* End last set, setup for start of next set */
            consec_end[consec_index] = i - 1;
            consec_req[++consec_index] = -1;
            consec_weight[consec_index] = NO_VAL64;
        }
    }
    if (consec_nodes[consec_index] != 0)
        consec_end[consec_index++] = i - 1;

    if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
        if (consec_index == 0) {
            info("%s: %s: consec_index is zero",
                 plugin_type, __func__);
        }
        for (i = 0; i < consec_index; i++) {
            char *gres_str = NULL, *gres_print = "";
            bitstr_t *host_bitmap;
            char *host_list;
            if (gres_per_job) {
                gres_str = gres_plugin_job_sched_str(
                    consec_gres[i], job_ptr->gres_list);
                if (gres_str) {
                    xstrcat(gres_str, " ");
                    gres_print = gres_str;
                }
            }
            host_bitmap = bit_alloc(select_node_cnt);
            bit_nset(host_bitmap, consec_start[i], consec_end[i]);
            host_list = bitmap2node_name(host_bitmap);
            info("%s: eval_nodes: set:%d consec "
                 "CPUs:%d nodes:%d:%s %sbegin:%d end:%d required:%d weight:%"PRIu64,
                 plugin_type, i, consec_cpus[i], consec_nodes[i],
                 host_list, gres_print, consec_start[i],
                 consec_end[i], consec_req[i], consec_weight[i]);
            bit_free(host_bitmap);
            xfree(gres_str);
            xfree(host_list);
        }
    }

    /* Reject the job if its required nodes already exceed max_cpus */
    if ((details_ptr->max_cpus != NO_VAL) &&
        (total_cpus > details_ptr->max_cpus)) {
        info("%s: %s: %pJ can't use required nodes due "
             "
to max CPU limit", plugin_type, __func__, job_ptr); goto fini; } /* * accumulate nodes from these sets of consecutive nodes until * sufficient resources have been accumulated */ while (consec_index && (max_nodes > 0)) { best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0; best_fit_req = -1; /* first required node, -1 if none */ for (i = 0; i < consec_index; i++) { if (consec_nodes[i] == 0) continue; /* no usable nodes here */ if (details_ptr->contiguous && details_ptr->req_node_bitmap && (consec_req[i] == -1)) continue; /* not required nodes */ sufficient = (consec_cpus[i] >= rem_cpus) && _enough_nodes(consec_nodes[i], rem_nodes, min_nodes, req_nodes); if (sufficient && gres_per_job) { sufficient = gres_plugin_job_sched_sufficient( job_ptr->gres_list, consec_gres[i]); } /* * if first possibility OR * contains required nodes OR * lowest node weight */ if ((best_fit_nodes == 0) || ((best_fit_req == -1) && (consec_req[i] != -1)) || (consec_weight[i] < best_weight)) new_best = true; else new_best = false; /* * If equal node weight * first set large enough for request OR * tightest fit (less resource/CPU waste) OR * nothing yet large enough, but this is biggest */ if (!new_best && (consec_weight[i] == best_weight) && ((sufficient && (best_fit_sufficient == 0)) || (sufficient && (consec_cpus[i] < best_fit_cpus)) || (!sufficient && (consec_cpus[i] > best_fit_cpus)))) new_best = true; /* * if first continuous node set large enough */ if (!new_best && !best_fit_sufficient && details_ptr->contiguous && sufficient) new_best = true; if (new_best) { best_fit_cpus = consec_cpus[i]; best_fit_nodes = consec_nodes[i]; best_fit_index = i; best_fit_req = consec_req[i]; best_fit_sufficient = sufficient; best_weight = consec_weight[i]; } if (details_ptr->contiguous && details_ptr->req_node_bitmap) { /* * Must wait for all required nodes to be * in a single consecutive block */ int j, other_blocks = 0; for (j = (i+1); j < consec_index; j++) { if (consec_req[j] != -1) { other_blocks = 1; break; } } if (other_blocks) { best_fit_nodes = 0; break; } } } if (best_fit_nodes == 0) break; if (details_ptr->contiguous && !best_fit_sufficient) break; /* no hole large enough */ if (best_fit_req != -1) { /* * This collection of nodes includes required ones * select nodes from this set, first working up * then down from the required nodes */ for (i = best_fit_req; i <= consec_end[best_fit_index]; i++) { if ((max_nodes == 0) || ((rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test( job_ptr->gres_list, job_ptr->job_id)))) break; if (bit_test(node_map, i)) { /* required node already in set */ continue; } if (avail_cpu_per_node[i] == 0) continue; avail_cpus = avail_cpu_per_node[i]; /* * This could result in 0, but if the user * requested nodes here we will still give * them and then the step layout will sort * things out. 
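                 * (Roughly: _cpus_to_use() holds back one allocation
                 * unit for each of the rem_nodes - 1 nodes still to be
                 * selected, so a node may be clamped to zero CPUs here
                 * even though it has cores free.)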
                 */
                _cpus_to_use(&avail_cpus, rem_max_cpus,
                             min_rem_nodes, details_ptr,
                             avail_res_array[i], i, cr_type);
                /* enforce the max_cpus limit */
                total_cpus += avail_cpus;
                if ((details_ptr->max_cpus != NO_VAL) &&
                    (total_cpus > details_ptr->max_cpus)) {
                    debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
                           plugin_type, __func__, job_ptr, i);
                    total_cpus -= avail_cpus;
                    continue;
                }
                bit_set(node_map, i);
                rem_nodes--;
                min_rem_nodes--;
                max_nodes--;
                rem_cpus -= avail_cpus;
                rem_max_cpus -= avail_cpus;
                if (gres_per_job) {
                    gres_plugin_job_sched_add(
                        job_ptr->gres_list,
                        avail_res_array[i]->sock_gres_list,
                        avail_cpus);
                }
            }
            for (i = (best_fit_req - 1);
                 i >= consec_start[best_fit_index]; i--) {
                if ((max_nodes == 0) ||
                    ((rem_nodes <= 0) && (rem_cpus <= 0) &&
                     (!gres_per_job ||
                      gres_plugin_job_sched_test(job_ptr->gres_list,
                                                 job_ptr->job_id))))
                    break;
                if (bit_test(node_map, i))
                    continue;
                if (avail_cpu_per_node[i] == 0)
                    continue;
                avail_cpus = avail_cpu_per_node[i];
                /*
                 * This could result in 0, but if the user
                 * requested nodes here we will still give
                 * them and then the step layout will sort
                 * things out.
                 */
                _cpus_to_use(&avail_cpus, rem_max_cpus,
                             min_rem_nodes, details_ptr,
                             avail_res_array[i], i, cr_type);
                total_cpus += avail_cpus;
                if ((details_ptr->max_cpus != NO_VAL) &&
                    (total_cpus > details_ptr->max_cpus)) {
                    debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
                           plugin_type, __func__, job_ptr, i);
                    total_cpus -= avail_cpus;
                    continue;
                }
                rem_cpus -= avail_cpus;
                rem_max_cpus -= avail_cpus;
                bit_set(node_map, i);
                rem_nodes--;
                min_rem_nodes--;
                max_nodes--;
                if (gres_per_job) {
                    gres_plugin_job_sched_add(
                        job_ptr->gres_list,
                        avail_res_array[i]->sock_gres_list,
                        avail_cpus);
                }
            }
        } else {
            /* No required nodes, try best fit single node */
            int best_fit = -1, best_size = 0;
            int first = consec_start[best_fit_index];
            int last  = consec_end[best_fit_index];
            if (rem_nodes <= 1) {
                for (i = first, j = 0; i <= last; i++, j++) {
                    if (bit_test(node_map, i) ||
                        !avail_res_array[i])
                        continue;
                    if (avail_cpu_per_node[i] < rem_cpus)
                        continue;
                    if (gres_per_job &&
                        !gres_plugin_job_sched_test2(
                            job_ptr->gres_list,
                            avail_res_array[i]->sock_gres_list,
                            job_ptr->job_id)) {
                        continue;
                    }
                    if ((best_fit == -1) ||
                        (avail_cpu_per_node[i] < best_size)) {
                        best_fit = i;
                        best_size = avail_cpu_per_node[i];
                        if (best_size == rem_cpus)
                            break;
                    }
                }
                /*
                 * If we found a single node to use,
                 * clear the available CPU counts of all
                 * other nodes in this set
                 */
                if (best_fit != -1) {
                    for (i = first; i <= last; i++) {
                        if (i == best_fit)
                            continue;
                        avail_cpu_per_node[i] = 0;
                    }
                }
            }
            for (i = first, j = 0; i <= last; i++, j++) {
                if ((max_nodes == 0) ||
                    ((rem_nodes <= 0) && (rem_cpus <= 0) &&
                     (!gres_per_job ||
                      gres_plugin_job_sched_test(job_ptr->gres_list,
                                                 job_ptr->job_id))))
                    break;
                if (bit_test(node_map, i) || !avail_res_array[i])
                    continue;
                avail_cpus = avail_cpu_per_node[i];
                if (avail_cpus <= 0)
                    continue;
                if ((max_nodes == 1) && (avail_cpus < rem_cpus)) {
                    /*
                     * Job can only take one more node and
                     * this one has insufficient CPU
                     */
                    continue;
                }
                /*
                 * This could result in 0, but if the user
                 * requested nodes here we will still give
                 * them and then the step layout will sort
                 * things out.
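                 * (With per-job GRES this loop also keeps adding
                 * nodes until gres_plugin_job_sched_test() reports
                 * the accumulated GRES are sufficient, not just
                 * until rem_nodes and rem_cpus are satisfied.)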
*/ _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); total_cpus += avail_cpus; if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { debug2("%s: %s: %pJ can't use node %d without exceeding job limit", plugin_type, __func__, job_ptr, i); total_cpus -= avail_cpus; continue; } rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; bit_set(node_map, i); rem_nodes--; min_rem_nodes--; max_nodes--; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[i]-> sock_gres_list, avail_cpus); } } } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; break; } consec_cpus[best_fit_index] = 0; consec_nodes[best_fit_index] = 0; } if (error_code && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id) && _enough_nodes(0, rem_nodes, min_nodes, req_nodes)) error_code = SLURM_SUCCESS; fini: xfree(avail_cpu_per_node); xfree(consec_cpus); xfree(consec_nodes); xfree(consec_start); xfree(consec_end); xfree(consec_req); xfree(consec_weight); if (gres_per_job) { for (i = 0; i < consec_size; i++) FREE_NULL_LIST(consec_gres[i]); xfree(consec_gres); } return error_code; } /* * A variation of _eval_nodes() to select resources using as many nodes as * possible. */ static int _eval_nodes_spread(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, bool first_pass) { int i, i_start, i_end, error_code = SLURM_ERROR; int rem_cpus, rem_nodes; /* remaining resources desired */ int min_rem_nodes; /* remaining resources desired */ int total_cpus = 0; /* #CPUs allocated to job */ int64_t rem_max_cpus; struct job_details *details_ptr = job_ptr->details; bitstr_t *req_map = details_ptr->req_node_bitmap; bitstr_t *orig_node_map = bit_copy(node_map); bool all_done = false, gres_per_job; uint16_t avail_cpus = 0; node_record_t *node_ptr; List node_weight_list = NULL; node_weight_type *nwt; ListIterator iter; bool enforce_binding = false; if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND)) enforce_binding = true; rem_cpus = details_ptr->min_cpus; rem_max_cpus = details_ptr->max_cpus; min_rem_nodes = min_nodes; if ((details_ptr->num_tasks != NO_VAL) && (details_ptr->num_tasks != 0)) max_nodes = MIN(max_nodes, details_ptr->num_tasks); if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) rem_nodes = MIN(min_nodes, req_nodes); else rem_nodes = MAX(min_nodes, req_nodes); i_start = bit_ffs(node_map); if (i_start >= 0) i_end = bit_fls(node_map); else i_end = i_start - 1; if (req_map) { for (i = i_start; i <= i_end; i++) { if (!bit_test(req_map, i)) { bit_clear(node_map, i); continue; } node_ptr = node_record_table_ptr + i; if (!bit_test(node_map, i)) { debug("%pJ required node %s not available", job_ptr, node_ptr->name); continue; } if (!avail_res_array[i] || !avail_res_array[i]->avail_cpus) { debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if ((avail_cpus > 0) && (max_nodes > 0)) { total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= 
avail_cpus; rem_nodes--; min_rem_nodes--; /* leaving bitmap set, decr max limit */ max_nodes--; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[i]-> sock_gres_list, avail_cpus); } } else { /* node not selected (yet) */ debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; bit_and(node_map, req_map); goto fini; } if (max_nodes <= 0) { error_code = SLURM_ERROR; goto fini; } bit_and_not(orig_node_map, node_map); } else { bit_clear_all(node_map); } /* Compute CPUs already allocated to required nodes */ if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { info("%pJ can't use required nodes due to max CPU limit", job_ptr); goto fini; } if (max_nodes == 0) all_done = true; node_weight_list = _build_node_weight_list(orig_node_map); iter = list_iterator_create(node_weight_list); while (!all_done && (nwt = (node_weight_type *) list_next(iter))) { for (i = i_start; i <= i_end; i++) { if (!avail_res_array[i] || !avail_res_array[i]->avail_cpus) continue; /* Node not available or already selected */ if (!bit_test(nwt->node_bitmap, i) || bit_test(node_map, i)) continue; _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if (avail_cpus == 0) continue; total_cpus += avail_cpus; if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { debug2("%s: %s: %pJ can't use node %d without exceeding job limit", plugin_type, __func__, job_ptr, i); total_cpus -= avail_cpus; continue; } rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; max_nodes--; bit_set(node_map, i); if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; all_done = true; break; } if (max_nodes == 0) { all_done = true; break; } } } list_iterator_destroy(iter); if (error_code == SLURM_SUCCESS) { /* Already succeeded */ } else if ((rem_cpus > 0) || (min_rem_nodes > 0) || !gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { bit_clear_all(node_map); error_code = SLURM_ERROR; } else { error_code = SLURM_SUCCESS; } fini: FREE_NULL_LIST(node_weight_list); bit_free(orig_node_map); return error_code; } /* * A variation of _eval_nodes() to select resources using busy nodes first. 
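 * Required nodes are satisfied first; the remaining selection then walks
 * each node-weight tier twice, taking currently busy nodes before idle
 * ones, so idle resources stay free for the backfill scheduler.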
*/ static int _eval_nodes_busy(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, bool first_pass) { int i, i_start, i_end, error_code = SLURM_ERROR; int idle_test; int rem_cpus, rem_nodes; /* remaining resources desired */ int min_rem_nodes; /* remaining resources desired */ int total_cpus = 0; /* #CPUs allocated to job */ int64_t rem_max_cpus; struct job_details *details_ptr = job_ptr->details; bitstr_t *req_map = details_ptr->req_node_bitmap; bitstr_t *orig_node_map = bit_copy(node_map); bool all_done = false, gres_per_job; uint16_t avail_cpus = 0; node_record_t *node_ptr; List node_weight_list = NULL; node_weight_type *nwt; ListIterator iter; bool enforce_binding = false; if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND)) enforce_binding = true; rem_cpus = details_ptr->min_cpus; rem_max_cpus = details_ptr->max_cpus; min_rem_nodes = min_nodes; if ((details_ptr->num_tasks != NO_VAL) && (details_ptr->num_tasks != 0)) max_nodes = MIN(max_nodes, details_ptr->num_tasks); if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) rem_nodes = MIN(min_nodes, req_nodes); else rem_nodes = MAX(min_nodes, req_nodes); i_start = bit_ffs(node_map); if (i_start >= 0) i_end = bit_fls(node_map); else i_end = i_start - 1; if (req_map) { for (i = i_start; i <= i_end; i++) { if (!bit_test(req_map, i)) { bit_clear(node_map, i); continue; } node_ptr = node_record_table_ptr + i; if (!bit_test(node_map, i)) { debug("%pJ required node %s not available", job_ptr, node_ptr->name); continue; } if (!avail_res_array[i] || !avail_res_array[i]->avail_cpus) { debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if ((avail_cpus > 0) && (max_nodes > 0)) { total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; /* leaving bitmap set, decr max limit */ if (max_nodes) max_nodes--; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[i]-> sock_gres_list, avail_cpus); } } else { /* node not selected (yet) */ debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; bit_and(node_map, req_map); goto fini; } if (max_nodes <= 0) { error_code = SLURM_ERROR; goto fini; } bit_and_not(orig_node_map, node_map); } else { bit_clear_all(node_map); } /* Compute CPUs already allocated to required nodes */ if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { info("%pJ can't use required nodes due to max CPU limit", job_ptr); goto fini; } /* * Start by using nodes that already have a job running. * Then try to use idle nodes. 
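     * The idle_test loop below implements this ordering: pass 0 skips
     * nodes set in idle_node_bitmap (i.e. it considers only busy nodes)
     * and pass 1 skips nodes not set there (only idle nodes).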
*/ if (max_nodes == 0) all_done = true; node_weight_list = _build_node_weight_list(orig_node_map); iter = list_iterator_create(node_weight_list); while (!all_done && (nwt = (node_weight_type *) list_next(iter))) { for (idle_test = 0; idle_test < 2; idle_test++) { for (i = i_start; i <= i_end; i++) { if (!avail_res_array[i] || !avail_res_array[i]->avail_cpus) continue; /* Node not available or already selected */ if (!bit_test(nwt->node_bitmap, i) || bit_test(node_map, i)) continue; if (((idle_test == 0) && bit_test(idle_node_bitmap, i)) || ((idle_test == 1) && !bit_test(idle_node_bitmap, i))) continue; _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if (avail_cpus == 0) continue; total_cpus += avail_cpus; if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { debug2("%s: %s: %pJ can't use node %d without exceeding job limit", plugin_type, __func__, job_ptr, i); total_cpus -= avail_cpus; continue; } rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; max_nodes--; bit_set(node_map, i); if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[i]-> sock_gres_list, avail_cpus); } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test( job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; all_done = true; break; } if (max_nodes == 0) { all_done = true; break; } } } } list_iterator_destroy(iter); if (error_code == SLURM_SUCCESS) { /* Already succeeded */ } else if ((rem_cpus > 0) || (min_rem_nodes > 0) || !gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { bit_clear_all(node_map); error_code = SLURM_ERROR; } else { error_code = SLURM_SUCCESS; } fini: FREE_NULL_LIST(node_weight_list); bit_free(orig_node_map); return error_code; } static int _topo_weight_find(void *x, void *key) { topo_weight_info_t *nw = (topo_weight_info_t *) x; topo_weight_info_t *nw_key = (topo_weight_info_t *) key; if (nw->weight == nw_key->weight) return 1; return 0; } static int _topo_node_find(void *x, void *key) { topo_weight_info_t *nw = (topo_weight_info_t *) x; bitstr_t *nw_key = (bitstr_t *) key; if (bit_overlap_any(nw->node_bitmap, nw_key)) return 1; return 0; } static void _topo_weight_free(void *x) { topo_weight_info_t *nw = (topo_weight_info_t *) x; FREE_NULL_BITMAP(nw->node_bitmap); xfree(nw); } static int _topo_weight_log(void *x, void *arg) { topo_weight_info_t *nw = (topo_weight_info_t *) x; char *node_names = bitmap2node_name(nw->node_bitmap); info("%s: Topo:%s weight:%"PRIu64, __func__, node_names, nw->weight); xfree(node_names); return 0; } static int _topo_weight_sort(void *x, void *y) { topo_weight_info_t *nwt1 = *(topo_weight_info_t **) x; topo_weight_info_t *nwt2 = *(topo_weight_info_t **) y; return (int) (nwt1->weight - nwt2->weight); } /* * Allocate resources to the job on one leaf switch if possible, * otherwise distribute the job allocation over many leaf switches. 
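 * For dragonfly networks a requested leaf switch count above one is assumed
 * to offer no locality benefit, so it is reset to zero at the top of this
 * function.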
*/ static int _eval_nodes_dfly(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, bool first_pass) { int *switch_cpu_cnt = NULL; /* total CPUs on switch */ List *switch_gres = NULL; /* available GRES on switch */ bitstr_t **switch_node_bitmap = NULL; /* nodes on this switch */ int *switch_node_cnt = NULL; /* total nodes on switch */ int *switch_required = NULL; /* set if has required node */ bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */ bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */ bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */ bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */ int i, i_first, i_last, j, rc = SLURM_SUCCESS; int best_cpu_cnt = 0, best_node_cnt = 0, req_node_cnt = 0; List best_gres = NULL; switch_record_t *switch_ptr; List node_weight_list = NULL; topo_weight_info_t *nw = NULL; ListIterator iter; node_record_t *node_ptr; uint16_t avail_cpus = 0; int64_t rem_max_cpus; int rem_cpus, rem_nodes; /* remaining resources desired */ int min_rem_nodes; /* remaining resources desired */ int total_cpus = 0; /* #CPUs allocated to job */ bool enforce_binding = false; struct job_details *details_ptr = job_ptr->details; bool gres_per_job, sufficient = false; uint16_t *avail_cpu_per_node = NULL; time_t time_waiting = 0; int leaf_switch_count = 0; int top_switch_inx = -1; int prev_rem_nodes; if (job_ptr->req_switch > 1) { /* Maximum leaf switch count >1 probably makes no sense */ info("%s: Resetting %pJ leaf switch count from %u to 0", __func__, job_ptr, job_ptr->req_switch); job_ptr->req_switch = 0; } if (job_ptr->req_switch) { time_t time_now; time_now = time(NULL); if (job_ptr->wait4switch_start == 0) job_ptr->wait4switch_start = time_now; time_waiting = time_now - job_ptr->wait4switch_start; } if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND)) enforce_binding = true; rem_cpus = details_ptr->min_cpus; rem_max_cpus = details_ptr->max_cpus; min_rem_nodes = min_nodes; if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) rem_nodes = MIN(min_nodes, req_nodes); else rem_nodes = MAX(min_nodes, req_nodes); /* Validate availability of required nodes */ if (job_ptr->details->req_node_bitmap) { if (!bit_super_set(job_ptr->details->req_node_bitmap, node_map)) { info("%s: %s: %pJ requires nodes which are not currently available", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap); if (req_node_cnt == 0) { info("%s: %s: %pJ required node list has no nodes", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } if (req_node_cnt > max_nodes) { info("%s: %s: %pJ requires more nodes than currently available (%u>%u)", plugin_type, __func__, job_ptr, req_node_cnt, max_nodes); rc = SLURM_ERROR; goto fini; } req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap); } /* * Add required nodes to job allocation and * build list of node bitmaps, sorted by weight */ i_first = bit_ffs(node_map); if (i_first == -1) { debug("%s: %s: %pJ node_map is empty", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } i_last = bit_fls(node_map); avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt); node_weight_list = list_create(_topo_weight_free); for (i = i_first; i <= i_last; i++) { topo_weight_info_t nw_static; if (!bit_test(node_map, i)) 
continue; if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) { _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if (avail_cpus == 0) { debug2("%s: %s: %pJ insufficient resources on required node", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } avail_cpu_per_node[i] = avail_cpus; rem_nodes--; min_rem_nodes--; max_nodes--; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } } node_ptr = node_record_table_ptr + i; nw_static.weight = node_ptr->sched_weight; nw = list_find_first(node_weight_list, _topo_weight_find, &nw_static); if (!nw) { /* New node weight to add */ nw = xmalloc(sizeof(topo_weight_info_t)); nw->node_bitmap = bit_alloc(select_node_cnt); nw->weight = node_ptr->sched_weight; list_append(node_weight_list, nw); } bit_set(nw->node_bitmap, i); nw->node_cnt++; } if (req_nodes_bitmap) { bit_and(node_map, req_nodes_bitmap); if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { /* Required nodes completely satisfied the request */ rc = SLURM_SUCCESS; goto fini; } if (max_nodes <= 0) { rc = SLURM_ERROR; info("%s: %s: %pJ requires nodes exceed maximum node limit", plugin_type, __func__, job_ptr); goto fini; } } else { bit_clear_all(node_map); } list_sort(node_weight_list, _topo_weight_sort); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) (void) list_for_each(node_weight_list, _topo_weight_log, NULL); /* * Identify the highest level switch to be used. * Note that nodes can be on multiple non-overlapping switches. 
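     * The top-level switch selected below is the highest-level switch
     * overlapping the required nodes (a later check verifies it reaches
     * them all) or, with no required nodes, the highest-level switch
     * spanning a usable node-weight tier.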
     */
    switch_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt);
    switch_gres = xmalloc(sizeof(List) * switch_record_cnt);
    switch_node_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt);
    switch_node_cnt = xmalloc(sizeof(int) * switch_record_cnt);
    switch_required = xmalloc(sizeof(int) * switch_record_cnt);
    if (!req_nodes_bitmap)
        nw = list_peek(node_weight_list);
    for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt;
         i++, switch_ptr++) {
        switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap);
        if (req_nodes_bitmap &&
            bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) {
            switch_required[i] = 1;
            if (switch_record_table[i].level == 0) {
                leaf_switch_count++;
            }
            if ((top_switch_inx == -1) ||
                (switch_record_table[i].level >
                 switch_record_table[top_switch_inx].level)) {
                top_switch_inx = i;
            }
        }
        if (!req_nodes_bitmap &&
            (list_find_first(node_weight_list, _topo_node_find,
                             switch_node_bitmap[i]))) {
            if ((top_switch_inx == -1) ||
                (switch_record_table[i].level >
                 switch_record_table[top_switch_inx].level)) {
                top_switch_inx = i;
            }
        }
    }

    /*
     * Top switch is the highest level switch containing all required nodes
     * OR all nodes of the lowest scheduling weight
     * OR -1 if we cannot identify the top-level switch
     */
    if (top_switch_inx == -1) {
        error("%s: %s: %pJ unable to identify top level switch",
              plugin_type, __func__, job_ptr);
        rc = SLURM_ERROR;
        goto fini;
    }

    /* Check that all specifically required nodes are on shared network */
    if (req_nodes_bitmap &&
        !bit_super_set(req_nodes_bitmap,
                       switch_node_bitmap[top_switch_inx])) {
        rc = SLURM_ERROR;
        info("%s: %s: %pJ requires nodes that do not have shared network",
             plugin_type, __func__, job_ptr);
        goto fini;
    }

    /*
     * Remove nodes from consideration that cannot be reached from this
     * top level switch
     */
    for (i = 0; i < switch_record_cnt; i++) {
        if (top_switch_inx != i) {
            bit_and(switch_node_bitmap[i],
                    switch_node_bitmap[top_switch_inx]);
        }
    }

    /*
     * Identify the best set of nodes (i.e. nodes with the lowest weight,
     * in addition to the required nodes) that can be used to satisfy the
     * job request. All nodes must be on a common top-level switch. The
     * logic here adds groups of nodes, all with the same weight, so we
     * usually identify more nodes than required to satisfy the request.
     * Later logic selects from those nodes to get the best topology.
     */
    best_nodes_bitmap = bit_alloc(select_node_cnt);
    iter = list_iterator_create(node_weight_list);
    while (!sufficient && (nw = list_next(iter))) {
        if (best_node_cnt > 0) {
            /*
             * All of the lower priority nodes should be included
             * in the job's allocation. Nodes from the next higher
             * weight tier are included only as needed.
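             * (Hypothetical example: if the weight-10 tier alone can
             * satisfy the request, no weight-20 node is marked;
             * otherwise every weight-10 node becomes mandatory via
             * req2_nodes_bitmap and weight-20 nodes are added until
             * the request can be met.)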
*/ if (req2_nodes_bitmap) bit_or(req2_nodes_bitmap, best_nodes_bitmap); else req2_nodes_bitmap = bit_copy(best_nodes_bitmap); } i_first = bit_ffs(nw->node_bitmap); if (i_first == -1) continue; i_last = bit_fls(nw->node_bitmap); for (i = i_first; i <= i_last; i++) { if (avail_cpu_per_node[i]) continue; /* Required node */ if (!bit_test(nw->node_bitmap, i) || !bit_test(switch_node_bitmap[top_switch_inx], i)) continue; _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); if (avail_cpus == 0) { bit_clear(nw->node_bitmap, i); continue; } bit_set(best_nodes_bitmap, i); avail_cpu_per_node[i] = avail_cpus; best_cpu_cnt += avail_cpus; best_node_cnt++; if (gres_per_job) { gres_plugin_job_sched_consec( &best_gres, job_ptr->gres_list, avail_res_array[i]->sock_gres_list); } } sufficient = (best_cpu_cnt >= rem_cpus) && _enough_nodes(best_node_cnt, rem_nodes, min_nodes, req_nodes); if (sufficient && gres_per_job) { sufficient = gres_plugin_job_sched_sufficient( job_ptr->gres_list, best_gres); } } list_iterator_destroy(iter); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { char *gres_str = NULL, *gres_print = ""; char *node_names; if (req_nodes_bitmap) { node_names = bitmap2node_name(req_nodes_bitmap); info("%s: Required nodes:%s", __func__, node_names); xfree(node_names); } node_names = bitmap2node_name(best_nodes_bitmap); if (gres_per_job) { gres_str = gres_plugin_job_sched_str(best_gres, job_ptr->gres_list); if (gres_str) gres_print = gres_str; } info("%s: Best nodes:%s node_cnt:%d cpu_cnt:%d %s", __func__, node_names, best_node_cnt, best_cpu_cnt, gres_print); xfree(node_names); xfree(gres_str); } if (!sufficient) { info("%s: %s: insufficient resources currently available for %pJ", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } /* * Add lowest weight nodes. Treat similar to required nodes for the job. * Job will still need to add some higher weight nodes later. */ if (req2_nodes_bitmap) { i_first = bit_ffs(req2_nodes_bitmap); if (i_first >= 0) i_last = bit_fls(req2_nodes_bitmap); else i_last = -2; for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) { if (!bit_test(req2_nodes_bitmap, i)) continue; rem_nodes--; min_rem_nodes--; max_nodes--; avail_cpus = avail_cpu_per_node[i]; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } } for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt; i++, switch_ptr++) { if (switch_required[i]) continue; if (bit_overlap_any(req2_nodes_bitmap, switch_node_bitmap[i])) { switch_required[i] = 1; if (switch_record_table[i].level == 0) { leaf_switch_count++; } } } bit_or(node_map, req2_nodes_bitmap); if (max_nodes <= 0) { rc = SLURM_ERROR; info("%s: %s: %pJ reached maximum node limit", plugin_type, __func__, job_ptr); goto fini; } if ((rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id))) { /* Required nodes completely satisfied the request */ error("%s: Scheduling anomaly for %pJ", __func__, job_ptr); rc = SLURM_SUCCESS; goto fini; } } /* * Construct a set of switch array entries. * Use the same indexes as switch_record_table in slurmctld. 
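     * After this loop switch_node_bitmap[i] holds only the best/required
     * nodes reachable through switch i, and switch_node_cnt[i] caches
     * the size of that set.
     */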
*/ bit_or(best_nodes_bitmap, node_map); avail_nodes_bitmap = bit_alloc(node_record_count); for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt; i++, switch_ptr++) { bit_and(switch_node_bitmap[i], best_nodes_bitmap); bit_or(avail_nodes_bitmap, switch_node_bitmap[i]); switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]); } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { for (i = 0; i < switch_record_cnt; i++) { char *node_names = NULL; if (switch_node_cnt[i]) { node_names = bitmap2node_name(switch_node_bitmap[i]); } info("switch=%s level=%d nodes=%u:%s required:%u speed:%u", switch_record_table[i].name, switch_record_table[i].level, switch_node_cnt[i], node_names, switch_required[i], switch_record_table[i].link_speed); xfree(node_names); } } /* count up leaf switches */ if (!req_nodes_bitmap) { for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt; i++, switch_ptr++) { if (switch_record_table[i].level != 0) continue; if (bit_overlap_any(switch_node_bitmap[i], best_nodes_bitmap)) leaf_switch_count++; } } if (req_nodes_bitmap && (!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) { info("%s: %s: %pJ requires nodes not available on any switch", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } /* * If no resources have yet been selected, * then pick one leaf switch with the most available nodes. */ if (leaf_switch_count == 0) { int best_switch_inx = -1; for (i = 0; i < switch_record_cnt; i++) { if (switch_record_table[i].level != 0) continue; if ((best_switch_inx == -1) || (switch_node_cnt[i] > switch_node_cnt[best_switch_inx])) best_switch_inx = i; } if (best_switch_inx != -1) { leaf_switch_count = 1; switch_required[best_switch_inx] = 1; } } /* * All required resources currently on one leaf switch. Determine if * the entire job request can be satisfied using just that one switch. 
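     * The leaf's nodes are scanned once to total their available CPUs
     * and GRES; only if those totals cover rem_cpus, rem_nodes and any
     * per-job GRES is the allocation completed on that single switch.
     */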
*/ if (leaf_switch_count == 1) { best_cpu_cnt = 0; best_node_cnt = 0; FREE_NULL_LIST(best_gres); for (i = 0; i < switch_record_cnt; i++) { if (!switch_required[i] || !switch_node_bitmap[i] || (switch_record_table[i].level != 0)) continue; i_first = bit_ffs(switch_node_bitmap[i]); if (i_first >= 0) i_last = bit_fls(switch_node_bitmap[i]); else i_last = -2; for (j = i_first; j <= i_last; j++) { if (!bit_test(switch_node_bitmap[i], j) || bit_test(node_map, j) || !avail_cpu_per_node[j]) continue; avail_cpus = avail_cpu_per_node[j]; best_cpu_cnt += avail_cpus; best_node_cnt++; if (gres_per_job) { gres_plugin_job_sched_consec( &best_gres, job_ptr->gres_list, avail_res_array[j]->sock_gres_list); } } break; } sufficient = (best_cpu_cnt >= rem_cpus) && _enough_nodes(best_node_cnt, rem_nodes, min_nodes, req_nodes); if (sufficient && gres_per_job) { sufficient = gres_plugin_job_sched_sufficient( job_ptr->gres_list, best_gres); } if (sufficient && (i < switch_record_cnt)) { /* Complete request using this one leaf switch */ for (j = i_first; j <= i_last; j++) { if (!bit_test(switch_node_bitmap[i], j) || bit_test(node_map, j) || !avail_cpu_per_node[j]) continue; avail_cpus = avail_cpu_per_node[j]; rem_nodes--; min_rem_nodes--; max_nodes--; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[j]-> sock_gres_list, avail_cpus); } bit_set(node_map, j); if ((rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test( job_ptr->gres_list, job_ptr->job_id))) { rc = SLURM_SUCCESS; goto fini; } if (max_nodes <= 0) { rc = SLURM_ERROR; info("%s: %s: %pJ reached maximum node limit", plugin_type, __func__, job_ptr); goto fini; } } } } if (job_ptr->req_switch > 0) { if (time_waiting >= job_ptr->wait4switch) { job_ptr->best_switch = true; debug3("%pJ waited %ld sec for switches use=%d", job_ptr, time_waiting, leaf_switch_count); } else if (leaf_switch_count > job_ptr->req_switch) { /* * Allocation is for more than requested number of * switches. 
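             * Rather than accept the extra switches immediately, keep
             * the job pending (best_switch = false) until it has
             * waited up to wait4switch seconds for a more compact
             * placement.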
*/ job_ptr->best_switch = false; debug3("%pJ waited %ld sec for switches=%u found=%d wait %u", job_ptr, time_waiting, job_ptr->req_switch, leaf_switch_count, job_ptr->wait4switch); } else { job_ptr->best_switch = true; } } /* * Add additional resources as required from additional leaf switches * on a round-robin basis */ prev_rem_nodes = rem_nodes + 1; while (1) { if (prev_rem_nodes == rem_nodes) break; /* Stalled */ prev_rem_nodes = rem_nodes; for (i = 0; i < switch_record_cnt; i++) { if (!switch_node_bitmap[i] || (switch_record_table[i].level != 0)) continue; i_first = bit_ffs(switch_node_bitmap[i]); if (i_first >= 0) i_last = bit_fls(switch_node_bitmap[i]); else i_last = -2; for (j = i_first; j <= i_last; j++) { if (!bit_test(switch_node_bitmap[i], j) || bit_test(node_map, j) || !avail_cpu_per_node[j]) continue; avail_cpus = avail_cpu_per_node[j]; rem_nodes--; min_rem_nodes--; max_nodes--; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[j]-> sock_gres_list, avail_cpus); } bit_set(node_map, j); if ((rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test( job_ptr->gres_list, job_ptr->job_id))) { rc = SLURM_SUCCESS; goto fini; } if (max_nodes <= 0) { rc = SLURM_ERROR; info("%s: %s: %pJ reached maximum node limit", plugin_type, __func__, job_ptr); goto fini; } break; /* Move to next switch */ } } } if ((min_rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id))) { rc = SLURM_SUCCESS; goto fini; } rc = SLURM_ERROR; fini: FREE_NULL_LIST(best_gres); FREE_NULL_LIST(node_weight_list); FREE_NULL_BITMAP(avail_nodes_bitmap); FREE_NULL_BITMAP(req_nodes_bitmap); FREE_NULL_BITMAP(req2_nodes_bitmap); FREE_NULL_BITMAP(best_nodes_bitmap); xfree(avail_cpu_per_node); xfree(switch_cpu_cnt); xfree(switch_gres); if (switch_node_bitmap) { for (i = 0; i < switch_record_cnt; i++) FREE_NULL_BITMAP(switch_node_bitmap[i]); xfree(switch_node_bitmap); } xfree(switch_node_cnt); xfree(switch_required); return rc; } /* Allocate resources to job using a minimal leaf switch count */ static int _eval_nodes_topo(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, bool first_pass) { int *switch_cpu_cnt = NULL; /* total CPUs on switch */ List *switch_gres = NULL; /* available GRES on switch */ bitstr_t **switch_node_bitmap = NULL; /* nodes on this switch */ int *switch_node_cnt = NULL; /* total nodes on switch */ int *switch_required = NULL; /* set if has required node */ bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */ bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */ bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */ bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */ int i, i_first, i_last, j, rc = SLURM_SUCCESS; int best_cpu_cnt = 0, best_node_cnt = 0, req_node_cnt = 0; List best_gres = NULL; switch_record_t *switch_ptr; List node_weight_list = NULL; topo_weight_info_t *nw = NULL; ListIterator iter; node_record_t *node_ptr; uint16_t avail_cpus = 0; int64_t rem_max_cpus; int rem_cpus, rem_nodes; /* remaining resources desired */ int min_rem_nodes; /* remaining resources desired */ int total_cpus = 0; /* #CPUs allocated to job */ bool enforce_binding = false; struct job_details *details_ptr = 
job_ptr->details; bool gres_per_job, sufficient = false; uint16_t *avail_cpu_per_node = NULL; time_t time_waiting = 0; int leaf_switch_count = 0; int top_switch_inx = -1; int prev_rem_nodes; if (job_ptr->req_switch) { time_t time_now; time_now = time(NULL); if (job_ptr->wait4switch_start == 0) job_ptr->wait4switch_start = time_now; time_waiting = time_now - job_ptr->wait4switch_start; } if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND)) enforce_binding = true; rem_cpus = details_ptr->min_cpus; rem_max_cpus = details_ptr->max_cpus; min_rem_nodes = min_nodes; if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) rem_nodes = MIN(min_nodes, req_nodes); else rem_nodes = MAX(min_nodes, req_nodes); /* Validate availability of required nodes */ if (job_ptr->details->req_node_bitmap) { if (!bit_super_set(job_ptr->details->req_node_bitmap, node_map)) { info("%s: %s: %pJ requires nodes which are not currently available", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap); if (req_node_cnt == 0) { info("%s: %s: %pJ required node list has no nodes", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } if (req_node_cnt > max_nodes) { info("%s: %s: %pJ requires more nodes than currently available (%u>%u)", plugin_type, __func__, job_ptr, req_node_cnt, max_nodes); rc = SLURM_ERROR; goto fini; } req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap); } /* * Add required nodes to job allocation and * build list of node bitmaps, sorted by weight */ i_first = bit_ffs(node_map); if (i_first == -1) { debug("%s: %s: %pJ node_map is empty", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } i_last = bit_fls(node_map); avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt); node_weight_list = list_create(_topo_weight_free); for (i = i_first; i <= i_last; i++) { topo_weight_info_t nw_static; if (!bit_test(node_map, i)) continue; if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) { _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if (avail_cpus == 0) { debug2("%s: %s: %pJ insufficient resources on required node", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } avail_cpu_per_node[i] = avail_cpus; rem_nodes--; min_rem_nodes--; max_nodes--; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } } node_ptr = node_record_table_ptr + i; nw_static.weight = node_ptr->sched_weight; nw = list_find_first(node_weight_list, _topo_weight_find, &nw_static); if (!nw) { /* New node weight to add */ nw = xmalloc(sizeof(topo_weight_info_t)); nw->node_bitmap = bit_alloc(select_node_cnt); nw->weight = node_ptr->sched_weight; list_append(node_weight_list, nw); } bit_set(nw->node_bitmap, i); nw->node_cnt++; } list_sort(node_weight_list, _topo_weight_sort); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) (void) list_for_each(node_weight_list, _topo_weight_log, NULL); /* * Identify the highest level switch to be used. * Note that nodes can be on multiple non-overlapping switches. 
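* The scan below intersects each switch's node bitmap with node_map, flags switches holding required nodes, and records in top_switch_inx the highest-level switch able to hold the entire job.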
*/ switch_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt); switch_gres = xmalloc(sizeof(List) * switch_record_cnt); switch_node_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt); switch_node_cnt = xmalloc(sizeof(int) * switch_record_cnt); switch_required = xmalloc(sizeof(int) * switch_record_cnt); for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt; i++, switch_ptr++) { switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap); bit_and(switch_node_bitmap[i], node_map); switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]); if (req_nodes_bitmap && bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) { switch_required[i] = 1; if (switch_record_table[i].level == 0) { leaf_switch_count++; } if ((top_switch_inx == -1) || (switch_record_table[i].level > switch_record_table[top_switch_inx].level)) { top_switch_inx = i; } } if (!_enough_nodes(switch_node_cnt[i], rem_nodes, min_nodes, req_nodes)) continue; if (!req_nodes_bitmap && (list_find_first(node_weight_list, _topo_node_find, switch_node_bitmap[i]))) { if ((top_switch_inx == -1) || (switch_record_table[i].level > switch_record_table[top_switch_inx].level)) { top_switch_inx = i; } } } if (!req_nodes_bitmap) { bit_clear_all(node_map); } /* * Top switch is highest level switch containing all required nodes * OR all nodes of the lowest scheduling weight * OR -1 if we cannot identify a top-level switch, which may be due to a * disjoint topology and available nodes living on different switches. */ if (top_switch_inx == -1) { log_flag(SELECT_TYPE, "%s: %s: %pJ unable to identify top level switch", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } /* Check that all specifically required nodes are on a shared network */ if (req_nodes_bitmap && !bit_super_set(req_nodes_bitmap, switch_node_bitmap[top_switch_inx])) { rc = SLURM_ERROR; info("%s: %s: %pJ requires nodes that do not have shared network", plugin_type, __func__, job_ptr); goto fini; } /* * Remove nodes from consideration that cannot be reached from this * top level switch. */ for (i = 0; i < switch_record_cnt; i++) { if (top_switch_inx != i) { bit_and(switch_node_bitmap[i], switch_node_bitmap[top_switch_inx]); } } if (req_nodes_bitmap) { bit_and(node_map, req_nodes_bitmap); if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { /* Required nodes completely satisfied the request */ rc = SLURM_SUCCESS; goto fini; } if (max_nodes <= 0) { rc = SLURM_ERROR; info("%s: %s: %pJ required nodes exceed maximum node limit", plugin_type, __func__, job_ptr); goto fini; } } /* * Identify the best set of nodes (i.e. nodes with the lowest weight, * in addition to the required nodes) that can be used to satisfy the * job request. All nodes must be on a common top-level switch. The * logic here adds groups of nodes, all with the same weight, so we * usually identify more nodes than required to satisfy the request. * Later logic selects from those nodes to get the best topology. */ best_nodes_bitmap = bit_alloc(select_node_cnt); iter = list_iterator_create(node_weight_list); while (!sufficient && (nw = list_next(iter))) { if (best_node_cnt > 0) { /* * All of the lower priority nodes should be included * in the job's allocation. Nodes of the next highest * weight are included only as needed.
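* To that end, req2_nodes_bitmap accumulates every node from the weight tiers already scanned; only nodes in the current tier remain optional.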
*/ if (req2_nodes_bitmap) bit_or(req2_nodes_bitmap, best_nodes_bitmap); else req2_nodes_bitmap = bit_copy(best_nodes_bitmap); } i_first = bit_ffs(nw->node_bitmap); if (i_first == -1) continue; i_last = bit_fls(nw->node_bitmap); for (i = i_first; i <= i_last; i++) { if (avail_cpu_per_node[i]) continue; /* Required node */ if (!bit_test(nw->node_bitmap, i) || !bit_test(switch_node_bitmap[top_switch_inx], i)) continue; _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); if (avail_cpus == 0) { bit_clear(nw->node_bitmap, i); continue; } bit_set(best_nodes_bitmap, i); avail_cpu_per_node[i] = avail_cpus; best_cpu_cnt += avail_cpus; best_node_cnt++; if (gres_per_job) { gres_plugin_job_sched_consec( &best_gres, job_ptr->gres_list, avail_res_array[i]->sock_gres_list); } } sufficient = (best_cpu_cnt >= rem_cpus) && _enough_nodes(best_node_cnt, rem_nodes, min_nodes, req_nodes); if (sufficient && gres_per_job) { sufficient = gres_plugin_job_sched_sufficient( job_ptr->gres_list, best_gres); } } list_iterator_destroy(iter); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { char *gres_str = NULL, *gres_print = ""; char *node_names; if (req_nodes_bitmap) { node_names = bitmap2node_name(req_nodes_bitmap); info("%s: Required nodes:%s", __func__, node_names); xfree(node_names); } node_names = bitmap2node_name(best_nodes_bitmap); if (gres_per_job) { gres_str = gres_plugin_job_sched_str(best_gres, job_ptr->gres_list); if (gres_str) gres_print = gres_str; } info("%s: Best nodes:%s node_cnt:%d cpu_cnt:%d %s", __func__, node_names, best_node_cnt, best_cpu_cnt, gres_print); xfree(node_names); xfree(gres_str); } if (!sufficient) { info("%s: %s: insufficient resources currently available for %pJ", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } /* * Add lowest weight nodes. Treat them like required nodes for the job. * Job will still need to add some higher weight nodes later. */ if (req2_nodes_bitmap) { i_first = bit_ffs(req2_nodes_bitmap); if (i_first >= 0) i_last = bit_fls(req2_nodes_bitmap); else i_last = -2; for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) { if (!bit_test(req2_nodes_bitmap, i)) continue; rem_nodes--; min_rem_nodes--; max_nodes--; avail_cpus = avail_cpu_per_node[i]; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } } for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt; i++, switch_ptr++) { if (switch_required[i]) continue; if (bit_overlap_any(req2_nodes_bitmap, switch_node_bitmap[i])) { switch_required[i] = 1; if (switch_record_table[i].level == 0) { leaf_switch_count++; } } } bit_or(node_map, req2_nodes_bitmap); if ((rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id))) { /* Required nodes completely satisfied the request */ error("%s: Scheduling anomaly for %pJ", __func__, job_ptr); rc = SLURM_SUCCESS; goto fini; } if (max_nodes <= 0) { rc = SLURM_ERROR; info("%s: %s: %pJ reached maximum node limit", plugin_type, __func__, job_ptr); goto fini; } } /* * Construct a set of switch array entries. * Use the same indexes as switch_record_table in slurmctld.
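* Each switch_node_bitmap[i] is pruned to the candidate nodes selected above and switch_node_cnt[i] is recomputed from the pruned bitmap.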
*/ bit_or(best_nodes_bitmap, node_map); avail_nodes_bitmap = bit_alloc(node_record_count); for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt; i++, switch_ptr++) { bit_and(switch_node_bitmap[i], best_nodes_bitmap); bit_or(avail_nodes_bitmap, switch_node_bitmap[i]); switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]); } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { for (i = 0; i < switch_record_cnt; i++) { char *node_names = NULL; if (switch_node_cnt[i]) { node_names = bitmap2node_name(switch_node_bitmap[i]); } info("switch=%s level=%d nodes=%u:%s required:%u speed:%u", switch_record_table[i].name, switch_record_table[i].level, switch_node_cnt[i], node_names, switch_required[i], switch_record_table[i].link_speed); xfree(node_names); } } /* Count up leaf switches. */ if (!req_nodes_bitmap) { for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt; i++, switch_ptr++) { if (switch_record_table[i].level != 0) continue; if (bit_overlap_any(switch_node_bitmap[i], best_nodes_bitmap)) leaf_switch_count++; } } if (req_nodes_bitmap && (!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) { info("%s: %s: %pJ requires nodes not available on any switch", plugin_type, __func__, job_ptr); rc = SLURM_ERROR; goto fini; } /* Add additional resources for already required leaf switches */ if (leaf_switch_count) { for (i = 0; i < switch_record_cnt; i++) { if (!switch_required[i] || !switch_node_bitmap[i] || (switch_record_table[i].level != 0)) continue; i_first = bit_ffs(switch_node_bitmap[i]); if (i_first >= 0) i_last = bit_fls(switch_node_bitmap[i]); else i_last = -2; for (j = i_first; j <= i_last; j++) { if (!bit_test(switch_node_bitmap[i], j) || bit_test(node_map, j) || !avail_cpu_per_node[j]) continue; avail_cpus = avail_cpu_per_node[j]; rem_nodes--; min_rem_nodes--; max_nodes--; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[j]-> sock_gres_list, avail_cpus); } bit_set(node_map, j); if ((rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test( job_ptr->gres_list, job_ptr->job_id))) { rc = SLURM_SUCCESS; goto fini; } } } } if (job_ptr->req_switch > 0) { if (time_waiting >= job_ptr->wait4switch) { job_ptr->best_switch = true; debug3("%pJ waited %ld sec for switches use=%d", job_ptr, time_waiting, leaf_switch_count); } else if (leaf_switch_count > job_ptr->req_switch) { /* * Allocation is for more than requested number of * switches. */ job_ptr->best_switch = false; debug3("%pJ waited %ld sec for switches=%u found=%d wait %u", job_ptr, time_waiting, job_ptr->req_switch, leaf_switch_count, job_ptr->wait4switch); } else { job_ptr->best_switch = true; } } /* Add additional resources as required from additional leaf switches */ prev_rem_nodes = rem_nodes + 1; while (1) { if (prev_rem_nodes == rem_nodes) break; /* Stalled */ prev_rem_nodes = rem_nodes; top_switch_inx = -1; for (i = 0; i < switch_record_cnt; i++) { if (switch_required[i] || !switch_node_bitmap[i] || (switch_record_table[i].level != 0)) continue; if (switch_node_cnt[i] && ((top_switch_inx == -1) || (switch_node_cnt[i] > switch_node_cnt[top_switch_inx]))) top_switch_inx = i; } if (top_switch_inx == -1) break; /* * NOTE: Ideally we would add nodes in order of resource * availability rather than in order of bitmap position, but * that would add even more complexity and overhead. 
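* Instead, each pass below picks the not-yet-required leaf switch with the most remaining candidate nodes and consumes its usable nodes in bitmap order, stopping once the request is satisfied or no progress can be made.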
*/ i_first = bit_ffs(switch_node_bitmap[top_switch_inx]); if (i_first >= 0) i_last = bit_fls(switch_node_bitmap[top_switch_inx]); else i_last = -2; for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) { if (!bit_test(switch_node_bitmap[top_switch_inx], i) || bit_test(node_map, i) || !avail_cpu_per_node[i]) continue; rem_nodes--; min_rem_nodes--; max_nodes--; avail_cpus = avail_cpu_per_node[i]; total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } bit_set(node_map, i); if ((rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id))) { rc = SLURM_SUCCESS; goto fini; } } switch_node_cnt[top_switch_inx] = 0; /* Used all */ } if ((min_rem_nodes <= 0) && (rem_cpus <= 0) && (!gres_per_job || gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id))) { rc = SLURM_SUCCESS; goto fini; } rc = SLURM_ERROR; fini: FREE_NULL_LIST(best_gres); FREE_NULL_LIST(node_weight_list); FREE_NULL_BITMAP(avail_nodes_bitmap); FREE_NULL_BITMAP(req_nodes_bitmap); FREE_NULL_BITMAP(req2_nodes_bitmap); FREE_NULL_BITMAP(best_nodes_bitmap); xfree(avail_cpu_per_node); xfree(switch_cpu_cnt); xfree(switch_gres); if (switch_node_bitmap) { for (i = 0; i < switch_record_cnt; i++) FREE_NULL_BITMAP(switch_node_bitmap[i]); xfree(switch_node_bitmap); } xfree(switch_node_cnt); xfree(switch_required); return rc; } static int _eval_nodes_lln(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, bool first_pass) { int i, i_start, i_end, error_code = SLURM_ERROR; int rem_cpus, rem_nodes; /* remaining resources desired */ int min_rem_nodes; /* remaining resources desired */ int total_cpus = 0; /* #CPUs allocated to job */ int64_t rem_max_cpus; struct job_details *details_ptr = job_ptr->details; bitstr_t *req_map = details_ptr->req_node_bitmap; bitstr_t *orig_node_map = bit_copy(node_map); bool all_done = false, gres_per_job; uint16_t avail_cpus = 0; node_record_t *node_ptr; List node_weight_list = NULL; node_weight_type *nwt; ListIterator iter; uint16_t *avail_cpu_per_node = NULL; bool enforce_binding = false; if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND)) enforce_binding = true; rem_cpus = details_ptr->min_cpus; rem_max_cpus = details_ptr->max_cpus; min_rem_nodes = min_nodes; if ((details_ptr->num_tasks != NO_VAL) && (details_ptr->num_tasks != 0)) max_nodes = MIN(max_nodes, details_ptr->num_tasks); if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) rem_nodes = MIN(min_nodes, req_nodes); else rem_nodes = MAX(min_nodes, req_nodes); i_start = bit_ffs(node_map); if (i_start >= 0) i_end = bit_fls(node_map); else i_end = i_start - 1; if (req_map) { for (i = i_start; i <= i_end; i++) { if (!bit_test(req_map, i)) { bit_clear(node_map, i); continue; } node_ptr = node_record_table_ptr + i; if (!bit_test(node_map, i)) { debug("%pJ required node %s not available", job_ptr, node_ptr->name); goto fini; } if (!avail_res_array[i] || !avail_res_array[i]->avail_cpus) { debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, 
min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if ((avail_cpus > 0) && (max_nodes > 0)) { total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; /* leaving bitmap set, decr max limit */ max_nodes--; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[i]-> sock_gres_list, avail_cpus); } } else { /* node not selected (yet) */ debug("%pJ required node %s not available", job_ptr, node_ptr->name); goto fini; } } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; bit_and(node_map, req_map); goto fini; } if (max_nodes <= 0) { error_code = SLURM_ERROR; goto fini; } bit_and_not(orig_node_map, node_map); } else { bit_clear_all(node_map); } /* Compute CPUs already allocated to required nodes */ if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { info("%pJ can't use required nodes due to max CPU limit", job_ptr); goto fini; } /* * Accumulate nodes from those with highest available CPU count. * Logic is optimized for small node/CPU count allocations. * For larger allocation, use list_sort(). */ if (max_nodes == 0) all_done = true; avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt); node_weight_list = _build_node_weight_list(orig_node_map); iter = list_iterator_create(node_weight_list); while (!all_done && (nwt = (node_weight_type *) list_next(iter))) { int last_max_cpu_cnt = -1; while (!all_done) { int max_cpu_idx = -1; for (i = i_start; i <= i_end; i++) { /* Node not available or already selected */ if (!bit_test(nwt->node_bitmap, i) || bit_test(node_map, i)) continue; _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if (avail_cpus == 0) continue; avail_cpu_per_node[i] = avail_cpus; if ((max_cpu_idx == -1) || (avail_cpu_per_node[max_cpu_idx] < avail_cpu_per_node[i])) { max_cpu_idx = i; if (avail_cpu_per_node[max_cpu_idx] == last_max_cpu_cnt) break; } } if ((max_cpu_idx == -1) || (avail_cpu_per_node[max_cpu_idx] == 0)) { /* No more usable nodes left, get next weight */ break; } i = max_cpu_idx; avail_cpus = avail_cpu_per_node[i]; last_max_cpu_cnt = avail_cpus; total_cpus += avail_cpus; if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { debug2("%s: %s: %pJ can't use node %d without exceeding job limit", plugin_type, __func__, job_ptr, i); bit_clear(nwt->node_bitmap, i); total_cpus -= avail_cpus; continue; } rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; max_nodes--; bit_set(node_map, i); if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; all_done = true; break; } if (max_nodes == 0) { all_done = true; break; } } } list_iterator_destroy(iter); if (error_code == SLURM_SUCCESS) { /* Already succeeded */ } else if ((rem_cpus > 0) || (min_rem_nodes > 0) || !gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { bit_clear_all(node_map); error_code = SLURM_ERROR; } else { error_code = SLURM_SUCCESS; } fini: FREE_NULL_LIST(node_weight_list); bit_free(orig_node_map); xfree(avail_cpu_per_node); return error_code; } /* * A variation 
of _eval_nodes() to select resources at the end of the node * list to reduce fragmentation */ static int _eval_nodes_serial(job_record_t *job_ptr, gres_mc_data_t *mc_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, bool first_pass) { int i, i_start, i_end, error_code = SLURM_ERROR; int rem_cpus, rem_nodes; /* remaining resources desired */ int min_rem_nodes; /* remaining resources desired */ int total_cpus = 0; /* #CPUs allocated to job */ int64_t rem_max_cpus; struct job_details *details_ptr = job_ptr->details; bitstr_t *req_map = details_ptr->req_node_bitmap; bitstr_t *orig_node_map = bit_copy(node_map); bool all_done = false, gres_per_job; uint16_t avail_cpus = 0; node_record_t *node_ptr; List node_weight_list = NULL; node_weight_type *nwt; ListIterator iter; bool enforce_binding = false; if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND)) enforce_binding = true; rem_cpus = details_ptr->min_cpus; rem_max_cpus = details_ptr->max_cpus; min_rem_nodes = min_nodes; if ((details_ptr->num_tasks != NO_VAL) && (details_ptr->num_tasks != 0)) max_nodes = MIN(max_nodes, details_ptr->num_tasks); if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) rem_nodes = MIN(min_nodes, req_nodes); else rem_nodes = MAX(min_nodes, req_nodes); i_start = bit_ffs(node_map); if (i_start >= 0) i_end = bit_fls(node_map); else i_end = i_start - 1; if (req_map) { for (i = i_start; i <= i_end; i++) { if (!bit_test(req_map, i)) { bit_clear(node_map, i); continue; } node_ptr = node_record_table_ptr + i; if (!bit_test(node_map, i)) { debug("%pJ required node %s not available", job_ptr, node_ptr->name); goto fini; } if (!avail_res_array[i] || !avail_res_array[i]->avail_cpus) { debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if ((avail_cpus > 0) && (max_nodes > 0)) { total_cpus += avail_cpus; rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; /* leaving bitmap set, decr max limit */ max_nodes--; if (gres_per_job) { gres_plugin_job_sched_add( job_ptr->gres_list, avail_res_array[i]-> sock_gres_list, avail_cpus); } } else { /* node not selected (yet) */ debug("%pJ required node %s lacks available resources", job_ptr, node_ptr->name); goto fini; } } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; bit_and(node_map, req_map); goto fini; } if (max_nodes <= 0) { error_code = SLURM_ERROR; goto fini; } bit_and_not(orig_node_map, node_map); } else { bit_clear_all(node_map); } /* Compute CPUs already allocated to required nodes */ if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { info("%pJ can't use required nodes due to max CPU limit", job_ptr); goto fini; } if (max_nodes == 0) all_done = true; node_weight_list = _build_node_weight_list(orig_node_map); iter = list_iterator_create(node_weight_list); while (!all_done && (nwt = (node_weight_type *) list_next(iter))) { for (i = i_end; ((i >= i_start) && (max_nodes > 0)); i--) { if (!avail_res_array[i] || !avail_res_array[i]->avail_cpus) continue; /* Node not available or already selected */ if 
(!bit_test(nwt->node_bitmap, i) || bit_test(node_map, i)) continue; _select_cores(job_ptr, mc_ptr, enforce_binding, i, &avail_cpus, max_nodes, min_rem_nodes, avail_core, avail_res_array, first_pass); _cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes, details_ptr, avail_res_array[i], i, cr_type); if (avail_cpus == 0) continue; total_cpus += avail_cpus; if ((details_ptr->max_cpus != NO_VAL) && (total_cpus > details_ptr->max_cpus)) { debug2("%s: %s: %pJ can't use node %d without exceeding job limit", plugin_type, __func__, job_ptr, i); total_cpus -= avail_cpus; continue; } rem_cpus -= avail_cpus; rem_max_cpus -= avail_cpus; rem_nodes--; min_rem_nodes--; max_nodes--; bit_set(node_map, i); if (gres_per_job) { gres_plugin_job_sched_add(job_ptr->gres_list, avail_res_array[i]->sock_gres_list, avail_cpus); } if ((rem_nodes <= 0) && (rem_cpus <= 0) && gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { error_code = SLURM_SUCCESS; all_done = true; break; } if (max_nodes == 0) { all_done = true; break; } } } list_iterator_destroy(iter); if (error_code == SLURM_SUCCESS) { /* Already succeeded */ } else if ((rem_cpus > 0) || (min_rem_nodes > 0) || !gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id)) { bit_clear_all(node_map); error_code = SLURM_ERROR; } else { error_code = SLURM_SUCCESS; } fini: FREE_NULL_LIST(node_weight_list); bit_free(orig_node_map); return error_code; } /* * This is an intermediary step between _select_nodes() and _eval_nodes() * to tackle the knapsack problem. This code incrementally removes nodes * with low CPU counts for the job and re-evaluates each result. * * RET SLURM_SUCCESS or an error code */ extern int choose_nodes(job_record_t *job_ptr, bitstr_t *node_map, bitstr_t **avail_core, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, avail_res_t **avail_res_array, uint16_t cr_type, bool prefer_alloc_nodes, gres_mc_data_t *tres_mc_ptr) { int i, i_first, i_last; int count, ec, most_res = 0, rem_nodes, node_cnt = 0; bitstr_t *orig_node_map, *req_node_map = NULL; bitstr_t **orig_core_array; if (job_ptr->details->req_node_bitmap) req_node_map = job_ptr->details->req_node_bitmap; /* clear nodes from the bitmap that don't have available resources */ i_first = bit_ffs(node_map); if (i_first >= 0) i_last = bit_fls(node_map); else i_last = i_first - 1; for (i = i_first; i <= i_last; i++) { if (!bit_test(node_map, i)) continue; /* * Make sure we don't say we can use a node exclusively * that is bigger than our whole-job maximum CPU count. */ if (((job_ptr->details->whole_node == 1) && (job_ptr->details->max_cpus != NO_VAL) && (job_ptr->details->max_cpus < avail_res_array[i]->avail_cpus)) || /* OR node has no CPUs */ (avail_res_array[i]->avail_cpus < 1)) { if (req_node_map && bit_test(req_node_map, i)) { /* can't clear a required node! */ return SLURM_ERROR; } bit_clear(node_map, i); } else { node_cnt++; } } if ((job_ptr->details->num_tasks > 1) && (max_nodes > job_ptr->details->num_tasks)) max_nodes = MAX(job_ptr->details->num_tasks, min_nodes); /* * _eval_nodes() might need to be called more than once and is * destructive of node_map and avail_core. Copy those bitmaps. 
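* On failure, the saved copies are OR'd back into node_map and avail_core so each retry starts from the original candidate set.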
*/ orig_node_map = bit_copy(node_map); orig_core_array = copy_core_array(avail_core); ec = _eval_nodes(job_ptr, tres_mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, true); if (ec == SLURM_SUCCESS) goto fini; bit_or(node_map, orig_node_map); core_array_or(avail_core, orig_core_array); rem_nodes = bit_set_count(node_map); if (rem_nodes <= min_nodes) { /* Cannot remove any nodes; enable use of non-local GRES */ ec = _eval_nodes(job_ptr, tres_mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, false); goto fini; } /* * This nodeset didn't work. To avoid a possible knapsack problem, * incrementally remove nodes with low resource counts (sum of CPU and * GPU count if using GPUs, otherwise the CPU count) and retry */ for (i = 0; i < select_node_cnt; i++) { if (avail_res_array[i]) { most_res = MAX(most_res, avail_res_array[i]->avail_res_cnt); } } for (count = 1; count < most_res; count++) { int nochange = 1; bit_or(node_map, orig_node_map); core_array_or(avail_core, orig_core_array); for (i = i_first; i <= i_last; i++) { if (!bit_test(node_map, i)) continue; if ((avail_res_array[i]->avail_res_cnt > 0) && (avail_res_array[i]->avail_res_cnt <= count)) { if (req_node_map && bit_test(req_node_map, i)) continue; nochange = 0; bit_clear(node_map, i); bit_clear(orig_node_map, i); if (--rem_nodes <= min_nodes) break; } } if (nochange && (count != 1)) continue; ec = _eval_nodes(job_ptr, tres_mc_ptr, node_map, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, false); if (ec == SLURM_SUCCESS) break; if (rem_nodes <= min_nodes) break; } fini: if ((ec == SLURM_SUCCESS) && job_ptr->gres_list && orig_core_array) { /* * Update available CPU count for any removed cores. * Cores are only removed for jobs with GRES to enforce binding. */ for (i = i_first; i <= i_last; i++) { if (!bit_test(node_map, i) || !orig_core_array[i] || !avail_core[i]) continue; count = bit_set_count(avail_core[i]); count *= select_node_record[i].vpus; avail_res_array[i]->avail_cpus = MIN(count, avail_res_array[i]->avail_cpus); if (avail_res_array[i]->avail_cpus == 0) { error("%s: %s: avail_cpus underflow for %pJ", plugin_type, __func__, job_ptr); if (req_node_map && bit_test(req_node_map, i)) { /* can't clear a required node! */ ec = SLURM_ERROR; } bit_clear(node_map, i); } } } FREE_NULL_BITMAP(orig_node_map); free_core_array(&orig_core_array); return ec; } /* * can_job_run_on_node - Given the job requirements, determine which * resources from the given node (if any) can be * allocated to this job. Returns a structure identifying * the resources available for allocation to this job. * NOTE: This process does NOT support overcommitting resources * * IN job_ptr - pointer to job requirements * IN/OUT core_map - per-node bitmap of available cores * IN node_i - index of node to be evaluated * IN s_p_n - Expected sockets_per_node (NO_VAL if not limited) * IN cr_type - Consumable Resource setting * IN test_only - Determine if job could ever run, ignore allocated memory * check * IN will_run - Set if determining when a pending job can start * IN part_core_map - per-node bitmap of cores allocated to jobs of this * partition or NULL if don't care * RET Available resources. Call common_free_avail_res() to release memory. * * NOTE: The returned cpu_count may be less than the number of set bits in * core_map for the given node.
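* Any extra bits represent cores that were evaluated but are not needed to satisfy the request.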
The cr_dist functions will determine * which bits to de-select from the core_map to match the cpu_count. */ extern avail_res_t *can_job_run_on_node(job_record_t *job_ptr, bitstr_t **core_map, const uint32_t node_i, uint32_t s_p_n, node_use_record_t *node_usage, uint16_t cr_type, bool test_only, bool will_run, bitstr_t **part_core_map) { uint16_t cpus = 0; uint64_t avail_mem = NO_VAL64, req_mem; int cpu_alloc_size, i, rc; node_record_t *node_ptr = node_record_table_ptr + node_i; List node_gres_list; bitstr_t *part_core_map_ptr = NULL, *req_sock_map = NULL; avail_res_t *avail_res = NULL; List sock_gres_list = NULL; bool enforce_binding = false; uint16_t min_cpus_per_node, ntasks_per_node = 1; if (((job_ptr->bit_flags & BACKFILL_TEST) == 0) && !test_only && !will_run && IS_NODE_COMPLETING(node_ptr)) { /* * Do not allocate more jobs to nodes with completing jobs; the * backfill scheduler independently handles completing nodes */ return NULL; } if (part_core_map) part_core_map_ptr = part_core_map[node_i]; if (node_usage[node_i].gres_list) node_gres_list = node_usage[node_i].gres_list; else node_gres_list = node_ptr->gres_list; if (job_ptr->gres_list) { /* Identify available GRES and adjacent cores */ if (job_ptr->bit_flags & GRES_ENFORCE_BIND) enforce_binding = true; if (!core_map[node_i]) { core_map[node_i] = bit_alloc( select_node_record[node_i].tot_cores); bit_set_all(core_map[node_i]); } sock_gres_list = gres_plugin_job_test2( job_ptr->gres_list, node_gres_list, test_only, core_map[node_i], select_node_record[node_i].tot_sockets, select_node_record[node_i].cores, job_ptr->job_id, node_ptr->name, enforce_binding, s_p_n, &req_sock_map, job_ptr->user_id, node_i); if (!sock_gres_list) { /* GRES requirement fail */ #if _DEBUG info("Test fail on node %d: gres_plugin_job_test2", node_i); #endif return NULL; } } /* Identify available CPUs */ if (cr_type & CR_CORE) { /* cpu_alloc_size = # of CPUs per core */ cpu_alloc_size = select_node_record[node_i].vpus; avail_res = common_allocate_cores(job_ptr, core_map[node_i], part_core_map_ptr, node_i, &cpu_alloc_size, false, req_sock_map); } else if (cr_type & CR_SOCKET) { /* cpu_alloc_size = # of CPUs per socket */ cpu_alloc_size = select_node_record[node_i].cores * select_node_record[node_i].vpus; avail_res = common_allocate_sockets(job_ptr, core_map[node_i], part_core_map_ptr, node_i, &cpu_alloc_size, req_sock_map); } else { /* cpu_alloc_size = 1 individual CPU */ cpu_alloc_size = 1; avail_res = common_allocate_cores(job_ptr, core_map[node_i], part_core_map_ptr, node_i, &cpu_alloc_size, true, req_sock_map); } FREE_NULL_BITMAP(req_sock_map); if (!avail_res || (avail_res->max_cpus == 0)) { common_free_avail_res(avail_res); #if _DEBUG info("Test fail on node %d: _allocate_cores/sockets", node_i); #endif FREE_NULL_LIST(sock_gres_list); return NULL; } /* Check that sufficient CPUs remain to run a task on this node */ if (job_ptr->details->ntasks_per_node) { ntasks_per_node = job_ptr->details->ntasks_per_node; } else if (job_ptr->details->overcommit) { ntasks_per_node = 1; } else if ((job_ptr->details->max_nodes == 1) && (job_ptr->details->num_tasks != 0)) { ntasks_per_node = job_ptr->details->num_tasks; } else if (job_ptr->details->max_nodes) { ntasks_per_node = (job_ptr->details->num_tasks + job_ptr->details->max_nodes - 1) / job_ptr->details->max_nodes; } min_cpus_per_node = ntasks_per_node * job_ptr->details->cpus_per_task; if (avail_res->max_cpus < min_cpus_per_node) { #if _DEBUG info("Test fail on node %d: max_cpus < min_cpus_per_node (%u < %u)", node_i,
avail_res->max_cpus, min_cpus_per_node); #endif FREE_NULL_LIST(sock_gres_list); common_free_avail_res(avail_res); return NULL; } if (cr_type & CR_MEMORY) { avail_mem = select_node_record[node_i].real_memory - select_node_record[node_i].mem_spec_limit; if (!test_only) avail_mem -= node_usage[node_i].alloc_memory; } if (sock_gres_list) { uint16_t near_gpu_cnt = 0; avail_res->sock_gres_list = sock_gres_list; /* Disable GRES that can't be used with remaining cores */ rc = gres_plugin_job_core_filter2( sock_gres_list, avail_mem, avail_res->max_cpus, enforce_binding, core_map[node_i], select_node_record[node_i].tot_sockets, select_node_record[node_i].cores, select_node_record[node_i].vpus, s_p_n, job_ptr->details->ntasks_per_node, (job_ptr->details->whole_node == 1), &avail_res->avail_gpus, &near_gpu_cnt); if (rc != 0) { #if _DEBUG info("Test fail on node %d: gres_plugin_job_core_filter2", node_i); #endif common_free_avail_res(avail_res); return NULL; } /* Favor nodes with more co-located GPUs */ node_ptr->sched_weight = (node_ptr->sched_weight & 0xffffffffffffff00) | (0xff - near_gpu_cnt); } cpus = avail_res->max_cpus; if (cr_type & CR_MEMORY) { /* * Memory Check: check pn_min_memory to see if: * - this node has enough memory (MEM_PER_CPU == 0) * - there are enough free_cores (MEM_PER_CPU == 1) */ req_mem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { /* memory is per-CPU */ if (!(job_ptr->bit_flags & BF_WHOLE_NODE_TEST) && ((req_mem * cpus) > avail_mem) && (job_ptr->details->whole_node == 1)) { cpus = 0; } else if (!(cr_type & CR_CPU) && job_ptr->details->mc_ptr && (job_ptr->details->mc_ptr-> ntasks_per_core == 1) && job_ptr->details->cpus_per_task == 1) { /* * In this scenario, CPUs represents cores and * the CPU/core count will be inflated later on * to include all of the threads on a core. So * we need to compare apples to apples and only * remove 1 CPU/core at a time. */ while ((cpus > 0) && ((req_mem * ((int) cpus * (int) select_node_record[node_i].vpus)) > avail_mem)) cpus -= 1; } else { while ((req_mem * cpus) > avail_mem) { if (cpus >= cpu_alloc_size) { cpus -= cpu_alloc_size; } else { cpus = 0; break; } } } if (job_ptr->details->cpus_per_task > 1) { i = cpus % job_ptr->details->cpus_per_task; cpus -= i; } if (cpus < job_ptr->details->ntasks_per_node) cpus = 0; /* FIXME: Need to recheck min_cores, etc. here */ } else { /* memory is per node */ if (req_mem > avail_mem) cpus = 0; } } if (cpus == 0) { #if _DEBUG info("Test fail on node %d: cpus == 0", node_i); #endif bit_clear_all(core_map[node_i]); } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: %u CPUs on %s(state:%d), mem %"PRIu64"/%"PRIu64, plugin_type, __func__, cpus, select_node_record[node_i].node_ptr->name, node_usage[node_i].node_state, node_usage[node_i].alloc_memory, select_node_record[node_i].real_memory); } avail_res->avail_cpus = cpus; avail_res->avail_res_cnt = cpus + avail_res->avail_gpus; _avail_res_log(avail_res, node_ptr->name); return avail_res; }
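/*
 * Illustrative sketch only (not part of this file's logic): a caller is
 * expected to invoke can_job_run_on_node() once per candidate node and
 * collect the results into an array indexed by node, treating a NULL
 * return as meaning the node is unusable for the job. The variable names
 * below (node_map, avail_res_array, etc.) are assumptions matching the
 * caller's context, not definitions from this file:
 *
 *	for (i = 0; i < select_node_cnt; i++) {
 *		if (!bit_test(node_map, i))
 *			continue;
 *		avail_res_array[i] = can_job_run_on_node(
 *			job_ptr, core_map, i, s_p_n, node_usage,
 *			cr_type, test_only, will_run, part_core_map);
 *	}
 */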