/*****************************************************************************\
* job_test.c - Determine if job can be allocated resources.
*****************************************************************************
* Copyright (C) 2018 SchedMD LLC
* Derived in large part from select/cons_res plugin
*
* This file is part of Slurm, a resource management program.
 * For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <string.h>
#include "select_cons_tres.h"
#include "dist_tasks.h"
#include "job_test.h"
#define _DEBUG 0 /* Enables module specific debugging */
typedef struct node_weight_struct {
bitstr_t *node_bitmap; /* bitmap of nodes with this weight */
uint32_t weight; /* priority of node for scheduling work on */
} node_weight_type;
typedef struct topo_weight_info {
	bitstr_t *node_bitmap;	/* bitmap of nodes with this weight */
	int node_cnt;		/* number of nodes in node_bitmap */
	uint64_t weight;	/* scheduling weight of these nodes */
} topo_weight_info_t;
/* Local functions */
static List _build_node_weight_list(bitstr_t *node_bitmap);
static void _cpus_to_use(uint16_t *avail_cpus, int64_t rem_max_cpus,
			 int rem_nodes, struct job_details *details_ptr,
			 avail_res_t *avail_res, int node_inx,
			 uint16_t cr_type);
static bool _enough_nodes(int avail_nodes, int rem_nodes,
uint32_t min_nodes, uint32_t req_nodes);
static int _eval_nodes(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
bitstr_t *node_map, bitstr_t **avail_core,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, avail_res_t **avail_res_array,
uint16_t cr_type, bool prefer_alloc_nodes,
bool first_pass);
static int _eval_nodes_busy(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_dfly(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_lln(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_serial(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_spread(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass);
static int _eval_nodes_topo(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass);
static int _node_weight_find(void *x, void *key);
static void _node_weight_free(void *x);
static int _node_weight_sort(void *x, void *y);
/* Find node_weight_type element from list with same weight as node config */
static int _node_weight_find(void *x, void *key)
{
node_weight_type *nwt = (node_weight_type *) x;
config_record_t *config_ptr = (config_record_t *) key;
if (nwt->weight == config_ptr->weight)
return 1;
return 0;
}
/* Free node_weight_type element from list */
static void _node_weight_free(void *x)
{
node_weight_type *nwt = (node_weight_type *) x;
bit_free(nwt->node_bitmap);
xfree(nwt);
}
/* Sort list of node_weight_type records in order of increasing node weight */
static int _node_weight_sort(void *x, void *y)
{
node_weight_type *nwt1 = *(node_weight_type **) x;
node_weight_type *nwt2 = *(node_weight_type **) y;
return (int) (nwt1->weight - nwt2->weight);
}
/*
* Given a bitmap of available nodes, return a list of node_weight_type
* records in order of increasing "weight" (priority)
*/
static List _build_node_weight_list(bitstr_t *node_bitmap)
{
int i, i_first, i_last;
List node_list;
node_record_t *node_ptr;
node_weight_type *nwt;
xassert(node_bitmap);
/* Build list of node_weight_type records, one per node weight */
node_list = list_create(_node_weight_free);
i_first = bit_ffs(node_bitmap);
if (i_first == -1)
return node_list;
i_last = bit_fls(node_bitmap);
for (i = i_first; i <= i_last; i++) {
if (!bit_test(node_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
nwt = list_find_first(node_list, _node_weight_find,
node_ptr->config_ptr);
if (!nwt) {
nwt = xmalloc(sizeof(node_weight_type));
nwt->node_bitmap = bit_alloc(select_node_cnt);
nwt->weight = node_ptr->config_ptr->weight;
list_append(node_list, nwt);
}
bit_set(nwt->node_bitmap, i);
}
/* Sort the list in order of increasing node weight */
list_sort(node_list, _node_weight_sort);
return node_list;
}
/* Log avail_res_t information for a given node */
static void _avail_res_log(avail_res_t *avail_res, char *node_name)
{
#if _DEBUG
int i;
char *gres_info = "";
if (!avail_res) {
info("Node:%s No resources", node_name);
return;
}
info("Node:%s Sockets:%u SpecThreads:%u CPUs:Min-Max,Avail:%u-%u,%u VPUs:%u",
node_name, avail_res->sock_cnt, avail_res->spec_threads,
avail_res->min_cpus, avail_res->max_cpus, avail_res->avail_cpus,
avail_res->vpus);
gres_info = gres_plugin_sock_str(avail_res->sock_gres_list, -1);
if (gres_info) {
info(" AnySocket %s", gres_info);
xfree(gres_info);
}
for (i = 0; i < avail_res->sock_cnt; i++) {
gres_info = gres_plugin_sock_str(avail_res->sock_gres_list, i);
if (gres_info) {
info(" Socket[%d] Cores:%u GRES:%s", i,
avail_res->avail_cores_per_sock[i], gres_info);
xfree(gres_info);
} else {
info(" Socket[%d] Cores:%u", i,
avail_res->avail_cores_per_sock[i]);
}
}
#endif
}
/*
* Determine how many CPUs on the node can be used based upon the resource
* allocation unit (node, socket, core, etc.) and making sure that
* resources will be available for nodes considered later in the
* scheduling process
 * IN/OUT avail_cpus - Count of CPUs to use on this node
* IN rem_max_cpus - Maximum count of CPUs remaining to be allocated for job
* IN rem_nodes - Count of nodes remaining to be allocated for job
* IN details_ptr - Job details information
 * IN/OUT avail_res - Available resources for job on this node, contents updated
 * IN node_inx - Node index
 * IN cr_type - Resource allocation units (CR_CORE, CR_SOCKET, etc.)
*/
static void _cpus_to_use(uint16_t *avail_cpus, int64_t rem_max_cpus,
int rem_nodes, struct job_details *details_ptr,
avail_res_t *avail_res, int node_inx,
uint16_t cr_type)
{
int resv_cpus; /* CPUs to be allocated on other nodes */
if (details_ptr->whole_node == 1) /* Use all resources on node */
return;
resv_cpus = MAX((rem_nodes - 1), 0);
resv_cpus *= common_cpus_per_core(details_ptr, node_inx);
if (cr_type & CR_SOCKET)
resv_cpus *= select_node_record[node_inx].cores;
rem_max_cpus -= resv_cpus;
if (*avail_cpus > rem_max_cpus) {
*avail_cpus = MAX(rem_max_cpus, (int)details_ptr->pn_min_cpus);
*avail_cpus = MAX(*avail_cpus, details_ptr->min_gres_cpu);
/* Round up CPU count to CPU in allocation unit (e.g. core) */
avail_res->avail_cpus = *avail_cpus;
}
avail_res->avail_res_cnt = avail_res->avail_cpus +
avail_res->avail_gpus;
}
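/*
 * Determine if the remaining node requirement can still be satisfied by
 * avail_nodes more nodes. If the job requested more nodes than its minimum
 * (req_nodes > min_nodes), fewer than req_nodes may be accepted, so the
 * shortfall is deducted from the remaining count (e.g. with min_nodes=2,
 * req_nodes=4 and rem_nodes=4, two more available nodes suffice).
 */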
static bool _enough_nodes(int avail_nodes, int rem_nodes,
uint32_t min_nodes, uint32_t req_nodes)
{
int needed_nodes;
if (req_nodes > min_nodes)
needed_nodes = rem_nodes + min_nodes - req_nodes;
else
needed_nodes = rem_nodes;
return (avail_nodes >= needed_nodes);
}
/*
* Identify the specific cores and GRES available to this job on this node.
* The job's requirements for tasks-per-socket, cpus-per-task, etc. are
* not considered at this point, but must be considered later.
* IN job_ptr - job attempting to be scheduled
 * IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero
 * IN enforce_binding - if true, GRES must be co-allocated with cores
 * IN node_inx - zero-origin node index
 * OUT avail_cpus - count of CPUs available to the job on this node
 * IN max_nodes - maximum additional node count to allocate
 * IN rem_nodes - desired additional node count to allocate
 * IN avail_core - available core bitmap, UPDATED
 * IN avail_res_array - available resources on the node
* IN first_pass - set if first scheduling attempt for this job, only use
* co-located GRES and cores
*/
static void _select_cores(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
bool enforce_binding, int node_inx,
uint16_t *avail_cpus, uint32_t max_nodes,
int rem_nodes, bitstr_t **avail_core,
avail_res_t **avail_res_array, bool first_pass)
{
int alloc_tasks = 0;
uint32_t min_tasks_this_node = 0, max_tasks_this_node = 0;
struct job_details *details_ptr = job_ptr->details;
rem_nodes = MIN(rem_nodes, 1); /* If range of node counts */
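	/*
	 * Bound the number of tasks this node could host using the job's
	 * task distribution options, checked from most to least specific:
	 * ntasks-per-node, then per-board, per-socket, per-core, and
	 * finally the job-wide task count.
	 */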
if (mc_ptr->ntasks_per_node) {
min_tasks_this_node = mc_ptr->ntasks_per_node;
max_tasks_this_node = mc_ptr->ntasks_per_node;
} else if (mc_ptr->ntasks_per_board) {
min_tasks_this_node = mc_ptr->ntasks_per_board;
max_tasks_this_node = mc_ptr->ntasks_per_board *
select_node_record[node_inx].boards;
} else if (mc_ptr->ntasks_per_socket) {
min_tasks_this_node = mc_ptr->ntasks_per_socket;
max_tasks_this_node = mc_ptr->ntasks_per_socket *
select_node_record[node_inx].tot_sockets;
} else if (mc_ptr->ntasks_per_core) {
min_tasks_this_node = mc_ptr->ntasks_per_core;
max_tasks_this_node = mc_ptr->ntasks_per_core *
select_node_record[node_inx].tot_cores;
} else if (details_ptr && (details_ptr->max_nodes == 1)) {
if ((details_ptr->num_tasks == NO_VAL) ||
(details_ptr->num_tasks == 0)) {
min_tasks_this_node = 1;
max_tasks_this_node = NO_VAL;
} else {
min_tasks_this_node = details_ptr->num_tasks;
max_tasks_this_node = details_ptr->num_tasks;
}
} else if (details_ptr &&
((details_ptr->num_tasks == 1) ||
((details_ptr->num_tasks == details_ptr->min_nodes) &&
(details_ptr->num_tasks == details_ptr->max_nodes)))) {
min_tasks_this_node = 1;
max_tasks_this_node = 1;
} else {
min_tasks_this_node = 1;
max_tasks_this_node = NO_VAL;
}
/* Determine how many tasks can be started on this node */
if (mc_ptr->cpus_per_task &&
(!details_ptr || !details_ptr->overcommit)) {
alloc_tasks = avail_res_array[node_inx]->avail_cpus /
mc_ptr->cpus_per_task;
if (alloc_tasks < min_tasks_this_node)
max_tasks_this_node = 0;
}
*avail_cpus = avail_res_array[node_inx]->avail_cpus;
if (job_ptr->gres_list) {
gres_plugin_job_core_filter3(mc_ptr,
avail_res_array[node_inx]->sock_gres_list,
avail_res_array[node_inx]->sock_cnt,
select_node_record[node_inx].cores,
select_node_record[node_inx].vpus, avail_cpus,
&min_tasks_this_node, &max_tasks_this_node,
rem_nodes, enforce_binding, first_pass,
avail_core[node_inx]);
}
if (max_tasks_this_node == 0) {
*avail_cpus = 0;
} else if ((slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE) &&
((mc_ptr->ntasks_per_core == INFINITE16) ||
(mc_ptr->ntasks_per_core == 0)) &&
details_ptr && (details_ptr->min_gres_cpu == 0)) {
*avail_cpus = bit_set_count(avail_core[node_inx]);
}
}
/*
* This is the heart of the selection process
* IN job_ptr - job attempting to be scheduled
* IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero
* IN node_map - bitmap of available/selected nodes, UPDATED
* IN avail_core - available core bitmap, UPDATED
* IN min_nodes - minimum node allocation size in nodes
* IN max_nodes - maximum node allocation size in nodes
 * IN req_nodes - number of requested nodes
* IN avail_res_array - available resources on the node
* IN cr_type - allocation type (sockets, cores, etc.)
* IN prefer_alloc_nodes - if set, prefer use of already allocated nodes
* IN first_pass - set if first scheduling attempt for this job, be picky
* RET SLURM_SUCCESS or an error code
*/
static int _eval_nodes(job_record_t *job_ptr, gres_mc_data_t *mc_ptr,
bitstr_t *node_map, bitstr_t **avail_core,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, avail_res_t **avail_res_array,
uint16_t cr_type, bool prefer_alloc_nodes,
bool first_pass)
{
int i, j, error_code = SLURM_ERROR;
int *consec_cpus; /* how many CPUs we can add from this
* consecutive set of nodes */
List *consec_gres; /* how many GRES we can add from this
* consecutive set of nodes */
int *consec_nodes; /* how many nodes we can add from this
* consecutive set of nodes */
int *consec_start; /* where this consecutive set starts (index) */
int *consec_end; /* where this consecutive set ends (index) */
int *consec_req; /* are nodes from this set required
* (in req_bitmap) */
uint64_t *consec_weight; /* node scheduling weight */
node_record_t *node_ptr = NULL;
int consec_index, consec_size, sufficient;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int best_fit_nodes, best_fit_cpus, best_fit_req;
int best_fit_sufficient, best_fit_index = 0;
bool new_best;
uint64_t best_weight = 0;
uint16_t avail_cpus = 0;
int64_t rem_max_cpus;
int total_cpus = 0; /* #CPUs allocated to job */
bool gres_per_job, required_node;
struct job_details *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
bool enforce_binding = false;
uint16_t *avail_cpu_per_node = NULL;
xassert(node_map);
if (select_node_cnt != node_record_count) {
error("%s: node count inconsistent with slurmctld (%u != %u)",
plugin_type, select_node_cnt, node_record_count);
return error_code;
}
if (bit_set_count(node_map) < min_nodes)
return error_code;
if ((details_ptr->req_node_bitmap) &&
(!bit_super_set(details_ptr->req_node_bitmap, node_map)))
return error_code;
if (job_ptr->bit_flags & SPREAD_JOB) {
/* Spread the job out over many nodes */
return _eval_nodes_spread(job_ptr, mc_ptr, node_map, avail_core,
min_nodes, max_nodes, req_nodes,
avail_res_array, cr_type,
prefer_alloc_nodes, first_pass);
}
if (prefer_alloc_nodes && !details_ptr->contiguous) {
/*
* Select resource on busy nodes first in order to leave
* idle resources free for as long as possible so that longer
* running jobs can get more easily started by the backfill
* scheduler plugin
*/
return _eval_nodes_busy(job_ptr, mc_ptr, node_map, avail_core,
min_nodes, max_nodes, req_nodes,
avail_res_array, cr_type,
prefer_alloc_nodes, first_pass);
}
if ((cr_type & CR_LLN) ||
(job_ptr->part_ptr &&
(job_ptr->part_ptr->flags & PART_FLAG_LLN))) {
/* Select resource on the Least Loaded Node */
return _eval_nodes_lln(job_ptr, mc_ptr, node_map, avail_core,
min_nodes, max_nodes, req_nodes,
avail_res_array, cr_type,
prefer_alloc_nodes, first_pass);
}
if (pack_serial_at_end &&
(details_ptr->min_cpus == 1) && (req_nodes == 1)) {
/*
* Put serial jobs at the end of the available node list
* rather than using a best-fit algorithm, which fragments
* resources.
*/
return _eval_nodes_serial(job_ptr, mc_ptr, node_map, avail_core,
min_nodes, max_nodes, req_nodes,
avail_res_array, cr_type,
prefer_alloc_nodes, first_pass);
}
if (switch_record_cnt && switch_record_table &&
!details_ptr->contiguous &&
((topo_optional == false) || job_ptr->req_switch)) {
/* Perform optimized resource selection based upon topology */
if (have_dragonfly) {
return _eval_nodes_dfly(job_ptr, mc_ptr, node_map,
avail_core, min_nodes,
max_nodes, req_nodes,
avail_res_array, cr_type,
prefer_alloc_nodes, first_pass);
} else {
return _eval_nodes_topo(job_ptr, mc_ptr, node_map,
avail_core, min_nodes,
max_nodes, req_nodes,
avail_res_array, cr_type,
prefer_alloc_nodes, first_pass);
}
}
if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND))
enforce_binding = true;
/* make allocation for 50 sets of consecutive nodes, expand as needed */
consec_size = 50;
consec_cpus = xmalloc(sizeof(int) * consec_size);
consec_nodes = xmalloc(sizeof(int) * consec_size);
consec_start = xmalloc(sizeof(int) * consec_size);
consec_end = xmalloc(sizeof(int) * consec_size);
consec_req = xmalloc(sizeof(int) * consec_size);
consec_weight = xmalloc(sizeof(uint64_t) * consec_size);
/* Build table with information about sets of consecutive nodes */
consec_index = 0;
consec_req[consec_index] = -1; /* no required nodes here by default */
consec_weight[consec_index] = NO_VAL64;
avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt);
rem_cpus = details_ptr->min_cpus;
rem_max_cpus = details_ptr->max_cpus;
min_rem_nodes = min_nodes;
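	/*
	 * When GRES are requested on a per-job basis, start from the lower
	 * node count bound: gres_plugin_job_sched_test() must also pass
	 * before the allocation is considered complete, so nodes beyond
	 * this count can still be added to satisfy the GRES request.
	 */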
if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list))) {
rem_nodes = MIN(min_nodes, req_nodes);
consec_gres = xmalloc(sizeof(List) * consec_size);
} else
rem_nodes = MAX(min_nodes, req_nodes);
/*
* If there are required nodes, first determine the resources they
* provide, then select additional resources as needed in next loop
*/
if (req_map) {
int i_first, i_last;
i_first = bit_ffs(req_map);
if (i_first >= 0) {
i_last = bit_fls(req_map);
if (((i_last - i_first + 1) > max_nodes) &&
(bit_set_count(req_map) > max_nodes))
goto fini;
} else
i_last = i_first - 1;
for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) {
if (!bit_test(req_map, i))
continue;
node_ptr = node_record_table_ptr + i;
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if (avail_cpus == 0) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
avail_cpu_per_node[i] = avail_cpus;
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(node_map, req_map);
goto fini;
}
if (max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
}
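	/*
	 * Scan all nodes, grouping consecutive usable nodes of equal
	 * scheduling weight into "consec" records; a change in weight or a
	 * gap of unusable nodes ends one record and starts the next.
	 */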
for (i = 0; i < select_node_cnt; i++) { /* For each node */
if ((consec_index + 1) >= consec_size) {
consec_size *= 2;
xrealloc(consec_cpus, sizeof(int) * consec_size);
xrealloc(consec_nodes, sizeof(int) * consec_size);
xrealloc(consec_start, sizeof(int) * consec_size);
xrealloc(consec_end, sizeof(int) * consec_size);
xrealloc(consec_req, sizeof(int) * consec_size);
xrealloc(consec_weight,
sizeof(uint64_t) * consec_size);
if (gres_per_job) {
xrealloc(consec_gres,
sizeof(List) * consec_size);
}
}
if (req_map)
required_node = bit_test(req_map, i);
else
required_node = false;
if (!bit_test(node_map, i)) {
node_ptr = NULL; /* Use as flag, avoid second test */
} else if (required_node) {
node_ptr = node_record_table_ptr + i;
} else {
node_ptr = node_record_table_ptr + i;
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
if (avail_cpus == 0) {
bit_clear(node_map, i);
node_ptr = NULL;
}
avail_cpu_per_node[i] = avail_cpus;
}
/*
* If job requested contiguous nodes,
* do not worry about matching node weights
*/
if (node_ptr &&
!details_ptr->contiguous &&
(consec_weight[consec_index] != NO_VAL64) && /* Init value*/
(node_ptr->sched_weight != consec_weight[consec_index])) {
/* End last consecutive set, setup start of next set */
if (consec_nodes[consec_index] == 0) {
/* Only required nodes, re-use consec record */
consec_req[consec_index] = -1;
} else {
/* End last set, setup for start of next set */
consec_end[consec_index] = i - 1;
consec_req[++consec_index] = -1;
}
}
if (node_ptr) {
if (consec_nodes[consec_index] == 0)
consec_start[consec_index] = i;
if (required_node) {
/*
* Required node, resources counters updated
* in above loop, leave bitmap set
*/
if (consec_req[consec_index] == -1) {
/* first required node in set */
consec_req[consec_index] = i;
}
continue;
}
/* node not selected (yet) */
bit_clear(node_map, i);
consec_cpus[consec_index] += avail_cpus;
consec_nodes[consec_index]++;
if (gres_per_job) {
gres_plugin_job_sched_consec(
&consec_gres[consec_index],
job_ptr->gres_list,
avail_res_array[i]->sock_gres_list);
}
consec_weight[consec_index] = node_ptr->sched_weight;
} else if (consec_nodes[consec_index] == 0) {
/* Only required nodes, re-use consec record */
consec_req[consec_index] = -1;
consec_weight[consec_index] = NO_VAL64;
} else {
/* End last set, setup for start of next set */
consec_end[consec_index] = i - 1;
consec_req[++consec_index] = -1;
consec_weight[consec_index] = NO_VAL64;
}
}
if (consec_nodes[consec_index] != 0)
consec_end[consec_index++] = i - 1;
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
if (consec_index == 0) {
info("%s: %s: consec_index is zero", plugin_type,
__func__);
}
for (i = 0; i < consec_index; i++) {
char *gres_str = NULL, *gres_print = "";
bitstr_t *host_bitmap;
char *host_list;
if (gres_per_job) {
gres_str = gres_plugin_job_sched_str(
consec_gres[i],
job_ptr->gres_list);
if (gres_str) {
xstrcat(gres_str, " ");
gres_print = gres_str;
}
}
host_bitmap = bit_alloc(select_node_cnt);
bit_nset(host_bitmap, consec_start[i], consec_end[i]);
host_list = bitmap2node_name(host_bitmap);
info("%s: eval_nodes: set:%d consec "
"CPUs:%d nodes:%d:%s %sbegin:%d end:%d required:%d weight:%"PRIu64,
plugin_type, i, consec_cpus[i], consec_nodes[i],
host_list, gres_print, consec_start[i],
consec_end[i], consec_req[i], consec_weight[i]);
bit_free(host_bitmap);
xfree(gres_str);
xfree(host_list);
}
}
/* Compute CPUs already allocated to required nodes */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%s: %s: %pJ can't use required nodes due to max CPU limit",
plugin_type, __func__, job_ptr);
goto fini;
}
/*
* accumulate nodes from these sets of consecutive nodes until
* sufficient resources have been accumulated
*/
while (consec_index && (max_nodes > 0)) {
best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
best_fit_req = -1; /* first required node, -1 if none */
for (i = 0; i < consec_index; i++) {
if (consec_nodes[i] == 0)
continue; /* no usable nodes here */
if (details_ptr->contiguous &&
details_ptr->req_node_bitmap &&
(consec_req[i] == -1))
continue; /* not required nodes */
sufficient = (consec_cpus[i] >= rem_cpus) &&
_enough_nodes(consec_nodes[i], rem_nodes,
min_nodes, req_nodes);
if (sufficient && gres_per_job) {
sufficient = gres_plugin_job_sched_sufficient(
job_ptr->gres_list,
consec_gres[i]);
}
/*
* if first possibility OR
* contains required nodes OR
* lowest node weight
*/
if ((best_fit_nodes == 0) ||
((best_fit_req == -1) && (consec_req[i] != -1)) ||
(consec_weight[i] < best_weight))
new_best = true;
else
new_best = false;
/*
* If equal node weight
* first set large enough for request OR
* tightest fit (less resource/CPU waste) OR
* nothing yet large enough, but this is biggest
*/
if (!new_best && (consec_weight[i] == best_weight) &&
((sufficient && (best_fit_sufficient == 0)) ||
(sufficient && (consec_cpus[i] < best_fit_cpus)) ||
(!sufficient &&
(consec_cpus[i] > best_fit_cpus))))
new_best = true;
/*
* if first continuous node set large enough
*/
if (!new_best && !best_fit_sufficient &&
details_ptr->contiguous && sufficient)
new_best = true;
if (new_best) {
best_fit_cpus = consec_cpus[i];
best_fit_nodes = consec_nodes[i];
best_fit_index = i;
best_fit_req = consec_req[i];
best_fit_sufficient = sufficient;
best_weight = consec_weight[i];
}
if (details_ptr->contiguous &&
details_ptr->req_node_bitmap) {
/*
* Must wait for all required nodes to be
* in a single consecutive block
*/
int j, other_blocks = 0;
for (j = (i+1); j < consec_index; j++) {
if (consec_req[j] != -1) {
other_blocks = 1;
break;
}
}
if (other_blocks) {
best_fit_nodes = 0;
break;
}
}
}
if (best_fit_nodes == 0)
break;
if (details_ptr->contiguous && !best_fit_sufficient)
break; /* no hole large enough */
if (best_fit_req != -1) {
/*
* This collection of nodes includes required ones
* select nodes from this set, first working up
* then down from the required nodes
*/
for (i = best_fit_req;
i <= consec_end[best_fit_index]; i++) {
if ((max_nodes == 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(
job_ptr->gres_list,
job_ptr->job_id))))
break;
if (bit_test(node_map, i)) {
/* required node already in set */
continue;
}
if (avail_cpu_per_node[i] == 0)
continue;
avail_cpus = avail_cpu_per_node[i];
/*
* This could result in 0, but if the user
* requested nodes here we will still give
* them and then the step layout will sort
* things out.
*/
_cpus_to_use(&avail_cpus, rem_max_cpus,
min_rem_nodes, details_ptr,
avail_res_array[i], i, cr_type);
/* enforce the max_cpus limit */
total_cpus += avail_cpus;
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
plugin_type, __func__,
job_ptr, i);
total_cpus -= avail_cpus;
continue;
}
bit_set(node_map, i);
rem_nodes--;
min_rem_nodes--;
max_nodes--;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list, avail_cpus);
}
}
for (i = (best_fit_req - 1);
i >= consec_start[best_fit_index]; i--) {
if ((max_nodes == 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(
job_ptr->gres_list,
job_ptr->job_id))))
break;
if (bit_test(node_map, i))
continue;
if (avail_cpu_per_node[i] == 0)
continue;
avail_cpus = avail_cpu_per_node[i];
/*
* This could result in 0, but if the user
* requested nodes here we will still give
* them and then the step layout will sort
* things out.
*/
_cpus_to_use(&avail_cpus, rem_max_cpus,
min_rem_nodes, details_ptr,
avail_res_array[i], i, cr_type);
total_cpus += avail_cpus;
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
plugin_type, __func__,
job_ptr, i);
total_cpus -= avail_cpus;
continue;
}
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
bit_set(node_map, i);
rem_nodes--;
min_rem_nodes--;
max_nodes--;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list, avail_cpus);
}
}
} else {
/* No required nodes, try best fit single node */
int best_fit = -1, best_size = 0;
int first = consec_start[best_fit_index];
int last = consec_end[best_fit_index];
if (rem_nodes <= 1) {
for (i = first, j = 0; i <= last; i++, j++) {
if (bit_test(node_map, i) ||
!avail_res_array[i])
continue;
if (avail_cpu_per_node[i] < rem_cpus)
continue;
if (gres_per_job &&
!gres_plugin_job_sched_test2(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list,
job_ptr->job_id)) {
continue;
}
if ((best_fit == -1) ||
				    (avail_cpu_per_node[i] < best_size)) {
					best_fit = i;
					best_size = avail_cpu_per_node[i];
				}
			}
			if (best_fit != -1) {
				avail_cpus = avail_cpu_per_node[best_fit];
				/*
				 * This could result in 0, but if the user
				 * requested nodes here we will still give
				 * them and then the step layout will sort
				 * things out.
				 */
				_cpus_to_use(&avail_cpus, rem_max_cpus,
					     min_rem_nodes, details_ptr,
					     avail_res_array[best_fit],
					     best_fit, cr_type);
				/* enforce the max_cpus limit */
				total_cpus += avail_cpus;
				if ((details_ptr->max_cpus != NO_VAL) &&
				    (total_cpus > details_ptr->max_cpus)) {
					debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
					       plugin_type, __func__,
					       job_ptr, best_fit);
					total_cpus -= avail_cpus;
				} else {
					rem_cpus -= avail_cpus;
					rem_max_cpus -= avail_cpus;
					bit_set(node_map, best_fit);
					rem_nodes--;
					min_rem_nodes--;
					max_nodes--;
					if (gres_per_job) {
						gres_plugin_job_sched_add(
							job_ptr->gres_list,
							avail_res_array
							[best_fit]->
							sock_gres_list,
							avail_cpus);
					}
				}
			}
		} else {
			for (i = first, j = 0; i <= last; i++, j++) {
				if ((max_nodes == 0) ||
				    ((rem_nodes <= 0) && (rem_cpus <= 0) &&
				     (!gres_per_job ||
				      gres_plugin_job_sched_test(
					      job_ptr->gres_list,
					      job_ptr->job_id))))
					break;
if (bit_test(node_map, i) ||
!avail_res_array[i])
continue;
avail_cpus = avail_cpu_per_node[i];
if (avail_cpus <= 0)
continue;
if ((max_nodes == 1) &&
(avail_cpus < rem_cpus)) {
/*
* Job can only take one more node and
* this one has insufficient CPU
*/
continue;
}
/*
* This could result in 0, but if the user
* requested nodes here we will still give
* them and then the step layout will sort
* things out.
*/
_cpus_to_use(&avail_cpus, rem_max_cpus,
min_rem_nodes, details_ptr,
avail_res_array[i], i, cr_type);
total_cpus += avail_cpus;
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
plugin_type, __func__,
job_ptr, i);
total_cpus -= avail_cpus;
continue;
}
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
bit_set(node_map, i);
rem_nodes--;
min_rem_nodes--;
max_nodes--;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list, avail_cpus);
}
}
			}
		}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
break;
}
consec_cpus[best_fit_index] = 0;
consec_nodes[best_fit_index] = 0;
}
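	/*
	 * All node sets exhausted: the job can still succeed here if its
	 * CPU and GRES requirements were met and no more nodes are needed.
	 */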
if (error_code && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id) &&
_enough_nodes(0, rem_nodes, min_nodes, req_nodes))
error_code = SLURM_SUCCESS;
fini: xfree(avail_cpu_per_node);
xfree(consec_cpus);
xfree(consec_nodes);
xfree(consec_start);
xfree(consec_end);
xfree(consec_req);
xfree(consec_weight);
if (gres_per_job) {
for (i = 0; i < consec_size; i++)
FREE_NULL_LIST(consec_gres[i]);
xfree(consec_gres);
}
return error_code;
}
/*
* A variation of _eval_nodes() to select resources using as many nodes as
* possible.
*/
static int _eval_nodes_spread(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
int64_t rem_max_cpus;
struct job_details *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
bitstr_t *orig_node_map = bit_copy(node_map);
bool all_done = false, gres_per_job;
uint16_t avail_cpus = 0;
node_record_t *node_ptr;
List node_weight_list = NULL;
node_weight_type *nwt;
ListIterator iter;
bool enforce_binding = false;
if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND))
enforce_binding = true;
rem_cpus = details_ptr->min_cpus;
rem_max_cpus = details_ptr->max_cpus;
min_rem_nodes = min_nodes;
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
max_nodes = MIN(max_nodes, details_ptr->num_tasks);
if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
i_start = bit_ffs(node_map);
if (i_start >= 0)
i_end = bit_fls(node_map);
else
i_end = i_start - 1;
if (req_map) {
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(node_map, i);
continue;
}
node_ptr = node_record_table_ptr + i;
if (!bit_test(node_map, i)) {
debug("%pJ required node %s not available",
job_ptr, node_ptr->name);
continue;
}
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if ((avail_cpus > 0) && (max_nodes > 0)) {
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
/* leaving bitmap set, decr max limit */
max_nodes--;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list, avail_cpus);
}
			} else { /* required node unusable */
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(node_map, req_map);
goto fini;
}
if (max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
bit_and_not(orig_node_map, node_map);
} else {
bit_clear_all(node_map);
}
/* Compute CPUs already allocated to required nodes */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
if (max_nodes == 0)
all_done = true;
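	/*
	 * Visit the remaining nodes in order of increasing scheduling
	 * weight, taking every usable node until the request is satisfied;
	 * this spreads the job across as many nodes as possible.
	 */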
node_weight_list = _build_node_weight_list(orig_node_map);
iter = list_iterator_create(node_weight_list);
while (!all_done && (nwt = (node_weight_type *) list_next(iter))) {
for (i = i_start; i <= i_end; i++) {
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus)
continue;
/* Node not available or already selected */
if (!bit_test(nwt->node_bitmap, i) ||
bit_test(node_map, i))
continue;
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if (avail_cpus == 0)
continue;
total_cpus += avail_cpus;
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
plugin_type, __func__, job_ptr, i);
total_cpus -= avail_cpus;
continue;
}
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
bit_set(node_map, i);
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
all_done = true;
break;
}
if (max_nodes == 0) {
all_done = true;
break;
}
}
}
list_iterator_destroy(iter);
if (error_code == SLURM_SUCCESS) {
/* Already succeeded */
} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
bit_clear_all(node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini: FREE_NULL_LIST(node_weight_list);
bit_free(orig_node_map);
return error_code;
}
/*
* A variation of _eval_nodes() to select resources using busy nodes first.
*/
static int _eval_nodes_busy(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int idle_test;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
int64_t rem_max_cpus;
struct job_details *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
bitstr_t *orig_node_map = bit_copy(node_map);
bool all_done = false, gres_per_job;
uint16_t avail_cpus = 0;
node_record_t *node_ptr;
List node_weight_list = NULL;
node_weight_type *nwt;
ListIterator iter;
bool enforce_binding = false;
if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND))
enforce_binding = true;
rem_cpus = details_ptr->min_cpus;
rem_max_cpus = details_ptr->max_cpus;
min_rem_nodes = min_nodes;
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
max_nodes = MIN(max_nodes, details_ptr->num_tasks);
if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
i_start = bit_ffs(node_map);
if (i_start >= 0)
i_end = bit_fls(node_map);
else
i_end = i_start - 1;
if (req_map) {
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(node_map, i);
continue;
}
node_ptr = node_record_table_ptr + i;
if (!bit_test(node_map, i)) {
debug("%pJ required node %s not available",
job_ptr, node_ptr->name);
continue;
}
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if ((avail_cpus > 0) && (max_nodes > 0)) {
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
/* leaving bitmap set, decr max limit */
if (max_nodes)
max_nodes--;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list, avail_cpus);
}
			} else { /* required node unusable */
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(node_map, req_map);
goto fini;
}
if (max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
bit_and_not(orig_node_map, node_map);
} else {
bit_clear_all(node_map);
}
/* Compute CPUs already allocated to required nodes */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
/*
* Start by using nodes that already have a job running.
* Then try to use idle nodes.
*/
if (max_nodes == 0)
all_done = true;
node_weight_list = _build_node_weight_list(orig_node_map);
iter = list_iterator_create(node_weight_list);
while (!all_done && (nwt = (node_weight_type *) list_next(iter))) {
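		/* Pass 0 considers only busy nodes, pass 1 only idle nodes */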
for (idle_test = 0; idle_test < 2; idle_test++) {
for (i = i_start; i <= i_end; i++) {
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus)
continue;
/* Node not available or already selected */
if (!bit_test(nwt->node_bitmap, i) ||
bit_test(node_map, i))
continue;
if (((idle_test == 0) &&
bit_test(idle_node_bitmap, i)) ||
((idle_test == 1) &&
!bit_test(idle_node_bitmap, i)))
continue;
_select_cores(job_ptr, mc_ptr, enforce_binding,
i, &avail_cpus, max_nodes,
min_rem_nodes, avail_core,
avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus,
min_rem_nodes, details_ptr,
avail_res_array[i], i, cr_type);
if (avail_cpus == 0)
continue;
total_cpus += avail_cpus;
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
plugin_type, __func__, job_ptr,
i);
total_cpus -= avail_cpus;
continue;
}
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
bit_set(node_map, i);
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list,
avail_cpus);
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(
job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
all_done = true;
break;
}
if (max_nodes == 0) {
all_done = true;
break;
}
}
}
}
list_iterator_destroy(iter);
if (error_code == SLURM_SUCCESS) {
/* Already succeeded */
} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
bit_clear_all(node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini: FREE_NULL_LIST(node_weight_list);
bit_free(orig_node_map);
return error_code;
}
static int _topo_weight_find(void *x, void *key)
{
topo_weight_info_t *nw = (topo_weight_info_t *) x;
topo_weight_info_t *nw_key = (topo_weight_info_t *) key;
if (nw->weight == nw_key->weight)
return 1;
return 0;
}
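/* Determine if a switch's node bitmap (key) shares any nodes with the
 * topo_weight_info_t record */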
static int _topo_node_find(void *x, void *key)
{
topo_weight_info_t *nw = (topo_weight_info_t *) x;
bitstr_t *nw_key = (bitstr_t *) key;
if (bit_overlap_any(nw->node_bitmap, nw_key))
return 1;
return 0;
}
static void _topo_weight_free(void *x)
{
topo_weight_info_t *nw = (topo_weight_info_t *) x;
FREE_NULL_BITMAP(nw->node_bitmap);
xfree(nw);
}
static int _topo_weight_log(void *x, void *arg)
{
topo_weight_info_t *nw = (topo_weight_info_t *) x;
char *node_names = bitmap2node_name(nw->node_bitmap);
info("%s: Topo:%s weight:%"PRIu64, __func__, node_names, nw->weight);
xfree(node_names);
return 0;
}
static int _topo_weight_sort(void *x, void *y)
{
topo_weight_info_t *nwt1 = *(topo_weight_info_t **) x;
topo_weight_info_t *nwt2 = *(topo_weight_info_t **) y;
return (int) (nwt1->weight - nwt2->weight);
}
/*
* Allocate resources to the job on one leaf switch if possible,
* otherwise distribute the job allocation over many leaf switches.
*/
static int _eval_nodes_dfly(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass)
{
int *switch_cpu_cnt = NULL; /* total CPUs on switch */
List *switch_gres = NULL; /* available GRES on switch */
bitstr_t **switch_node_bitmap = NULL; /* nodes on this switch */
int *switch_node_cnt = NULL; /* total nodes on switch */
int *switch_required = NULL; /* set if has required node */
bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */
bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */
bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */
bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */
int i, i_first, i_last, j, rc = SLURM_SUCCESS;
int best_cpu_cnt = 0, best_node_cnt = 0, req_node_cnt = 0;
List best_gres = NULL;
switch_record_t *switch_ptr;
List node_weight_list = NULL;
topo_weight_info_t *nw = NULL;
ListIterator iter;
node_record_t *node_ptr;
uint16_t avail_cpus = 0;
int64_t rem_max_cpus;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
bool enforce_binding = false;
struct job_details *details_ptr = job_ptr->details;
bool gres_per_job, sufficient = false;
uint16_t *avail_cpu_per_node = NULL;
time_t time_waiting = 0;
int leaf_switch_count = 0;
int top_switch_inx = -1;
int prev_rem_nodes;
if (job_ptr->req_switch > 1) {
/* Maximum leaf switch count >1 probably makes no sense */
info("%s: Resetting %pJ leaf switch count from %u to 0",
__func__, job_ptr, job_ptr->req_switch);
job_ptr->req_switch = 0;
}
if (job_ptr->req_switch) {
time_t time_now;
time_now = time(NULL);
if (job_ptr->wait4switch_start == 0)
job_ptr->wait4switch_start = time_now;
time_waiting = time_now - job_ptr->wait4switch_start;
}
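	/*
	 * time_waiting is how long the job has waited for its requested
	 * leaf switch count; once it reaches wait4switch, best_switch is
	 * set below even if more leaf switches than requested are used.
	 */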
if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND))
enforce_binding = true;
rem_cpus = details_ptr->min_cpus;
rem_max_cpus = details_ptr->max_cpus;
min_rem_nodes = min_nodes;
if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
/* Validate availability of required nodes */
if (job_ptr->details->req_node_bitmap) {
if (!bit_super_set(job_ptr->details->req_node_bitmap,
node_map)) {
info("%s: %s: %pJ requires nodes which are not currently available",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap);
if (req_node_cnt == 0) {
info("%s: %s: %pJ required node list has no nodes",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
if (req_node_cnt > max_nodes) {
info("%s: %s: %pJ requires more nodes than currently available (%u>%u)",
plugin_type, __func__, job_ptr, req_node_cnt,
max_nodes);
rc = SLURM_ERROR;
goto fini;
}
req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
}
/*
* Add required nodes to job allocation and
* build list of node bitmaps, sorted by weight
*/
i_first = bit_ffs(node_map);
if (i_first == -1) {
debug("%s: %s: %pJ node_map is empty",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
i_last = bit_fls(node_map);
avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt);
node_weight_list = list_create(_topo_weight_free);
for (i = i_first; i <= i_last; i++) {
topo_weight_info_t nw_static;
if (!bit_test(node_map, i))
continue;
if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) {
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if (avail_cpus == 0) {
debug2("%s: %s: %pJ insufficient resources on required node",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
avail_cpu_per_node[i] = avail_cpus;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
}
node_ptr = node_record_table_ptr + i;
nw_static.weight = node_ptr->sched_weight;
nw = list_find_first(node_weight_list, _topo_weight_find,
&nw_static);
if (!nw) { /* New node weight to add */
nw = xmalloc(sizeof(topo_weight_info_t));
nw->node_bitmap = bit_alloc(select_node_cnt);
nw->weight = node_ptr->sched_weight;
list_append(node_weight_list, nw);
}
bit_set(nw->node_bitmap, i);
nw->node_cnt++;
}
if (req_nodes_bitmap) {
bit_and(node_map, req_nodes_bitmap);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
/* Required nodes completely satisfied the request */
rc = SLURM_SUCCESS;
goto fini;
}
if (max_nodes <= 0) {
rc = SLURM_ERROR;
info("%s: %s: %pJ requires nodes exceed maximum node limit",
plugin_type, __func__, job_ptr);
goto fini;
}
} else {
bit_clear_all(node_map);
}
list_sort(node_weight_list, _topo_weight_sort);
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
(void) list_for_each(node_weight_list, _topo_weight_log, NULL);
/*
* Identify the highest level switch to be used.
* Note that nodes can be on multiple non-overlapping switches.
*/
switch_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switch_gres = xmalloc(sizeof(List) * switch_record_cnt);
switch_node_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt);
switch_node_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switch_required = xmalloc(sizeof(int) * switch_record_cnt);
if (!req_nodes_bitmap)
nw = list_peek(node_weight_list);
for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt;
i++, switch_ptr++) {
switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap);
if (req_nodes_bitmap &&
bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) {
switch_required[i] = 1;
if (switch_record_table[i].level == 0) {
leaf_switch_count++;
}
if ((top_switch_inx == -1) ||
(switch_record_table[i].level >
switch_record_table[top_switch_inx].level)) {
top_switch_inx = i;
}
}
if (!req_nodes_bitmap &&
(list_find_first(node_weight_list, _topo_node_find,
switch_node_bitmap[i]))) {
if ((top_switch_inx == -1) ||
(switch_record_table[i].level >
switch_record_table[top_switch_inx].level)) {
top_switch_inx = i;
}
}
}
/*
* Top switch is highest level switch containing all required nodes
* OR all nodes of the lowest scheduling weight
 * OR -1 if the top-level switch can not be identified
*/
if (top_switch_inx == -1) {
error("%s: %s: %pJ unable to identify top level switch",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
	/* Check that all specifically required nodes are on shared network */
if (req_nodes_bitmap &&
!bit_super_set(req_nodes_bitmap,
switch_node_bitmap[top_switch_inx])) {
rc = SLURM_ERROR;
info("%s: %s: %pJ requires nodes that do not have shared network",
plugin_type, __func__, job_ptr);
goto fini;
}
/*
* Remove nodes from consideration that can not be reached from this
* top level switch
*/
for (i = 0; i < switch_record_cnt; i++) {
if (top_switch_inx != i) {
bit_and(switch_node_bitmap[i],
switch_node_bitmap[top_switch_inx]);
}
}
/*
* Identify the best set of nodes (i.e. nodes with the lowest weight,
* in addition to the required nodes) that can be used to satisfy the
* job request. All nodes must be on a common top-level switch. The
* logic here adds groups of nodes, all with the same weight, so we
* usually identify more nodes than required to satisfy the request.
* Later logic selects from those nodes to get the best topology.
*/
best_nodes_bitmap = bit_alloc(select_node_cnt);
iter = list_iterator_create(node_weight_list);
while (!sufficient && (nw = list_next(iter))) {
if (best_node_cnt > 0) {
/*
* All of the lower priority nodes should be included
* in the job's allocation. Nodes from the next highest
* weight nodes are included only as needed.
*/
if (req2_nodes_bitmap)
bit_or(req2_nodes_bitmap, best_nodes_bitmap);
else
req2_nodes_bitmap = bit_copy(best_nodes_bitmap);
}
i_first = bit_ffs(nw->node_bitmap);
if (i_first == -1)
continue;
i_last = bit_fls(nw->node_bitmap);
for (i = i_first; i <= i_last; i++) {
if (avail_cpu_per_node[i])
continue; /* Required node */
if (!bit_test(nw->node_bitmap, i) ||
!bit_test(switch_node_bitmap[top_switch_inx], i))
continue;
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
if (avail_cpus == 0) {
bit_clear(nw->node_bitmap, i);
continue;
}
bit_set(best_nodes_bitmap, i);
avail_cpu_per_node[i] = avail_cpus;
best_cpu_cnt += avail_cpus;
best_node_cnt++;
if (gres_per_job) {
gres_plugin_job_sched_consec(
&best_gres, job_ptr->gres_list,
avail_res_array[i]->sock_gres_list);
}
}
sufficient = (best_cpu_cnt >= rem_cpus) &&
_enough_nodes(best_node_cnt, rem_nodes,
min_nodes, req_nodes);
if (sufficient && gres_per_job) {
sufficient = gres_plugin_job_sched_sufficient(
job_ptr->gres_list, best_gres);
}
}
list_iterator_destroy(iter);
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
char *gres_str = NULL, *gres_print = "";
char *node_names;
if (req_nodes_bitmap) {
node_names = bitmap2node_name(req_nodes_bitmap);
info("%s: Required nodes:%s", __func__, node_names);
xfree(node_names);
}
node_names = bitmap2node_name(best_nodes_bitmap);
if (gres_per_job) {
gres_str = gres_plugin_job_sched_str(best_gres,
job_ptr->gres_list);
if (gres_str)
gres_print = gres_str;
}
info("%s: Best nodes:%s node_cnt:%d cpu_cnt:%d %s", __func__,
node_names, best_node_cnt, best_cpu_cnt, gres_print);
xfree(node_names);
xfree(gres_str);
}
if (!sufficient) {
info("%s: %s: insufficient resources currently available for %pJ",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
/*
* Add lowest weight nodes. Treat similar to required nodes for the job.
* Job will still need to add some higher weight nodes later.
*/
if (req2_nodes_bitmap) {
i_first = bit_ffs(req2_nodes_bitmap);
if (i_first >= 0)
i_last = bit_fls(req2_nodes_bitmap);
else
i_last = -2;
for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) {
if (!bit_test(req2_nodes_bitmap, i))
continue;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
avail_cpus = avail_cpu_per_node[i];
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
}
for (i = 0, switch_ptr = switch_record_table;
i < switch_record_cnt; i++, switch_ptr++) {
if (switch_required[i])
continue;
if (bit_overlap_any(req2_nodes_bitmap,
switch_node_bitmap[i])) {
switch_required[i] = 1;
if (switch_record_table[i].level == 0) {
leaf_switch_count++;
}
}
}
bit_or(node_map, req2_nodes_bitmap);
if (max_nodes <= 0) {
rc = SLURM_ERROR;
info("%s: %s: %pJ reached maximum node limit",
plugin_type, __func__, job_ptr);
goto fini;
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id))) {
/* Required nodes completely satisfied the request */
error("%s: Scheduling anomaly for %pJ",
__func__, job_ptr);
rc = SLURM_SUCCESS;
goto fini;
}
}
/*
* Construct a set of switch array entries.
* Use the same indexes as switch_record_table in slurmctld.
*/
bit_or(best_nodes_bitmap, node_map);
avail_nodes_bitmap = bit_alloc(node_record_count);
for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt;
i++, switch_ptr++) {
bit_and(switch_node_bitmap[i], best_nodes_bitmap);
bit_or(avail_nodes_bitmap, switch_node_bitmap[i]);
switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]);
}
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
for (i = 0; i < switch_record_cnt; i++) {
char *node_names = NULL;
if (switch_node_cnt[i]) {
node_names =
bitmap2node_name(switch_node_bitmap[i]);
}
info("switch=%s level=%d nodes=%u:%s required:%u speed:%u",
switch_record_table[i].name,
switch_record_table[i].level,
switch_node_cnt[i], node_names,
switch_required[i],
switch_record_table[i].link_speed);
xfree(node_names);
}
}
/* count up leaf switches */
if (!req_nodes_bitmap) {
for (i = 0, switch_ptr = switch_record_table;
i < switch_record_cnt; i++, switch_ptr++) {
if (switch_record_table[i].level != 0)
continue;
if (bit_overlap_any(switch_node_bitmap[i],
best_nodes_bitmap))
leaf_switch_count++;
}
}
if (req_nodes_bitmap &&
(!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) {
info("%s: %s: %pJ requires nodes not available on any switch",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
/*
* If no resources have yet been selected,
* then pick one leaf switch with the most available nodes.
*/
if (leaf_switch_count == 0) {
int best_switch_inx = -1;
for (i = 0; i < switch_record_cnt; i++) {
if (switch_record_table[i].level != 0)
continue;
if ((best_switch_inx == -1) ||
(switch_node_cnt[i] >
switch_node_cnt[best_switch_inx]))
best_switch_inx = i;
}
if (best_switch_inx != -1) {
leaf_switch_count = 1;
switch_required[best_switch_inx] = 1;
}
}
/*
* All required resources currently on one leaf switch. Determine if
* the entire job request can be satisfied using just that one switch.
*/
if (leaf_switch_count == 1) {
best_cpu_cnt = 0;
best_node_cnt = 0;
FREE_NULL_LIST(best_gres);
for (i = 0; i < switch_record_cnt; i++) {
if (!switch_required[i] || !switch_node_bitmap[i] ||
(switch_record_table[i].level != 0))
continue;
i_first = bit_ffs(switch_node_bitmap[i]);
if (i_first >= 0)
i_last = bit_fls(switch_node_bitmap[i]);
else
i_last = -2;
for (j = i_first; j <= i_last; j++) {
if (!bit_test(switch_node_bitmap[i], j) ||
bit_test(node_map, j) ||
!avail_cpu_per_node[j])
continue;
avail_cpus = avail_cpu_per_node[j];
best_cpu_cnt += avail_cpus;
best_node_cnt++;
if (gres_per_job) {
gres_plugin_job_sched_consec(
&best_gres, job_ptr->gres_list,
avail_res_array[j]->sock_gres_list);
}
}
break;
}
sufficient = (best_cpu_cnt >= rem_cpus) &&
_enough_nodes(best_node_cnt, rem_nodes,
min_nodes, req_nodes);
if (sufficient && gres_per_job) {
sufficient = gres_plugin_job_sched_sufficient(
job_ptr->gres_list, best_gres);
}
if (sufficient && (i < switch_record_cnt)) {
/* Complete request using this one leaf switch */
for (j = i_first; j <= i_last; j++) {
if (!bit_test(switch_node_bitmap[i], j) ||
bit_test(node_map, j) ||
!avail_cpu_per_node[j])
continue;
avail_cpus = avail_cpu_per_node[j];
rem_nodes--;
min_rem_nodes--;
max_nodes--;
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[j]->
sock_gres_list,
avail_cpus);
}
bit_set(node_map, j);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(
job_ptr->gres_list,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
if (max_nodes <= 0) {
rc = SLURM_ERROR;
info("%s: %s: %pJ reached maximum node limit",
plugin_type, __func__, job_ptr);
goto fini;
}
}
}
}
if (job_ptr->req_switch > 0) {
if (time_waiting >= job_ptr->wait4switch) {
job_ptr->best_switch = true;
debug3("%pJ waited %ld sec for switches use=%d",
job_ptr, time_waiting, leaf_switch_count);
} else if (leaf_switch_count > job_ptr->req_switch) {
/*
* Allocation is for more than requested number of
* switches.
*/
job_ptr->best_switch = false;
debug3("%pJ waited %ld sec for switches=%u found=%d wait %u",
job_ptr, time_waiting, job_ptr->req_switch,
leaf_switch_count, job_ptr->wait4switch);
} else {
job_ptr->best_switch = true;
}
}
/*
* Add additional resources as required from additional leaf switches
* on a round-robin basis
*/
prev_rem_nodes = rem_nodes + 1;
while (1) {
if (prev_rem_nodes == rem_nodes)
break; /* Stalled */
prev_rem_nodes = rem_nodes;
for (i = 0; i < switch_record_cnt; i++) {
if (!switch_node_bitmap[i] ||
(switch_record_table[i].level != 0))
continue;
i_first = bit_ffs(switch_node_bitmap[i]);
if (i_first >= 0)
i_last = bit_fls(switch_node_bitmap[i]);
else
i_last = -2;
for (j = i_first; j <= i_last; j++) {
if (!bit_test(switch_node_bitmap[i], j) ||
bit_test(node_map, j) ||
!avail_cpu_per_node[j])
continue;
avail_cpus = avail_cpu_per_node[j];
rem_nodes--;
min_rem_nodes--;
max_nodes--;
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[j]->
sock_gres_list,
avail_cpus);
}
bit_set(node_map, j);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(
job_ptr->gres_list,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
if (max_nodes <= 0) {
rc = SLURM_ERROR;
info("%s: %s: %pJ reached maximum node limit",
plugin_type, __func__, job_ptr);
goto fini;
}
break; /* Move to next switch */
}
}
}
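/*
 * The final test below uses min_rem_nodes rather than rem_nodes: an
 * allocation that satisfies the job's minimum node count is accepted
 * even if the larger requested node target was not reached.
 */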
if ((min_rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
rc = SLURM_ERROR;
fini: FREE_NULL_LIST(best_gres);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(avail_nodes_bitmap);
FREE_NULL_BITMAP(req_nodes_bitmap);
FREE_NULL_BITMAP(req2_nodes_bitmap);
FREE_NULL_BITMAP(best_nodes_bitmap);
xfree(avail_cpu_per_node);
xfree(switch_cpu_cnt);
xfree(switch_gres);
if (switch_node_bitmap) {
for (i = 0; i < switch_record_cnt; i++)
FREE_NULL_BITMAP(switch_node_bitmap[i]);
xfree(switch_node_bitmap);
}
xfree(switch_node_cnt);
xfree(switch_required);
return rc;
}
/* Allocate resources to job using a minimal leaf switch count */
static int _eval_nodes_topo(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass)
{
int *switch_cpu_cnt = NULL; /* total CPUs on switch */
List *switch_gres = NULL; /* available GRES on switch */
bitstr_t **switch_node_bitmap = NULL; /* nodes on this switch */
int *switch_node_cnt = NULL; /* total nodes on switch */
int *switch_required = NULL; /* set if has required node */
bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */
bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */
bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */
bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */
int i, i_first, i_last, j, rc = SLURM_SUCCESS;
int best_cpu_cnt = 0, best_node_cnt = 0, req_node_cnt = 0;
List best_gres = NULL;
switch_record_t *switch_ptr;
List node_weight_list = NULL;
topo_weight_info_t *nw = NULL;
ListIterator iter;
node_record_t *node_ptr;
uint16_t avail_cpus = 0;
int64_t rem_max_cpus;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
bool enforce_binding = false;
struct job_details *details_ptr = job_ptr->details;
bool gres_per_job, sufficient = false;
uint16_t *avail_cpu_per_node = NULL;
time_t time_waiting = 0;
int leaf_switch_count = 0;
int top_switch_inx = -1;
int prev_rem_nodes;
if (job_ptr->req_switch) {
time_t time_now;
time_now = time(NULL);
if (job_ptr->wait4switch_start == 0)
job_ptr->wait4switch_start = time_now;
time_waiting = time_now - job_ptr->wait4switch_start;
}
if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND))
enforce_binding = true;
rem_cpus = details_ptr->min_cpus;
rem_max_cpus = details_ptr->max_cpus;
min_rem_nodes = min_nodes;
if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
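/*
 * When the job has per-job GRES, start from the smaller node count and
 * let nodes be added as needed to satisfy the GRES request; otherwise
 * target the larger of the minimum and requested node counts directly.
 */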
/* Validate availability of required nodes */
if (job_ptr->details->req_node_bitmap) {
if (!bit_super_set(job_ptr->details->req_node_bitmap,
node_map)) {
info("%s: %s: %pJ requires nodes which are not currently available",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap);
if (req_node_cnt == 0) {
info("%s: %s: %pJ required node list has no nodes",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
if (req_node_cnt > max_nodes) {
info("%s: %s: %pJ requires more nodes than currently available (%u>%u)",
plugin_type, __func__, job_ptr, req_node_cnt,
max_nodes);
rc = SLURM_ERROR;
goto fini;
}
req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
}
/*
* Add required nodes to job allocation and
* build list of node bitmaps, sorted by weight
*/
i_first = bit_ffs(node_map);
if (i_first == -1) {
debug("%s: %s: %pJ node_map is empty",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
i_last = bit_fls(node_map);
avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt);
node_weight_list = list_create(_topo_weight_free);
for (i = i_first; i <= i_last; i++) {
topo_weight_info_t nw_static;
if (!bit_test(node_map, i))
continue;
if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) {
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if (avail_cpus == 0) {
debug2("%s: %s: %pJ insufficient resources on required node",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
avail_cpu_per_node[i] = avail_cpus;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
}
node_ptr = node_record_table_ptr + i;
nw_static.weight = node_ptr->sched_weight;
nw = list_find_first(node_weight_list, _topo_weight_find,
&nw_static);
if (!nw) { /* New node weight to add */
nw = xmalloc(sizeof(topo_weight_info_t));
nw->node_bitmap = bit_alloc(select_node_cnt);
nw->weight = node_ptr->sched_weight;
list_append(node_weight_list, nw);
}
bit_set(nw->node_bitmap, i);
nw->node_cnt++;
}
list_sort(node_weight_list, _topo_weight_sort);
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
(void) list_for_each(node_weight_list, _topo_weight_log, NULL);
/*
* Identify the highest level switch to be used.
* Note that nodes can be on multiple non-overlapping switches.
*/
switch_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switch_gres = xmalloc(sizeof(List) * switch_record_cnt);
switch_node_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt);
switch_node_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switch_required = xmalloc(sizeof(int) * switch_record_cnt);
for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt;
i++, switch_ptr++) {
switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap);
bit_and(switch_node_bitmap[i], node_map);
switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]);
if (req_nodes_bitmap &&
bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) {
switch_required[i] = 1;
if (switch_record_table[i].level == 0) {
leaf_switch_count++;
}
if ((top_switch_inx == -1) ||
(switch_record_table[i].level >
switch_record_table[top_switch_inx].level)) {
top_switch_inx = i;
}
}
if (!_enough_nodes(switch_node_cnt[i], rem_nodes,
min_nodes, req_nodes))
continue;
if (!req_nodes_bitmap &&
(list_find_first(node_weight_list, _topo_node_find,
switch_node_bitmap[i]))) {
if ((top_switch_inx == -1) ||
(switch_record_table[i].level >
switch_record_table[top_switch_inx].level)) {
top_switch_inx = i;
}
}
}
if (!req_nodes_bitmap) {
bit_clear_all(node_map);
}
/*
* The top switch is the highest-level switch containing all required nodes
* OR all nodes of the lowest scheduling weight,
* OR -1 if no top-level switch can be identified, which may be due to a
* disjoint topology with available nodes living on different switches.
*/
if (top_switch_inx == -1) {
log_flag(SELECT_TYPE, "%s: %s: %pJ unable to identify top level switch",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
/* Check that all specifically required nodes are on a shared network */
if (req_nodes_bitmap &&
!bit_super_set(req_nodes_bitmap,
switch_node_bitmap[top_switch_inx])) {
rc = SLURM_ERROR;
info("%s: %s: %pJ requires nodes that do not have shared network",
plugin_type, __func__, job_ptr);
goto fini;
}
/*
* Remove nodes from consideration that cannot be reached from this
* top-level switch.
*/
for (i = 0; i < switch_record_cnt; i++) {
if (top_switch_inx != i) {
bit_and(switch_node_bitmap[i],
switch_node_bitmap[top_switch_inx]);
}
}
if (req_nodes_bitmap) {
bit_and(node_map, req_nodes_bitmap);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
/* Required nodes completely satisfied the request */
rc = SLURM_SUCCESS;
goto fini;
}
if (max_nodes <= 0) {
rc = SLURM_ERROR;
info("%s: %s: %pJ requires nodes exceed maximum node limit",
plugin_type, __func__, job_ptr);
goto fini;
}
}
/*
* Identify the best set of nodes (i.e. nodes with the lowest weight,
* in addition to the required nodes) that can be used to satisfy the
* job request. All nodes must be on a common top-level switch. The
* logic here adds groups of nodes, all with the same weight, so we
* usually identify more nodes than required to satisfy the request.
* Later logic selects from those nodes to get the best topology.
*/
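/*
 * Worked example with hypothetical numbers: given weight groups
 * {w=10: 3 usable nodes, w=20: 4 usable nodes} and a 4-node request,
 * the w=10 group alone is insufficient, so the w=20 group is scanned
 * as well and the w=10 nodes are promoted into req2_nodes_bitmap to
 * be treated like required nodes below.
 */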
best_nodes_bitmap = bit_alloc(select_node_cnt);
iter = list_iterator_create(node_weight_list);
while (!sufficient && (nw = list_next(iter))) {
if (best_node_cnt > 0) {
/*
* All of the lower priority nodes should be included
* in the job's allocation. Nodes from the next highest
* weight nodes are included only as needed.
*/
if (req2_nodes_bitmap)
bit_or(req2_nodes_bitmap, best_nodes_bitmap);
else
req2_nodes_bitmap = bit_copy(best_nodes_bitmap);
}
i_first = bit_ffs(nw->node_bitmap);
if (i_first == -1)
continue;
i_last = bit_fls(nw->node_bitmap);
for (i = i_first; i <= i_last; i++) {
if (avail_cpu_per_node[i])
continue; /* Required node */
if (!bit_test(nw->node_bitmap, i) ||
!bit_test(switch_node_bitmap[top_switch_inx], i))
continue;
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
if (avail_cpus == 0) {
bit_clear(nw->node_bitmap, i);
continue;
}
bit_set(best_nodes_bitmap, i);
avail_cpu_per_node[i] = avail_cpus;
best_cpu_cnt += avail_cpus;
best_node_cnt++;
if (gres_per_job) {
gres_plugin_job_sched_consec(
&best_gres, job_ptr->gres_list,
avail_res_array[i]->sock_gres_list);
}
}
sufficient = (best_cpu_cnt >= rem_cpus) &&
_enough_nodes(best_node_cnt, rem_nodes,
min_nodes, req_nodes);
if (sufficient && gres_per_job) {
sufficient = gres_plugin_job_sched_sufficient(
job_ptr->gres_list, best_gres);
}
}
list_iterator_destroy(iter);
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
char *gres_str = NULL, *gres_print = "";
char *node_names;
if (req_nodes_bitmap) {
node_names = bitmap2node_name(req_nodes_bitmap);
info("%s: Required nodes:%s", __func__, node_names);
xfree(node_names);
}
node_names = bitmap2node_name(best_nodes_bitmap);
if (gres_per_job) {
gres_str = gres_plugin_job_sched_str(best_gres,
job_ptr->gres_list);
if (gres_str)
gres_print = gres_str;
}
info("%s: Best nodes:%s node_cnt:%d cpu_cnt:%d %s", __func__,
node_names, best_node_cnt, best_cpu_cnt, gres_print);
xfree(node_names);
xfree(gres_str);
}
if (!sufficient) {
info("%s: %s: insufficient resources currently available for %pJ",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
/*
* Add the lowest-weight nodes and treat them like required nodes for the
* job. The job may still need to add some higher-weight nodes later.
*/
if (req2_nodes_bitmap) {
i_first = bit_ffs(req2_nodes_bitmap);
if (i_first >= 0)
i_last = bit_fls(req2_nodes_bitmap);
else
i_last = -2;
for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) {
if (!bit_test(req2_nodes_bitmap, i))
continue;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
avail_cpus = avail_cpu_per_node[i];
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
}
for (i = 0, switch_ptr = switch_record_table;
i < switch_record_cnt; i++, switch_ptr++) {
if (switch_required[i])
continue;
if (bit_overlap_any(req2_nodes_bitmap,
switch_node_bitmap[i])) {
switch_required[i] = 1;
if (switch_record_table[i].level == 0) {
leaf_switch_count++;
}
}
}
bit_or(node_map, req2_nodes_bitmap);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id))) {
/* Required nodes completely satisfied the request */
error("%s: Scheduling anomaly for %pJ",
__func__, job_ptr);
rc = SLURM_SUCCESS;
goto fini;
}
if (max_nodes <= 0) {
rc = SLURM_ERROR;
info("%s: %s: %pJ reached maximum node limit",
plugin_type, __func__, job_ptr);
goto fini;
}
}
/*
* Construct a set of switch array entries.
* Use the same indexes as switch_record_table in slurmctld.
*/
bit_or(best_nodes_bitmap, node_map);
avail_nodes_bitmap = bit_alloc(node_record_count);
for (i = 0, switch_ptr = switch_record_table; i < switch_record_cnt;
i++, switch_ptr++) {
bit_and(switch_node_bitmap[i], best_nodes_bitmap);
bit_or(avail_nodes_bitmap, switch_node_bitmap[i]);
switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]);
}
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
for (i = 0; i < switch_record_cnt; i++) {
char *node_names = NULL;
if (switch_node_cnt[i]) {
node_names =
bitmap2node_name(switch_node_bitmap[i]);
}
info("switch=%s level=%d nodes=%u:%s required:%u speed:%u",
switch_record_table[i].name,
switch_record_table[i].level,
switch_node_cnt[i], node_names,
switch_required[i],
switch_record_table[i].link_speed);
xfree(node_names);
}
}
/* Count up leaf switches. */
if (!req_nodes_bitmap) {
for (i = 0, switch_ptr = switch_record_table;
i < switch_record_cnt; i++, switch_ptr++) {
if (switch_record_table[i].level != 0)
continue;
if (bit_overlap_any(switch_node_bitmap[i],
best_nodes_bitmap))
leaf_switch_count++;
}
}
if (req_nodes_bitmap &&
(!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) {
info("%s: %s: %pJ requires nodes not available on any switch",
plugin_type, __func__, job_ptr);
rc = SLURM_ERROR;
goto fini;
}
/* Add additional resources from leaf switches that are already required */
if (leaf_switch_count) {
for (i = 0; i < switch_record_cnt; i++) {
if (!switch_required[i] || !switch_node_bitmap[i] ||
(switch_record_table[i].level != 0))
continue;
i_first = bit_ffs(switch_node_bitmap[i]);
if (i_first >= 0)
i_last = bit_fls(switch_node_bitmap[i]);
else
i_last = -2;
for (j = i_first; j <= i_last; j++) {
if (!bit_test(switch_node_bitmap[i], j) ||
bit_test(node_map, j) ||
!avail_cpu_per_node[j])
continue;
avail_cpus = avail_cpu_per_node[j];
rem_nodes--;
min_rem_nodes--;
max_nodes--;
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[j]->
sock_gres_list,
avail_cpus);
}
bit_set(node_map, j);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(
job_ptr->gres_list,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
}
}
}
if (job_ptr->req_switch > 0) {
if (time_waiting >= job_ptr->wait4switch) {
job_ptr->best_switch = true;
debug3("%pJ waited %ld sec for switches use=%d",
job_ptr, time_waiting, leaf_switch_count);
} else if (leaf_switch_count > job_ptr->req_switch) {
/*
* Allocation is for more than requested number of
* switches.
*/
job_ptr->best_switch = false;
debug3("%pJ waited %ld sec for switches=%u found=%d wait %u",
job_ptr, time_waiting, job_ptr->req_switch,
leaf_switch_count, job_ptr->wait4switch);
} else {
job_ptr->best_switch = true;
}
}
/* Add additional resources as required from additional leaf switches */
prev_rem_nodes = rem_nodes + 1;
while (1) {
if (prev_rem_nodes == rem_nodes)
break; /* Stalled */
prev_rem_nodes = rem_nodes;
top_switch_inx = -1;
for (i = 0; i < switch_record_cnt; i++) {
if (switch_required[i] || !switch_node_bitmap[i] ||
(switch_record_table[i].level != 0))
continue;
if (switch_node_cnt[i] &&
((top_switch_inx == -1) ||
(switch_node_cnt[i] >
switch_node_cnt[top_switch_inx])))
top_switch_inx = i;
}
if (top_switch_inx == -1)
break;
/*
* NOTE: Ideally we would add nodes in order of resource
* availability rather than in order of bitmap position, but
* that would add even more complexity and overhead.
*/
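/*
 * Each pass below picks the leaf switch with the most remaining usable
 * nodes, consumes its nodes in bitmap order as needed, then zeroes its
 * count so the next pass moves on. With hypothetical leaf counts
 * {A:5, B:3}, switch A is drained before any node of B is considered.
 */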
i_first = bit_ffs(switch_node_bitmap[top_switch_inx]);
if (i_first >= 0)
i_last = bit_fls(switch_node_bitmap[top_switch_inx]);
else
i_last = -2;
for (i = i_first; ((i <= i_last) && (max_nodes > 0)); i++) {
if (!bit_test(switch_node_bitmap[top_switch_inx], i) ||
bit_test(node_map, i) ||
!avail_cpu_per_node[i])
continue;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
avail_cpus = avail_cpu_per_node[i];
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
bit_set(node_map, i);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
}
switch_node_cnt[top_switch_inx] = 0; /* Used all */
}
if ((min_rem_nodes <= 0) && (rem_cpus <= 0) &&
(!gres_per_job ||
gres_plugin_job_sched_test(job_ptr->gres_list, job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
rc = SLURM_ERROR;
fini: FREE_NULL_LIST(best_gres);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(avail_nodes_bitmap);
FREE_NULL_BITMAP(req_nodes_bitmap);
FREE_NULL_BITMAP(req2_nodes_bitmap);
FREE_NULL_BITMAP(best_nodes_bitmap);
xfree(avail_cpu_per_node);
xfree(switch_cpu_cnt);
xfree(switch_gres);
if (switch_node_bitmap) {
for (i = 0; i < switch_record_cnt; i++)
FREE_NULL_BITMAP(switch_node_bitmap[i]);
xfree(switch_node_bitmap);
}
xfree(switch_node_cnt);
xfree(switch_required);
return rc;
}
static int _eval_nodes_lln(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
int64_t rem_max_cpus;
struct job_details *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
bitstr_t *orig_node_map = bit_copy(node_map);
bool all_done = false, gres_per_job;
uint16_t avail_cpus = 0;
node_record_t *node_ptr;
List node_weight_list = NULL;
node_weight_type *nwt;
ListIterator iter;
uint16_t *avail_cpu_per_node = NULL;
bool enforce_binding = false;
if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND))
enforce_binding = true;
rem_cpus = details_ptr->min_cpus;
rem_max_cpus = details_ptr->max_cpus;
min_rem_nodes = min_nodes;
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
max_nodes = MIN(max_nodes, details_ptr->num_tasks);
if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
i_start = bit_ffs(node_map);
if (i_start >= 0)
i_end = bit_fls(node_map);
else
i_end = i_start - 1;
if (req_map) {
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(node_map, i);
continue;
}
node_ptr = node_record_table_ptr + i;
if (!bit_test(node_map, i)) {
debug("%pJ required node %s not available",
job_ptr, node_ptr->name);
goto fini;
}
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if ((avail_cpus > 0) && (max_nodes > 0)) {
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
/* leaving bitmap set, decr max limit */
max_nodes--;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list, avail_cpus);
}
} else { /* node not selected (yet) */
debug("%pJ required node %s not available",
job_ptr, node_ptr->name);
goto fini;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(node_map, req_map);
goto fini;
}
if (max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
bit_and_not(orig_node_map, node_map);
} else {
bit_clear_all(node_map);
}
/* Check that CPUs already allocated to required nodes fit the job limit */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
/*
* Accumulate nodes, starting with those having the highest available
* CPU count. The logic is optimized for small node/CPU count
* allocations; for larger allocations, list_sort() would be more efficient.
*/
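/*
 * A minimal sketch of the list_sort() alternative mentioned above,
 * assuming Slurm's comparator convention of receiving pointers to the
 * stored pointers; node_cpus_t and _cmp_cpus_desc() are hypothetical
 * and not part of this file:
 *
 *	typedef struct { int node_inx; uint16_t avail_cpus; } node_cpus_t;
 *	static int _cmp_cpus_desc(void *x, void *y)
 *	{
 *		node_cpus_t *a = *(node_cpus_t **) x;
 *		node_cpus_t *b = *(node_cpus_t **) y;
 *		return (int) b->avail_cpus - (int) a->avail_cpus;
 *	}
 *
 * After list_sort(node_cpus_list, _cmp_cpus_desc), nodes would be taken
 * from the front of the list instead of rescanning for the maximum.
 */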
if (max_nodes == 0)
all_done = true;
avail_cpu_per_node = xmalloc(sizeof(uint16_t) * select_node_cnt);
node_weight_list = _build_node_weight_list(orig_node_map);
iter = list_iterator_create(node_weight_list);
while (!all_done && (nwt = (node_weight_type *) list_next(iter))) {
int last_max_cpu_cnt = -1;
while (!all_done) {
int max_cpu_idx = -1;
for (i = i_start; i <= i_end; i++) {
/* Node not available or already selected */
if (!bit_test(nwt->node_bitmap, i) ||
bit_test(node_map, i))
continue;
_select_cores(job_ptr, mc_ptr, enforce_binding,
i, &avail_cpus, max_nodes,
min_rem_nodes, avail_core,
avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus,
min_rem_nodes, details_ptr,
avail_res_array[i], i, cr_type);
if (avail_cpus == 0)
continue;
avail_cpu_per_node[i] = avail_cpus;
if ((max_cpu_idx == -1) ||
(avail_cpu_per_node[max_cpu_idx] <
avail_cpu_per_node[i])) {
max_cpu_idx = i;
if (avail_cpu_per_node[max_cpu_idx] ==
last_max_cpu_cnt)
break;
}
}
if ((max_cpu_idx == -1) ||
(avail_cpu_per_node[max_cpu_idx] == 0)) {
/* No more usable nodes left, get next weight */
break;
}
i = max_cpu_idx;
avail_cpus = avail_cpu_per_node[i];
last_max_cpu_cnt = avail_cpus;
total_cpus += avail_cpus;
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
plugin_type, __func__, job_ptr, i);
bit_clear(nwt->node_bitmap, i);
total_cpus -= avail_cpus;
continue;
}
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
bit_set(node_map, i);
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
all_done = true;
break;
}
if (max_nodes == 0) {
all_done = true;
break;
}
}
}
list_iterator_destroy(iter);
if (error_code == SLURM_SUCCESS) {
/* Already succeeded */
} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
bit_clear_all(node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini: FREE_NULL_LIST(node_weight_list);
bit_free(orig_node_map);
xfree(avail_cpu_per_node);
return error_code;
}
/*
* A variation of _eval_nodes() to select resources at the end of the node
* list to reduce fragmentation
*/
static int _eval_nodes_serial(job_record_t *job_ptr,
gres_mc_data_t *mc_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, bool first_pass)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
int64_t rem_max_cpus;
struct job_details *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
bitstr_t *orig_node_map = bit_copy(node_map);
bool all_done = false, gres_per_job;
uint16_t avail_cpus = 0;
node_record_t *node_ptr;
List node_weight_list = NULL;
node_weight_type *nwt;
ListIterator iter;
bool enforce_binding = false;
if (job_ptr->gres_list && (job_ptr->bit_flags & GRES_ENFORCE_BIND))
enforce_binding = true;
rem_cpus = details_ptr->min_cpus;
rem_max_cpus = details_ptr->max_cpus;
min_rem_nodes = min_nodes;
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
max_nodes = MIN(max_nodes, details_ptr->num_tasks);
if ((gres_per_job = gres_plugin_job_sched_init(job_ptr->gres_list)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
i_start = bit_ffs(node_map);
if (i_start >= 0)
i_end = bit_fls(node_map);
else
i_end = i_start - 1;
if (req_map) {
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(node_map, i);
continue;
}
node_ptr = node_record_table_ptr + i;
if (!bit_test(node_map, i)) {
debug("%pJ required node %s not available",
job_ptr, node_ptr->name);
goto fini;
}
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus, min_rem_nodes,
details_ptr, avail_res_array[i], i,
cr_type);
if ((avail_cpus > 0) && (max_nodes > 0)) {
total_cpus += avail_cpus;
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
/* leaving bitmap set, decr max limit */
max_nodes--;
if (gres_per_job) {
gres_plugin_job_sched_add(
job_ptr->gres_list,
avail_res_array[i]->
sock_gres_list, avail_cpus);
}
} else { /* node not selected (yet) */
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(node_map, req_map);
goto fini;
}
if (max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
bit_and_not(orig_node_map, node_map);
} else {
bit_clear_all(node_map);
}
/* Check that CPUs already allocated to required nodes fit the job limit */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
if (max_nodes == 0)
all_done = true;
node_weight_list = _build_node_weight_list(orig_node_map);
iter = list_iterator_create(node_weight_list);
while (!all_done && (nwt = (node_weight_type *) list_next(iter))) {
for (i = i_end; ((i >= i_start) && (max_nodes > 0)); i--) {
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus)
continue;
/* Node not available or already selected */
if (!bit_test(nwt->node_bitmap, i) ||
bit_test(node_map, i))
continue;
_select_cores(job_ptr, mc_ptr, enforce_binding, i,
&avail_cpus, max_nodes, min_rem_nodes,
avail_core, avail_res_array, first_pass);
_cpus_to_use(&avail_cpus, rem_max_cpus,
min_rem_nodes, details_ptr,
avail_res_array[i], i, cr_type);
if (avail_cpus == 0)
continue;
total_cpus += avail_cpus;
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
debug2("%s: %s: %pJ can't use node %d without exceeding job limit",
plugin_type, __func__, job_ptr, i);
total_cpus -= avail_cpus;
continue;
}
rem_cpus -= avail_cpus;
rem_max_cpus -= avail_cpus;
rem_nodes--;
min_rem_nodes--;
max_nodes--;
bit_set(node_map, i);
if (gres_per_job) {
gres_plugin_job_sched_add(job_ptr->gres_list,
avail_res_array[i]->sock_gres_list,
avail_cpus);
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
all_done = true;
break;
}
if (max_nodes == 0) {
all_done = true;
break;
}
}
}
list_iterator_destroy(iter);
if (error_code == SLURM_SUCCESS) {
/* Already succeeded */
} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_plugin_job_sched_test(job_ptr->gres_list,
job_ptr->job_id)) {
bit_clear_all(node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini: FREE_NULL_LIST(node_weight_list);
bit_free(orig_node_map);
return error_code;
}
/*
* This is an intermediary step between _select_nodes() and _eval_nodes()
* to tackle the knapsack problem. This code incrementally removes nodes
* with low CPU counts for the job and re-evaluates each result.
*
* RET SLURM_SUCCESS or an error code
*/
extern int choose_nodes(job_record_t *job_ptr, bitstr_t *node_map,
bitstr_t **avail_core, uint32_t min_nodes,
uint32_t max_nodes, uint32_t req_nodes,
avail_res_t **avail_res_array, uint16_t cr_type,
bool prefer_alloc_nodes, gres_mc_data_t *tres_mc_ptr)
{
int i, i_first, i_last;
int count, ec, most_res = 0, rem_nodes, node_cnt = 0;
bitstr_t *orig_node_map, *req_node_map = NULL;
bitstr_t **orig_core_array;
if (job_ptr->details->req_node_bitmap)
req_node_map = job_ptr->details->req_node_bitmap;
/* clear nodes from the bitmap that don't have available resources */
i_first = bit_ffs(node_map);
if (i_first >= 0)
i_last = bit_fls(node_map);
else
i_last = i_first - 1;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(node_map, i))
continue;
/*
* Make sure we don't claim we can use a node exclusively
* when its CPU count exceeds our whole-job maximum CPU count.
*/
if (((job_ptr->details->whole_node == 1) &&
(job_ptr->details->max_cpus != NO_VAL) &&
(job_ptr->details->max_cpus <
avail_res_array[i]->avail_cpus)) ||
/* OR node has no CPUs */
(avail_res_array[i]->avail_cpus < 1)) {
if (req_node_map && bit_test(req_node_map, i)) {
/* can't clear a required node! */
return SLURM_ERROR;
}
bit_clear(node_map, i);
} else {
node_cnt++;
}
}
if ((job_ptr->details->num_tasks > 1) &&
(max_nodes > job_ptr->details->num_tasks))
max_nodes = MAX(job_ptr->details->num_tasks, min_nodes);
/*
* _eval_nodes() might need to be called more than once and is
* destructive of node_map and avail_core. Copy those bitmaps.
*/
orig_node_map = bit_copy(node_map);
orig_core_array = copy_core_array(avail_core);
ec = _eval_nodes(job_ptr, tres_mc_ptr, node_map, avail_core, min_nodes,
max_nodes, req_nodes, avail_res_array, cr_type,
prefer_alloc_nodes, true);
if (ec == SLURM_SUCCESS)
goto fini;
bit_or(node_map, orig_node_map);
core_array_or(avail_core, orig_core_array);
rem_nodes = bit_set_count(node_map);
if (rem_nodes <= min_nodes) {
/* Cannot remove any nodes; enable use of non-local GRES */
ec = _eval_nodes(job_ptr, tres_mc_ptr, node_map, avail_core,
min_nodes, max_nodes, req_nodes,
avail_res_array, cr_type, prefer_alloc_nodes,
false);
goto fini;
}
/*
* This node set didn't work. To avoid a possible knapsack problem,
* incrementally remove the nodes with the lowest resource counts (sum of
* CPU and GPU counts if using GPUs, otherwise the CPU count) and retry.
*/
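/*
 * Hypothetical example: with avail_res_cnt per node of {1, 2, 8, 8}
 * and min_nodes = 2, the count = 1 pass drops the 1-resource node and
 * re-evaluates; count = 2 additionally drops the 2-resource node. The
 * loop stops on success or once only min_nodes nodes remain.
 */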
for (i = 0; i < select_node_cnt; i++) {
if (avail_res_array[i]) {
most_res = MAX(most_res,
avail_res_array[i]->avail_res_cnt);
}
}
for (count = 1; count < most_res; count++) {
int nochange = 1;
bit_or(node_map, orig_node_map);
core_array_or(avail_core, orig_core_array);
for (i = i_first; i <= i_last; i++) {
if (!bit_test(node_map, i))
continue;
if ((avail_res_array[i]->avail_res_cnt > 0) &&
(avail_res_array[i]->avail_res_cnt <= count)) {
if (req_node_map && bit_test(req_node_map, i))
continue;
nochange = 0;
bit_clear(node_map, i);
bit_clear(orig_node_map, i);
if (--rem_nodes <= min_nodes)
break;
}
}
if (nochange && (count != 1))
continue;
ec = _eval_nodes(job_ptr, tres_mc_ptr, node_map, avail_core,
min_nodes, max_nodes, req_nodes,
avail_res_array, cr_type, prefer_alloc_nodes,
false);
if (ec == SLURM_SUCCESS)
break;
if (rem_nodes <= min_nodes)
break;
}
fini: if ((ec == SLURM_SUCCESS) && job_ptr->gres_list && orig_core_array) {
/*
* Update available CPU count for any removed cores.
* Cores are only removed for jobs with GRES to enforce binding.
*/
for (i = i_first; i <= i_last; i++) {
if (!bit_test(node_map, i) ||
!orig_core_array[i] || !avail_core[i])
continue;
count = bit_set_count(avail_core[i]);
count *= select_node_record[i].vpus;
avail_res_array[i]->avail_cpus =
MIN(count, avail_res_array[i]->avail_cpus);
if (avail_res_array[i]->avail_cpus == 0) {
error("%s: %s: avail_cpus underflow for %pJ",
plugin_type, __func__, job_ptr);
if (req_node_map && bit_test(req_node_map, i)) {
/* can't clear a required node! */
ec = SLURM_ERROR;
}
bit_clear(node_map, i);
}
}
}
FREE_NULL_BITMAP(orig_node_map);
free_core_array(&orig_core_array);
return ec;
}
/*
* can_job_run_on_node - Given the job requirements, determine which
* resources from the given node (if any) can be
* allocated to this job. Returns a structure identifying
* the resources available for allocation to this job.
* NOTE: This process does NOT support overcommitting resources
*
* IN job_ptr - pointer to job requirements
* IN/OUT core_map - per-node bitmap of available cores
* IN node_i - index of node to be evaluated
* IN s_p_n - Expected sockets_per_node (NO_VAL if not limited)
* IN node_usage - per-node allocated resource usage (memory, GRES, state)
* IN cr_type - Consumable Resource setting
* IN test_only - Determine if job could ever run, ignore allocated memory
* check
* IN will_run - Determining when a pending job can start
* IN part_core_map - per-node bitmap of cores allocated to jobs of this
* partition or NULL if don't care
* RET Available resources. Call common_free_avail_res() to release memory.
*
* NOTE: The returned cpu_count may be less than the number of set bits in
* core_map for the given node. The cr_dist functions will determine
* which bits to de-select from the core_map to match the cpu_count.
*/
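/*
 * Typical call pattern (a sketch; the surrounding loop and variable
 * names are illustrative, not taken from this file):
 *
 *	avail_res_t *ares = can_job_run_on_node(job_ptr, core_map, node_i,
 *						s_p_n, node_usage, cr_type,
 *						test_only, will_run,
 *						part_core_map);
 *	if (ares) {
 *		// Use ares->avail_cpus, ares->avail_gpus, etc. here
 *		common_free_avail_res(ares);
 *	}
 */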
extern avail_res_t *can_job_run_on_node(job_record_t *job_ptr,
bitstr_t **core_map,
const uint32_t node_i,
uint32_t s_p_n,
node_use_record_t *node_usage,
uint16_t cr_type,
bool test_only, bool will_run,
bitstr_t **part_core_map)
{
uint16_t cpus = 0;
uint64_t avail_mem = NO_VAL64, req_mem;
int cpu_alloc_size, i, rc;
node_record_t *node_ptr = node_record_table_ptr + node_i;
List node_gres_list;
bitstr_t *part_core_map_ptr = NULL, *req_sock_map = NULL;
avail_res_t *avail_res = NULL;
List sock_gres_list = NULL;
bool enforce_binding = false;
uint16_t min_cpus_per_node, ntasks_per_node = 1;
if (((job_ptr->bit_flags & BACKFILL_TEST) == 0) &&
!test_only && !will_run && IS_NODE_COMPLETING(node_ptr)) {
/*
* Do not allocate more jobs to nodes with completing jobs;
* the backfill scheduler independently handles completing nodes
*/
return NULL;
}
if (part_core_map)
part_core_map_ptr = part_core_map[node_i];
if (node_usage[node_i].gres_list)
node_gres_list = node_usage[node_i].gres_list;
else
node_gres_list = node_ptr->gres_list;
if (job_ptr->gres_list) {
/* Identify available GRES and adjacent cores */
if (job_ptr->bit_flags & GRES_ENFORCE_BIND)
enforce_binding = true;
if (!core_map[node_i]) {
core_map[node_i] = bit_alloc(
select_node_record[node_i].tot_cores);
bit_set_all(core_map[node_i]);
}
sock_gres_list = gres_plugin_job_test2(
job_ptr->gres_list, node_gres_list,
test_only, core_map[node_i],
select_node_record[node_i].tot_sockets,
select_node_record[node_i].cores,
job_ptr->job_id, node_ptr->name,
enforce_binding, s_p_n, &req_sock_map,
job_ptr->user_id, node_i);
if (!sock_gres_list) { /* GRES requirement fail */
#if _DEBUG
info("Test fail on node %d: gres_plugin_job_test2",
node_i);
#endif
return NULL;
}
}
/* Identify available CPUs */
if (cr_type & CR_CORE) {
/* cpu_alloc_size = # of CPUs per core */
cpu_alloc_size = select_node_record[node_i].vpus;
avail_res = common_allocate_cores(job_ptr, core_map[node_i],
part_core_map_ptr, node_i,
&cpu_alloc_size, false,
req_sock_map);
} else if (cr_type & CR_SOCKET) {
/* cpu_alloc_size = # of CPUs per socket */
cpu_alloc_size = select_node_record[node_i].cores *
select_node_record[node_i].vpus;
avail_res = common_allocate_sockets(job_ptr, core_map[node_i],
part_core_map_ptr, node_i,
&cpu_alloc_size,
req_sock_map);
} else {
/* cpu_alloc_size = 1 individual CPU */
cpu_alloc_size = 1;
avail_res = common_allocate_cores(job_ptr, core_map[node_i],
part_core_map_ptr, node_i,
&cpu_alloc_size, true,
req_sock_map);
}
FREE_NULL_BITMAP(req_sock_map);
if (!avail_res || (avail_res->max_cpus == 0)) {
common_free_avail_res(avail_res);
#if _DEBUG
info("Test fail on node %d: _allocate_cores/sockets",
node_i);
#endif
FREE_NULL_LIST(sock_gres_list);
return NULL;
}
/* Check that sufficient CPUs remain to run a task on this node */
if (job_ptr->details->ntasks_per_node) {
ntasks_per_node = job_ptr->details->ntasks_per_node;
} else if (job_ptr->details->overcommit) {
ntasks_per_node = 1;
} else if ((job_ptr->details->max_nodes == 1) &&
(job_ptr->details->num_tasks != 0)) {
ntasks_per_node = job_ptr->details->num_tasks;
} else if (job_ptr->details->max_nodes) {
ntasks_per_node = (job_ptr->details->num_tasks +
job_ptr->details->max_nodes - 1) /
job_ptr->details->max_nodes;
}
min_cpus_per_node = ntasks_per_node * job_ptr->details->cpus_per_task;
if (avail_res->max_cpus < min_cpus_per_node) {
#if _DEBUG
info("Test fail on node %d: max_cpus < min_cpus_per_node (%u < %u)",
node_i, avail_res->max_cpus, min_cpus_per_node);
#endif
FREE_NULL_LIST(sock_gres_list);
common_free_avail_res(avail_res);
return NULL;
}
if (cr_type & CR_MEMORY) {
avail_mem = select_node_record[node_i].real_memory -
select_node_record[node_i].mem_spec_limit;
if (!test_only)
avail_mem -= node_usage[node_i].alloc_memory;
}
if (sock_gres_list) {
uint16_t near_gpu_cnt = 0;
avail_res->sock_gres_list = sock_gres_list;
/* Disable GRES that can't be used with remaining cores */
rc = gres_plugin_job_core_filter2(
sock_gres_list, avail_mem,
avail_res->max_cpus,
enforce_binding, core_map[node_i],
select_node_record[node_i].tot_sockets,
select_node_record[node_i].cores,
select_node_record[node_i].vpus,
s_p_n,
job_ptr->details->ntasks_per_node,
(job_ptr->details->whole_node == 1),
&avail_res->avail_gpus, &near_gpu_cnt);
if (rc != 0) {
#if _DEBUG
info("Test fail on node %d: gres_plugin_job_core_filter2",
node_i);
#endif
common_free_avail_res(avail_res);
return NULL;
}
/* Favor nodes with more co-located GPUs */
node_ptr->sched_weight =
(node_ptr->sched_weight & 0xffffffffffffff00) |
(0xff - near_gpu_cnt);
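/*
 * The low byte of sched_weight is replaced with (0xff - near_gpu_cnt),
 * so nodes with more co-located GPUs get a smaller weight and are
 * preferred when scheduling by weight (e.g. 4 nearby GPUs -> 0xfb).
 */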
}
cpus = avail_res->max_cpus;
if (cr_type & CR_MEMORY) {
/*
* Memory Check: check pn_min_memory to see if:
* - this node has enough memory (MEM_PER_CPU == 0)
* - there are enough free_cores (MEM_PER_CPU == 1)
*/
req_mem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU;
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
/* memory is per-CPU */
if (!(job_ptr->bit_flags & BF_WHOLE_NODE_TEST) &&
((req_mem * cpus) > avail_mem) &&
(job_ptr->details->whole_node == 1)) {
cpus = 0;
} else if (!(cr_type & CR_CPU) &&
job_ptr->details->mc_ptr &&
(job_ptr->details->mc_ptr->
ntasks_per_core == 1) &&
job_ptr->details->cpus_per_task == 1) {
/*
* In this scenario, CPUs represents cores and
* the CPU/core count will be inflated later on
* to include all of the threads on a core. So
* we need to compare apples to apples and only
* remove 1 CPU/core at a time.
*/
while ((cpus > 0) &&
((req_mem *
((int) cpus *
(int) select_node_record[node_i].vpus))
> avail_mem))
cpus -= 1;
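/*
 * Hypothetical example: with req_mem = 2048 MB per CPU, vpus = 2 and
 * avail_mem = 12288 MB, cpus counts cores here, so each decrement
 * releases 2 * 2048 MB; the loop settles at cpus = 3 since
 * 3 * 2 * 2048 = 12288 <= 12288.
 */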
} else {
while ((req_mem * cpus) > avail_mem) {
if (cpus >= cpu_alloc_size) {
cpus -= cpu_alloc_size;
} else {
cpus = 0;
break;
}
}
}
if (job_ptr->details->cpus_per_task > 1) {
i = cpus % job_ptr->details->cpus_per_task;
cpus -= i;
}
if (cpus < job_ptr->details->ntasks_per_node)
cpus = 0;
/* FIXME: Need to recheck min_cores, etc. here */
} else {
/* memory is per node */
if (req_mem > avail_mem)
cpus = 0;
}
}
if (cpus == 0) {
#if _DEBUG
info("Test fail on node %d: cpus == 0", node_i);
#endif
bit_clear_all(core_map[node_i]);
}
if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
info("%s: %s: %u CPUs on %s(state:%d), mem %"PRIu64"/%"PRIu64,
plugin_type, __func__, cpus,
select_node_record[node_i].node_ptr->name,
node_usage[node_i].node_state,
node_usage[node_i].alloc_memory,
select_node_record[node_i].real_memory);
}
avail_res->avail_cpus = cpus;
avail_res->avail_res_cnt = cpus + avail_res->avail_gpus;
_avail_res_log(avail_res, node_ptr->name);
return avail_res;
}