/*****************************************************************************\
* select_cons_tres.c - Resource selection plugin supporting Trackable
* RESources (TRES) policies.
*****************************************************************************
* Copyright (C) 2018 SchedMD LLC
* Derived in large part from select/cons_res plugin
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#define _GNU_SOURCE
#include
#include
#include "src/common/slurm_xlator.h"
#include "src/common/assoc_mgr.h"
#include "src/common/slurm_selecttype_info.h"
#include "src/common/xstring.h"
#include "select_cons_tres.h"
#include "job_test.h"
#include "dist_tasks.h"
#define _DEBUG 0 /* Enables module specific debugging */
/* Same value as the nodeinfo_magic constant defined later in this file */
#define NODEINFO_MAGIC 0x8a5d
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore it.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. Slurm uses the higher-level plugin
* interface which requires this string to be of the form
*
* <application>/<method>
*
* where <application> is a description of the intended application of
* the plugin (e.g., "select" for Slurm node selection) and <method>
* is a description of how this plugin satisfies that application. Slurm will
* only load select plugins if the plugin_type string has a
* prefix of "select/".
*
* plugin_version - an unsigned 32-bit integer containing the Slurm version
* (major.minor.micro combined into a single number).
*/
/* Plugin identification symbols described in the comment above */
const char plugin_name[] = "Trackable RESources (TRES) Selection plugin";
const char *plugin_type = "select/cons_tres";
const uint32_t plugin_id = SELECT_PLUGIN_CONS_TRES;
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
const uint32_t pstate_version = 7; /* version control on saved state */
const uint16_t nodeinfo_magic = 0x8a5d; /* same value as NODEINFO_MAGIC */
/* Global variables */
/*
 * Core bitmap array of specialized cores. _spec_core_filter() removes these
 * cores from a set of available cores and fini() releases the array.
 * NULL means there are no specialized cores.
 */
bitstr_t **spec_core_res = NULL;
/*
 * Remove every specialized core (spec_core_res) from avail_cores.
 * node_bitmap IN - not referenced by this implementation
 * avail_cores IN/OUT - core array to be filtered in place
 */
static void _spec_core_filter(bitstr_t *node_bitmap, bitstr_t **avail_cores)
{
	if (spec_core_res) {
		/* Only filter when specialized cores have been recorded */
		xassert(avail_cores);
		core_array_and_not(avail_cores, spec_core_res);
	}
}
/*
 * Select resources for advanced reservation, using the lowest numbered
 * cores of each usable node.
 * avail_node_bitmap IN - Available nodes
 * node_cnt IN - required node count (not referenced by this function)
 * core_cnt IN - required core count for each selected node, in node order.
 *		 The scan stops once a zero entry is reached.
 * exc_cores IN/OUT - Cores to AVOID using on input, selected cores on output.
 *		      The input array is only released and replaced on success.
 * RET selected nodes, or NULL if core_cnt is NULL or starts with zero, or if
 *     the request can not be satisfied
 */
static bitstr_t *_pick_first_cores(bitstr_t *avail_node_bitmap,
				   uint32_t node_cnt, uint32_t *core_cnt,
				   bitstr_t ***exc_cores)
{
#if _DEBUG
	char tmp[128];
	bitstr_t **tmp_cores;
#endif
	bitstr_t **avail_cores;
	bitstr_t *picked_node_bitmap = NULL;
	bitstr_t *tmp_core_bitmap;
	int c, c_cnt, i;
	int local_node_offset = 0;
	bool fini = false;

	if (!core_cnt || (core_cnt[0] == 0))
		return picked_node_bitmap;

#if _DEBUG
	bit_fmt(tmp, sizeof(tmp), avail_node_bitmap);
	info("%s: avail_nodes:%s", __func__, tmp);
	if (*exc_cores == NULL) {
		info("%s: exc_cores: NULL", __func__);
	} else {
		tmp_cores = *exc_cores;
		for (i = 0; i < select_node_cnt; i++) {
			if (!tmp_cores[i])
				continue;
			bit_fmt(tmp, sizeof(tmp), tmp_cores[i]);
			info("%s: exc_cores[%d]: %s", __func__, i, tmp);
		}
	}
#endif

	/*
	 * Start with every core of every node marked as available so that all
	 * nodes in avail_node_bitmap are represented, then remove the cores
	 * the caller asked us to avoid (none are excluded by default).
	 */
	c = select_node_record[select_node_cnt-1].cume_cores;
	tmp_core_bitmap = bit_alloc(c);
	bit_not(tmp_core_bitmap);
	avail_cores = core_bitmap_to_array(tmp_core_bitmap);
	FREE_NULL_BITMAP(tmp_core_bitmap);
	if (*exc_cores)
		core_array_and_not(avail_cores, *exc_cores);
	xassert(avail_cores);

	picked_node_bitmap = bit_alloc(select_node_cnt);
	/*
	 * Iterate over select_node_cnt entries: that is the size used for
	 * picked_node_bitmap and for the other per-node loops in this file.
	 */
	for (i = 0; i < select_node_cnt; i++) {
		/*
		 * Drop the node if we are done, it is not available, or its
		 * first core_cnt[] cores are not all free.
		 */
		if (fini ||
		    !avail_cores[i] ||
		    !bit_test(avail_node_bitmap, i) ||
		    (bit_set_count_range(avail_cores[i], 0,
					 core_cnt[local_node_offset]) <
		     core_cnt[local_node_offset])) {
			FREE_NULL_BITMAP(avail_cores[i]);
			continue;
		}
		bit_set(picked_node_bitmap, i);
		/* Keep only the first core_cnt[] available cores */
		c_cnt = 0;
		for (c = 0; c < select_node_record[i].tot_cores; c++) {
			if (!bit_test(avail_cores[i], c))
				continue;
			if (++c_cnt > core_cnt[local_node_offset])
				bit_clear(avail_cores[i], c);
		}
		/* A zero entry marks the end of the per-node requests */
		if (core_cnt[++local_node_offset] == 0)
			fini = true;
	}

	if (!fini) {
		info("%s: %s: reservation request can not be satisfied",
		     plugin_type, __func__);
		FREE_NULL_BITMAP(picked_node_bitmap);
		/*
		 * avail_cores is always a locally built array, so release it
		 * whether or not the caller supplied exc_cores.
		 */
		free_core_array(&avail_cores);
	} else {
		/* Hand the selected cores back through exc_cores */
		free_core_array(exc_cores);
		*exc_cores = avail_cores;
#if _DEBUG
		for (i = 0; i < select_node_cnt; i++) {
			if (!avail_cores[i])
				continue;
			bit_fmt(tmp, sizeof(tmp), avail_cores[i]);
			error("%s: selected cores[%d] %s", __func__, i, tmp);
		}
#endif
	}

	return picked_node_bitmap;
}
/*
 * Select resources for advanced reservation, scanning nodes in index order.
 * avail_node_bitmap IN/OUT - Available nodes. When whole nodes are reserved
 *			      (core_cnt == NULL) every node picked is cleared
 *			      from this bitmap, even if the request
 *			      ultimately fails.
 * node_cnt IN - required node count
 * core_cnt IN - required core count (zero terminated array), or NULL to
 *		 reserve whole nodes
 * exc_cores IN/OUT - Cores to AVOID using on input, selected cores on output.
 *		      Only used when core_cnt is set, and only released and
 *		      replaced on success.
 * RET selected node bitmap, or NULL if the request can not be satisfied
 */
static bitstr_t *_sequential_pick(bitstr_t *avail_node_bitmap,
				  uint32_t node_cnt, uint32_t *core_cnt,
				  bitstr_t ***exc_cores)
{
#if _DEBUG
	char tmp[128];
	bitstr_t **tmp_cores;
#endif
	bitstr_t **avail_cores = NULL;
	bitstr_t *picked_node_bitmap;
	char str[300];
	int cores_per_node = 0, extra_cores_needed = -1;
	int total_core_cnt = 0, local_node_offset = 0, num_nodes;
	bitstr_t *tmp_core_bitmap;
	int c, c_cnt, c_target, i;
	bool fini = false, single_core_cnt = false;

	/*
	 * We have these cases here:
	 *	1) node_cnt != 0 && core_cnt != NULL
	 *	2) node_cnt == 0 && core_cnt != NULL
	 *	3) node_cnt != 0 && core_cnt == NULL
	 *	4) node_cnt == 0 && core_cnt == NULL
	 */
	if (core_cnt) {
		num_nodes = bit_set_count(avail_node_bitmap);
		for (i = 0; (i < num_nodes) && core_cnt[i]; i++)
			total_core_cnt += core_cnt[i];
		if ((node_cnt > 1) && (i == 1)) {
			/* single core_cnt element applied across all nodes */
			cores_per_node = MAX((total_core_cnt / node_cnt), 1);
			extra_cores_needed = total_core_cnt -
					     (cores_per_node * node_cnt);
		} else if ((node_cnt == 0) && (i == 1)) {
			/*
			 * single core_cnt element applied across arbitrary
			 * node count
			 */
			single_core_cnt = true;
		}
	}
#if _DEBUG
	if (cores_per_node) {
		info("%s: %s: Reservations requires %d cores (%u each on %u nodes, plus %d)",
		     plugin_type, __func__, total_core_cnt, cores_per_node,
		     node_cnt, extra_cores_needed);
	} else if (single_core_cnt) {
		info("%s: %s: Reservations requires %d cores total",
		     plugin_type, __func__, total_core_cnt);
	} else if (core_cnt && core_cnt[0]) {
		info("%s: %s: Reservations requires %d cores with %d cores on first node",
		     plugin_type, __func__, total_core_cnt, core_cnt[0]);
	} else {
		info("%s: %s: Reservations requires %u nodes total",
		     plugin_type, __func__, node_cnt);
	}
#endif

	picked_node_bitmap = bit_alloc(select_node_cnt);
	if (core_cnt) { /* Reservation is using partial nodes */
		debug2("%s: %s: Reservation is using partial nodes",
		       plugin_type, __func__);
#if _DEBUG
		bit_fmt(tmp, sizeof(tmp), avail_node_bitmap);
		info("%s: avail_nodes:%s", __func__, tmp);
		if (*exc_cores == NULL) {
			info("%s: exc_cores: NULL", __func__);
		} else {
			tmp_cores = *exc_cores;
			for (i = 0; i < select_node_cnt; i++) {
				if (!tmp_cores[i])
					continue;
				bit_fmt(tmp, sizeof(tmp), tmp_cores[i]);
				info("%s: exc_cores[%d]: %s", __func__, i, tmp);
			}
		}
#endif
		/*
		 * Start with every core of every node marked as available so
		 * that all nodes in avail_node_bitmap are represented, then
		 * remove the cores the caller asked us to avoid (none are
		 * excluded by default).
		 */
		c = select_node_record[select_node_cnt-1].cume_cores;
		tmp_core_bitmap = bit_alloc(c);
		bit_not(tmp_core_bitmap);
		avail_cores = core_bitmap_to_array(tmp_core_bitmap);
		FREE_NULL_BITMAP(tmp_core_bitmap);
		if (*exc_cores)
			core_array_and_not(avail_cores, *exc_cores);
		xassert(avail_cores);

		for (i = 0; i < select_node_cnt; i++) {
			if (fini || !avail_cores[i] ||
			    !bit_test(avail_node_bitmap, i)) {
				FREE_NULL_BITMAP(avail_cores[i]);
				continue;
			}
			c = bit_set_count(avail_cores[i]);
			if (cores_per_node) {
				if (c < cores_per_node) {
					/*
					 * Node is skipped: drop its cores so
					 * they are not reported as selected.
					 */
					FREE_NULL_BITMAP(avail_cores[i]);
					continue;
				}
				if ((c > cores_per_node) &&
				    (extra_cores_needed > 0)) {
					c_cnt = cores_per_node +
						extra_cores_needed;
					if (c_cnt > c)
						c_target = c;
					else
						c_target = c_cnt;
					/*
					 * Only the cores taken beyond the
					 * per-node share reduce the number of
					 * extra cores still needed.
					 */
					extra_cores_needed -=
						(c_target - cores_per_node);
				} else {
					c_target = cores_per_node;
				}
			} else if (single_core_cnt) {
				/* Take as many cores as still required */
				if (c > total_core_cnt)
					c_target = total_core_cnt;
				else
					c_target = c;
				total_core_cnt -= c_target;
			} else { /* !single_core_cnt */
				if (c < core_cnt[local_node_offset]) {
					/*
					 * Node is skipped: drop its cores so
					 * they are not reported as selected.
					 */
					FREE_NULL_BITMAP(avail_cores[i]);
					continue;
				}
				c_target = core_cnt[local_node_offset];
			}
			/* Keep only the first c_target available cores */
			c_cnt = 0;
			for (c = 0; c < select_node_record[i].tot_cores; c++) {
				if (!bit_test(avail_cores[i], c))
					continue;
				if (c_cnt >= c_target)
					bit_clear(avail_cores[i], c);
				else
					c_cnt++;
			}
			if (c_cnt) {
				bit_set(picked_node_bitmap, i);
				node_cnt--;
			}
			if (cores_per_node) { /* Test node count */
				if (node_cnt <= 0)
					fini = true;
			} else if (single_core_cnt) { /* Test core count */
				if (total_core_cnt <= 0)
					fini = true;
			} else { /* Test core_cnt array */
				if (core_cnt[++local_node_offset] == 0)
					fini = true;
			}
		}

		if (!fini) {
			info("%s: %s: reservation request can not be satisfied",
			     plugin_type, __func__);
			FREE_NULL_BITMAP(picked_node_bitmap);
			/* avail_cores is always a locally built array */
			free_core_array(&avail_cores);
		} else {
			/* Hand the selected cores back through exc_cores */
			free_core_array(exc_cores);
			*exc_cores = avail_cores;
		}
	} else { /* Reservation is using full nodes */
		while (node_cnt) {
			int inx;

			inx = bit_ffs(avail_node_bitmap);
			if (inx < 0)
				break;

			/* Add this node to the final node bitmap */
			bit_set(picked_node_bitmap, inx);
			node_cnt--;

			/* Clear this node from the initial available bitmap */
			bit_clear(avail_node_bitmap, inx);
		}

		if (node_cnt) {
			info("%s: %s: Reservation request can not be satisfied",
			     plugin_type, __func__);
			FREE_NULL_BITMAP(picked_node_bitmap);
		} else {
			bit_fmt(str, sizeof(str), picked_node_bitmap);
			debug2("%s: %s: Sequential pick using nodemap: %s",
			       plugin_type, __func__, str);
		}
	}

	return picked_node_bitmap;
}
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called. It runs the common initialization and then registers this
 * plugin's implementations in cons_common_callbacks.
 * RET SLURM_SUCCESS
 */
extern int init(void)
{
	common_init();

	/* Helpers defined in this file */
	cons_common_callbacks.spec_core_filter = _spec_core_filter;
	cons_common_callbacks.sequential_pick = _sequential_pick;
	cons_common_callbacks.pick_first_cores = _pick_first_cores;

	/* Helpers defined outside this file */
	cons_common_callbacks.dist_tasks_compute_c_b = dist_tasks_compute_c_b;
	cons_common_callbacks.choose_nodes = choose_nodes;
	cons_common_callbacks.can_job_run_on_node = can_job_run_on_node;

	return SLURM_SUCCESS;
}
/*
 * fini() runs the common cleanup and then releases the specialized core
 * array (spec_core_res).
 * RET SLURM_SUCCESS
 */
extern int fini(void)
{
	int rc = SLURM_SUCCESS;

	common_fini();
	free_core_array(&spec_core_res);

	return rc;
}
/* select_p_state_save() in cons_common */
/* select_p_state_restore() in cons_common */
/* select_p_job_init() in cons_common */
/* select_p_node_ranking() in cons_common */
/* select_p_node_init() in cons_common */
/*
 * select_p_job_test - Given a specification of scheduling requirements,
 *	identify the nodes which "best" satisfy the request.
 *	"best" is defined as either a minimal number of consecutive nodes
 *	or if sharing resources then sharing them with a job of similar size.
 *
 * The system-wide exc_core_bitmap is converted to a core array, the request
 * is passed to common_job_test(), and the temporary array is released.
 *
 * IN/OUT job_ptr - pointer to job being considered for initiation,
 *		    sets start_time when job expected to start
 * IN/OUT node_bitmap - usable nodes are set on input, nodes not required to
 *			satisfy the request are cleared, other left set
 * IN min_nodes - minimum count of nodes
 * IN max_nodes - maximum count of nodes (0==don't care)
 * IN req_nodes - requested (or desired) count of nodes
 * IN mode - SELECT_MODE_RUN_NOW   (0): try to schedule job now
 *	     SELECT_MODE_TEST_ONLY (1): test if job can ever run
 *	     SELECT_MODE_WILL_RUN  (2): determine when and where job can run
 * IN preemptee_candidates - List of pointers to jobs which can be preempted.
 * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the
 *		jobs to be preempted to initiate the pending job. Not set
 *		if mode==SELECT_MODE_TEST_ONLY or input pointer is NULL.
 * IN exc_core_bitmap - Cores to be excluded for use (in advanced reservation)
 * RET zero on success, EINVAL otherwise
 */
extern int select_p_job_test(job_record_t *job_ptr, bitstr_t *node_bitmap,
			     uint32_t min_nodes, uint32_t max_nodes,
			     uint32_t req_nodes, uint16_t mode,
			     List preemptee_candidates,
			     List *preemptee_job_list,
			     bitstr_t *exc_core_bitmap)
{
	bitstr_t **excluded;
	int result;

	xassert(node_bitmap);

	debug2("%s: %s: evaluating %pJ", plugin_type, __func__, job_ptr);

	/* A job without details can not be evaluated */
	if (job_ptr->details == NULL)
		return EINVAL;

	/*
	 * FIXME: exc_core_bitmap is a full-system core bitmap to be replaced
	 * with a set of per-node bitmaps in a future release of Slurm
	 */
	excluded = core_bitmap_to_array(exc_core_bitmap);
#if _DEBUG
	if (excluded != NULL) {
		char tmp[128];
		int i;

		for (i = 0; i < select_node_cnt; i++) {
			if (excluded[i] == NULL)
				continue;
			bit_fmt(tmp, sizeof(tmp), excluded[i]);
			error("%s: %s: IN exc_cores[%d] %s", plugin_type,
			      __func__, i, tmp);
		}
	}
#endif

	result = common_job_test(job_ptr, node_bitmap, min_nodes, max_nodes,
				 req_nodes, mode, preemptee_candidates,
				 preemptee_job_list, excluded);

	/* The converted array is owned here, release it before returning */
	free_core_array(&excluded);

	return result;
}
/* select_p_job_begin() in cons_common */
/* select_p_job_ready() in cons_common */
/* select_p_job_resized() in cons_common */
/* select_p_job_expand() in cons_common */
/* select_p_job_signal() in cons_common */
/* select_p_job_mem_confirm() in cons_common */
/* select_p_job_fini() in cons_common */
/* select_p_job_suspend() in cons_common */
/* select_p_job_resume() in cons_common */
/* select_p_step_pick_nodes() in cons_common */
/* select_p_step_start() in cons_common */
/* select_p_step_finish() in cons_common */
/* select_p_select_nodeinfo_pack() in cons_common */
/* select_p_select_nodeinfo_unpack() in cons_common */
/* select_p_select_nodeinfo_alloc() in cons_common */
/* select_p_select_nodeinfo_free() in cons_common */
/* select_p_select_nodeinfo_set_all() in cons_common */
/* select_p_select_nodeinfo_set() in cons_common */
/* select_p_select_nodeinfo_get() in cons_common */
/* select_p_job_begin() in cons_common */
/* select_p_job_ready() in cons_common */
/* select_p_job_resized() in cons_common */
/* select_p_job_expand() in cons_common */
/* select_p_job_signal() in cons_common */
/* select_p_job_mem_confirm() in cons_common */
/* select_p_job_fini() in cons_common */
/* select_p_job_suspend() in cons_common */
/* select_p_job_resume() in cons_common */
/* select_p_step_pick_nodes() in cons_common */
/* select_p_step_start() in cons_common */
/* select_p_step_finish() in cons_common */
/* select_p_select_nodeinfo_pack() in cons_common */
/* select_p_select_nodeinfo_unpack() in cons_common */
/* select_p_select_nodeinfo_alloc() in cons_common */
/* select_p_select_nodeinfo_free() in cons_common */
/* select_p_select_nodeinfo_set_all() in cons_common */
/* select_p_select_nodeinfo_set() in cons_common */
/* select_p_select_nodeinfo_get() in cons_common */
/* select_p_select_jobinfo_alloc() in cons_common */
/* select_p_select_jobinfo_free() in cons_common */
/* select_p_select_jobinfo_set() in cons_common */
/* select_p_select_jobinfo_get() in cons_common */
/* select_p_select_jobinfo_copy() in cons_common */
/* select_p_select_jobinfo_pack() in cons_common */
/* select_p_select_jobinfo_unpack() in cons_common */
/* select_p_select_jobinfo_sprint() in cons_common */
/* select_p_select_jobinfo_xstrdup() in cons_common */
/* select_p_get_info_from_plugin() in cons_common */
/* select_p_update_node_config() in cons_common */
/* select_p_reconfigure() in cons_common */
/* select_p_resv_test() in cons_common */