/*****************************************************************************\ * job_test.c - functions to test job on resources ***************************************************************************** * Copyright (C) 2019 SchedMD LLC * Derived in large part from select/cons_[res|tres] plugins * * This file is part of Slurm, a resource management program. * For details, see . * Please also read the included file: DISCLAIMER. * * Slurm is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * In addition, as a special exception, the copyright holders give permission * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than * OpenSSL. If you modify file(s) with this exception, you may extend this * exception to your version of the file(s), but you are not obligated to do * so. If you do not wish to do so, delete this exception statement from your * version. If you delete this exception statement from all source files in * the program, then also delete it here. * * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along * with Slurm; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ #include "cons_common.h" #include "dist_tasks.h" #include "src/common/node_select.h" #include "src/common/xstring.h" #include "src/slurmctld/preempt.h" typedef struct { int action; bool job_fini; bitstr_t *node_map; node_use_record_t *node_usage; part_res_record_t *part_record_ptr; int rc; } wrapper_rm_job_args_t; uint64_t def_cpu_per_gpu = 0; uint64_t def_mem_per_gpu = 0; bool preempt_strict_order = false; int preempt_reorder_cnt = 1; /* When any cores on a node are removed from being available for a job, * then remove the entire node from being available. */ static void _block_whole_nodes(bitstr_t *node_bitmap, bitstr_t **orig_core_bitmap, bitstr_t **new_core_bitmap) { int first_node, last_node, i_node; int first_core, last_core, i_core; bitstr_t *cr_orig_core_bitmap = NULL; bitstr_t *cr_new_core_bitmap = NULL; first_node = bit_ffs(node_bitmap); if (first_node >= 0) last_node = bit_fls(node_bitmap); else last_node = -2; if (!is_cons_tres) { cr_orig_core_bitmap = *orig_core_bitmap; cr_new_core_bitmap = *new_core_bitmap; } for (i_node = first_node; i_node <= last_node; i_node++) { if (!bit_test(node_bitmap, i_node)) continue; if (is_cons_tres) { first_core = 0; last_core = select_node_record[i_node].tot_cores; cr_orig_core_bitmap = orig_core_bitmap[i_node]; cr_new_core_bitmap = new_core_bitmap[i_node]; } else { first_core = cr_get_coremap_offset(i_node); last_core = cr_get_coremap_offset(i_node + 1); } for (i_core = first_core; i_core < last_core; i_core++) { if (bit_test(cr_orig_core_bitmap, i_core) && !bit_test(cr_new_core_bitmap, i_core)) { bit_clear(node_bitmap, i_node); break; } } } } static uint16_t _valid_uint16(uint16_t arg) { if ((arg == NO_VAL16) || (arg == INFINITE16)) return 0; return arg; } static gres_mc_data_t *_build_gres_mc_data(job_record_t *job_ptr) { gres_mc_data_t *tres_mc_ptr; tres_mc_ptr = xmalloc(sizeof(gres_mc_data_t)); tres_mc_ptr->cpus_per_task = _valid_uint16(job_ptr->details->cpus_per_task); tres_mc_ptr->ntasks_per_job = job_ptr->details->num_tasks; tres_mc_ptr->ntasks_per_node = _valid_uint16(job_ptr->details->ntasks_per_node); tres_mc_ptr->overcommit = job_ptr->details->overcommit; tres_mc_ptr->task_dist = job_ptr->details->task_dist; tres_mc_ptr->whole_node = job_ptr->details->whole_node; if (job_ptr->details->mc_ptr) { multi_core_data_t *job_mc_ptr = job_ptr->details->mc_ptr; tres_mc_ptr->boards_per_node = _valid_uint16(job_mc_ptr->boards_per_node); tres_mc_ptr->sockets_per_board = _valid_uint16(job_mc_ptr->sockets_per_board); tres_mc_ptr->sockets_per_node = _valid_uint16(job_mc_ptr->sockets_per_node); tres_mc_ptr->cores_per_socket = _valid_uint16(job_mc_ptr->cores_per_socket); tres_mc_ptr->threads_per_core = _valid_uint16(job_mc_ptr->threads_per_core); tres_mc_ptr->ntasks_per_board = _valid_uint16(job_mc_ptr->ntasks_per_board); tres_mc_ptr->ntasks_per_socket = _valid_uint16(job_mc_ptr->ntasks_per_socket); tres_mc_ptr->ntasks_per_core = _valid_uint16(job_mc_ptr->ntasks_per_core); } if ((tres_mc_ptr->ntasks_per_core == 0) && (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE)) tres_mc_ptr->ntasks_per_core = 1; return tres_mc_ptr; } static struct multi_core_data *_create_default_mc(void) { struct multi_core_data *mc_ptr; mc_ptr = xmalloc(sizeof(struct multi_core_data)); mc_ptr->sockets_per_node = NO_VAL16; mc_ptr->cores_per_socket = NO_VAL16; mc_ptr->threads_per_core = NO_VAL16; /* Other fields initialized to zero by xmalloc */ return mc_ptr; } /* List sort function: sort by the job's expected end time */ static int _cr_job_list_sort(void *x, void *y) { job_record_t *job1_ptr = *(job_record_t **) x; job_record_t *job2_ptr = *(job_record_t **) y; return (int) SLURM_DIFFTIME(job1_ptr->end_time, job2_ptr->end_time); } static int _find_job (void *x, void *key) { job_record_t *job_ptr = (job_record_t *) x; if (job_ptr == (job_record_t *) key) return 1; return 0; } extern void _free_avail_res_array(avail_res_t **avail_res_array) { int n; if (!avail_res_array) return; for (n = 0; n < select_node_cnt; n++) common_free_avail_res(avail_res_array[n]); xfree(avail_res_array); } /* Determine the node requirements for the job: * - does the job need exclusive nodes? (NODE_CR_RESERVED) * - can the job run on shared nodes? (NODE_CR_ONE_ROW) * - can the job run on overcommitted resources? (NODE_CR_AVAILABLE) */ static uint16_t _get_job_node_req(job_record_t *job_ptr) { int max_share = job_ptr->part_ptr->max_share; if (max_share == 0) /* Partition Shared=EXCLUSIVE */ return NODE_CR_RESERVED; /* Partition is Shared=FORCE */ if (max_share & SHARED_FORCE) return NODE_CR_AVAILABLE; if ((max_share > 1) && (job_ptr->details->share_res == 1)) /* part allows sharing, and the user has requested it */ return NODE_CR_AVAILABLE; return NODE_CR_ONE_ROW; } static void _set_gpu_defaults(job_record_t *job_ptr) { static part_record_t *last_part_ptr = NULL; static uint64_t last_cpu_per_gpu = NO_VAL64; static uint64_t last_mem_per_gpu = NO_VAL64; uint64_t cpu_per_gpu, mem_per_gpu; if (!is_cons_tres || !job_ptr->gres_list) return; if (job_ptr->part_ptr != last_part_ptr) { /* Cache data from last partition referenced */ last_part_ptr = job_ptr->part_ptr; last_cpu_per_gpu = common_get_def_cpu_per_gpu( last_part_ptr->job_defaults_list); last_mem_per_gpu = common_get_def_mem_per_gpu( last_part_ptr->job_defaults_list); } if (last_cpu_per_gpu != NO_VAL64) cpu_per_gpu = last_cpu_per_gpu; else if (def_cpu_per_gpu != NO_VAL64) cpu_per_gpu = def_cpu_per_gpu; else cpu_per_gpu = 0; if (last_mem_per_gpu != NO_VAL64) mem_per_gpu = last_mem_per_gpu; else if (def_mem_per_gpu != NO_VAL64) mem_per_gpu = def_mem_per_gpu; else mem_per_gpu = 0; gres_plugin_job_set_defs(job_ptr->gres_list, "gpu", cpu_per_gpu, mem_per_gpu); } /* Determine how many sockets per node this job requires for GRES */ static uint32_t _socks_per_node(job_record_t *job_ptr) { multi_core_data_t *mc_ptr; uint32_t s_p_n = NO_VAL; uint32_t cpu_cnt, cpus_per_node, tasks_per_node; uint32_t min_nodes; if (!job_ptr->details) return s_p_n; /* * FIXME: This was removed in cons_tres commit e82b9f17a23adf0, I am * wondering if it is actually needed in cons_res. */ if (!is_cons_tres && ((job_ptr->gres_list == NULL) || ((job_ptr->bit_flags & GRES_ENFORCE_BIND) == 0))) return s_p_n; cpu_cnt = job_ptr->details->num_tasks * job_ptr->details->cpus_per_task; cpu_cnt = MAX(job_ptr->details->min_cpus, cpu_cnt); min_nodes = MAX(job_ptr->details->min_nodes, 1); cpus_per_node = cpu_cnt / min_nodes; if (cpus_per_node <= 1) return (uint32_t) 1; mc_ptr = job_ptr->details->mc_ptr; if ((mc_ptr->ntasks_per_socket != NO_VAL16) && (mc_ptr->ntasks_per_socket != INFINITE16)) { tasks_per_node = job_ptr->details->num_tasks / min_nodes; s_p_n = (tasks_per_node + mc_ptr->ntasks_per_socket - 1) / mc_ptr->ntasks_per_socket; return s_p_n; } /* * This logic could be expanded to support additional cases, which may * require information per node information (e.g. threads per core). */ return s_p_n; } /* * Determine resource availability for pending job * * IN: job_ptr - pointer to the job requesting resources * IN: node_map - bitmap of available nodes * IN/OUT: core_map - per-node bitmaps of available cores * IN: cr_type - resource type * IN: test_only - Determine if job could ever run, ignore allocated memory * check * IN: will_run - Determining when a pending job can start * IN: part_core_map - per-node bitmap of cores allocated to jobs of this * partition or NULL if don't care * * RET array of avail_res_t pointers, free using _free_avail_res_array() */ static avail_res_t **_get_res_avail(job_record_t *job_ptr, bitstr_t *node_map, bitstr_t **core_map, node_use_record_t *node_usage, uint16_t cr_type, bool test_only, bool will_run, bitstr_t **part_core_map) { int i, i_first, i_last; avail_res_t **avail_res_array = NULL; uint32_t s_p_n = _socks_per_node(job_ptr); xassert(*cons_common_callbacks.can_job_run_on_node); _set_gpu_defaults(job_ptr); avail_res_array = xcalloc(select_node_cnt, sizeof(avail_res_t *)); i_first = bit_ffs(node_map); if (i_first != -1) i_last = bit_fls(node_map); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (bit_test(node_map, i)) avail_res_array[i] = (*cons_common_callbacks.can_job_run_on_node)( job_ptr, core_map, i, s_p_n, node_usage, cr_type, test_only, will_run, part_core_map); /* * FIXME: This is a hack to make cons_res more bullet proof as * there are places that don't always behave correctly with a * sparce array. */ if (!is_cons_tres && !avail_res_array[i]) avail_res_array[i] = xmalloc(sizeof(avail_res_t)); } return avail_res_array; } /* For a given job already past it's end time, guess when it will actually end. * Used for backfill scheduling. */ static time_t _guess_job_end(job_record_t *job_ptr, time_t now) { time_t end_time; uint16_t over_time_limit; if (job_ptr->part_ptr && (job_ptr->part_ptr->over_time_limit != NO_VAL16)) { over_time_limit = job_ptr->part_ptr->over_time_limit; } else { over_time_limit = slurmctld_conf.over_time_limit; } if (over_time_limit == 0) { end_time = job_ptr->end_time + slurmctld_conf.kill_wait; } else if (over_time_limit == INFINITE16) { /* No idea when the job might end, this is just a guess */ if (job_ptr->time_limit && (job_ptr->time_limit != NO_VAL) && (job_ptr->time_limit != INFINITE)) { end_time = now + (job_ptr->time_limit * 60); } else { end_time = now + (365 * 24 * 60 * 60); /* one year */ } } else { end_time = job_ptr->end_time + slurmctld_conf.kill_wait + (over_time_limit * 60); } if (end_time <= now) end_time = now + 1; return end_time; } /* * Test to see if a node already has running jobs for _other_ partitions. * If (sharing_only) then only check sharing partitions. This is because * the job was submitted to a single-row partition which does not share * allocated CPUs with multi-row partitions. */ static int _is_node_busy(part_res_record_t *p_ptr, uint32_t node_i, int sharing_only, part_record_t *my_part_ptr, bool qos_preemptor) { uint32_t r, c, core_begin, core_end; uint16_t num_rows; bitstr_t *use_row_bitmap = NULL; for (; p_ptr; p_ptr = p_ptr->next) { num_rows = p_ptr->num_rows; if (preempt_by_qos && !qos_preemptor) num_rows--; /* Don't use extra row */ if (sharing_only && ((num_rows < 2) || (p_ptr->part_ptr == my_part_ptr))) continue; if (!p_ptr->row) continue; for (r = 0; r < num_rows; r++) { if (!p_ptr->row[r].row_bitmap) continue; if (is_cons_tres) { if (!p_ptr->row[r].row_bitmap[node_i]) continue; use_row_bitmap = p_ptr->row[r].row_bitmap[node_i]; core_begin = 0; core_end = bit_size( p_ptr->row[r].row_bitmap[node_i]); } else { if (!*p_ptr->row[r].row_bitmap) continue; use_row_bitmap = *p_ptr->row[r].row_bitmap; core_begin = cr_get_coremap_offset(node_i); core_end = cr_get_coremap_offset(node_i+1); } for (c = core_begin; c < core_end; c++) if (bit_test(use_row_bitmap, c)) return 1; } } return 0; } static bool _is_preemptable(job_record_t *job_ptr, List preemptee_candidates) { if (!preemptee_candidates) return false; if (list_find_first(preemptee_candidates, _find_job, job_ptr)) return true; return false; } /* * Select the best set of resources for the given job * IN: job_ptr - pointer to the job requesting resources * IN: min_nodes - minimum number of nodes required * IN: max_nodes - maximum number of nodes requested * IN: req_nodes - number of requested nodes * IN/OUT: node_bitmap - bitmap of available nodes / bitmap of selected nodes * IN/OUT: avail_core - available/selected cores * IN: cr_type - resource type * IN: test_only - Determine if job could ever run, ignore allocated memory * check * IN: will_run - Determining when a pending job can start * IN: part_core_map - per-node bitmap of cores allocated to jobs of this * partition or NULL if don't care * IN: prefer_alloc_nodes - select currently allocated nodes first * IN: tres_mc_ptr - job's multi-core options * RET: array of avail_res_t pointers, free using _free_avail_res_array(). * NULL on error */ static avail_res_t **_select_nodes(job_record_t *job_ptr, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, bitstr_t *node_bitmap, bitstr_t **avail_core, node_use_record_t *node_usage, uint16_t cr_type, bool test_only, bool will_run, bitstr_t **part_core_map, bool prefer_alloc_nodes, gres_mc_data_t *tres_mc_ptr) { int i, rc; uint32_t n; struct job_details *details_ptr = job_ptr->details; bitstr_t *req_map = details_ptr->req_node_bitmap; avail_res_t **avail_res_array; xassert(*cons_common_callbacks.choose_nodes); if (bit_set_count(node_bitmap) < min_nodes) { #if _DEBUG info("%s: AvailNodes < MinNodes (%u < %u)", __func__, bit_set_count(node_bitmap), min_nodes); #endif return NULL; } core_array_log("_select_nodes/enter", node_bitmap, avail_core); /* Determine resource availability on each node for pending job */ avail_res_array = _get_res_avail(job_ptr, node_bitmap, avail_core, node_usage, cr_type, test_only, will_run, part_core_map); if (!avail_res_array) return avail_res_array; /* Eliminate nodes that don't have sufficient resources for this job */ for (n = 0; n < select_node_cnt; n++) { if (bit_test(node_bitmap, n) && (!avail_res_array[n] || !avail_res_array[n]->avail_cpus)) { /* insufficient resources available on this node */ bit_clear(node_bitmap, n); } } if ((bit_set_count(node_bitmap) < min_nodes) || (req_map && !bit_super_set(req_map, node_bitmap))) { rc = SLURM_ERROR; goto fini; } core_array_log("_select_nodes/elim_nodes", node_bitmap, avail_core); /* Select the best nodes for this job */ if (details_ptr->ntasks_per_node && details_ptr->num_tasks) { i = details_ptr->num_tasks; i += (details_ptr->ntasks_per_node - 1); i /= details_ptr->ntasks_per_node; min_nodes = MAX(min_nodes, i); } rc = (*cons_common_callbacks.choose_nodes)( job_ptr, node_bitmap, avail_core, min_nodes, max_nodes, req_nodes, avail_res_array, cr_type, prefer_alloc_nodes, tres_mc_ptr); if (rc != SLURM_SUCCESS) goto fini; core_array_log("_select_nodes/choose_nodes", node_bitmap, avail_core); /* If successful, sync up the avail_core with the node_map */ if (rc == SLURM_SUCCESS) { if (is_cons_tres) { for (n = 0; n < select_node_cnt; n++) { if (!avail_res_array[n] || !bit_test(node_bitmap, n)) FREE_NULL_BITMAP(avail_core[n]); } } else { int i_first, i_last, n, start; i_first = bit_ffs(node_bitmap); if (i_first != -1) i_last = bit_fls(node_bitmap); else i_last = -2; start = 0; for (n = i_first; n < i_last; n++) { if (!avail_res_array[n] || !bit_test(node_bitmap, n)) continue; if (cr_get_coremap_offset(n) != start) bit_nclear( *avail_core, start, (cr_get_coremap_offset(n)) - 1); start = cr_get_coremap_offset(n + 1); } if ((n >= 0) && (cr_get_coremap_offset(n) != start)) bit_nclear(*avail_core, start, cr_get_coremap_offset(n) - 1); } } core_array_log("_select_nodes/sync_cores", node_bitmap, avail_core); fini: if (rc != SLURM_SUCCESS) { _free_avail_res_array(avail_res_array); return NULL; } return avail_res_array; } /* * Sort the usable_node element to put jobs in the correct * preemption order. */ static int _sort_usable_nodes_dec(void *j1, void *j2) { job_record_t *job_a = *(job_record_t **) j1; job_record_t *job_b = *(job_record_t **) j2; if (job_a->details->usable_nodes > job_b->details->usable_nodes) return -1; else if (job_a->details->usable_nodes < job_b->details->usable_nodes) return 1; return 0; } /* * Determine which of these nodes are usable by this job * * Remove nodes from node_bitmap that don't have enough memory or other * resources to support this job. * * Return SLURM_ERROR if a required node can't be used. * * if node_state = NODE_CR_RESERVED, clear node_bitmap (if node is required * then should we return NODE_BUSY!?!) * * if node_state = NODE_CR_ONE_ROW, then this node can only be used by * another NODE_CR_ONE_ROW job * * if node_state = NODE_CR_AVAILABLE AND: * - job_node_req = NODE_CR_RESERVED, then we need idle nodes * - job_node_req = NODE_CR_ONE_ROW, then we need idle or non-sharing nodes */ static int _verify_node_state(part_res_record_t *cr_part_ptr, job_record_t *job_ptr, bitstr_t *node_bitmap, uint16_t cr_type, node_use_record_t *node_usage, enum node_cr_state job_node_req, bitstr_t **exc_cores, bool qos_preemptor) { node_record_t *node_ptr; uint32_t gres_cpus, gres_cores; uint64_t free_mem, min_mem, avail_mem; List gres_list; int i, i_first, i_last; bool disable_binding = false; if (is_cons_tres && !(job_ptr->bit_flags & JOB_MEM_SET) && (min_mem = gres_plugin_job_mem_max(job_ptr->gres_list))) { /* * Clear default partition or system per-node memory limit. * Rely exclusively upon the per-GRES memory limit. */ job_ptr->details->pn_min_memory = 0; } else if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { uint16_t min_cpus; min_mem = job_ptr->details->pn_min_memory & (~MEM_PER_CPU); min_cpus = MAX(job_ptr->details->ntasks_per_node, job_ptr->details->pn_min_cpus); min_cpus = MAX(min_cpus, job_ptr->details->cpus_per_task); if (min_cpus > 0) min_mem *= min_cpus; } else { min_mem = job_ptr->details->pn_min_memory; } if (!is_cons_tres && (job_ptr->bit_flags & GRES_DISABLE_BIND)) disable_binding = true; i_first = bit_ffs(node_bitmap); if (i_first == -1) i_last = -2; else i_last = bit_fls(node_bitmap); for (i = i_first; i <= i_last; i++) { if (!bit_test(node_bitmap, i)) continue; node_ptr = select_node_record[i].node_ptr; /* node-level memory check */ if (min_mem && (cr_type & CR_MEMORY)) { avail_mem = select_node_record[i].real_memory - select_node_record[i].mem_spec_limit; if (avail_mem > node_usage[i].alloc_memory) { free_mem = avail_mem - node_usage[i].alloc_memory; } else free_mem = 0; if (free_mem < min_mem) { debug3("%s: %s: node %s no mem (%"PRIu64" < %"PRIu64")", plugin_type, __func__, node_ptr->name, free_mem, min_mem); goto clear_bit; } } else if (cr_type & CR_MEMORY) { /* --mem=0 for all memory */ if (node_usage[i].alloc_memory) { debug3("%s: %s: node %s mem in use %"PRIu64, plugin_type, __func__, node_ptr->name, node_usage[i].alloc_memory); goto clear_bit; } } /* Exclude nodes with reserved cores */ if ((job_ptr->details->whole_node == 1) && exc_cores) { if (is_cons_tres) { if (exc_cores[i] && (bit_ffs(exc_cores[i]) != -1)) { debug3("%s: %s: node %s exclusive", plugin_type, __func__, node_ptr->name); goto clear_bit; } } else if (*exc_cores) { for (int j = cr_get_coremap_offset(i); j < cr_get_coremap_offset(i+1); j++) { if (bit_test(*exc_cores, j)) continue; debug3("%s: %s: _vns: node %s exc", plugin_type, __func__, node_ptr->name); goto clear_bit; } } } /* node-level GRES check, assumes all cores usable */ if (node_usage[i].gres_list) gres_list = node_usage[i].gres_list; else gres_list = node_ptr->gres_list; gres_cores = gres_plugin_job_test(job_ptr->gres_list, gres_list, true, NULL, 0, 0, job_ptr->job_id, node_ptr->name, disable_binding); gres_cpus = gres_cores; if (gres_cpus != NO_VAL) gres_cpus *= select_node_record[i].vpus; if (gres_cpus == 0) { debug3("%s: %s: node %s lacks GRES", plugin_type, __func__, node_ptr->name); goto clear_bit; } /* exclusive node check */ if (node_usage[i].node_state >= NODE_CR_RESERVED) { debug3("%s: %s: node %s in exclusive use", plugin_type, __func__, node_ptr->name); goto clear_bit; /* non-resource-sharing node check */ } else if (node_usage[i].node_state >= NODE_CR_ONE_ROW) { if ((job_node_req == NODE_CR_RESERVED) || (job_node_req == NODE_CR_AVAILABLE)) { debug3("%s: %s: node %s non-sharing", plugin_type, __func__, node_ptr->name); goto clear_bit; } /* * cannot use this node if it is running jobs * in sharing partitions */ if (_is_node_busy(cr_part_ptr, i, 1, job_ptr->part_ptr, qos_preemptor)) { debug3("%s: %s: node %s sharing?", plugin_type, __func__, node_ptr->name); goto clear_bit; } /* node is NODE_CR_AVAILABLE - check job request */ } else { if (job_node_req == NODE_CR_RESERVED) { if (_is_node_busy(cr_part_ptr, i, 0, job_ptr->part_ptr, qos_preemptor)) { debug3("%s: %s: node %s busy", plugin_type, __func__, node_ptr->name); goto clear_bit; } } else if (job_node_req == NODE_CR_ONE_ROW) { /* * cannot use this node if it is running jobs * in sharing partitions */ if (_is_node_busy(cr_part_ptr, i, 1, job_ptr->part_ptr, qos_preemptor)) { debug3("%s: %s: node %s vbusy", plugin_type, __func__, node_ptr->name); goto clear_bit; } } } continue; /* node is usable, test next node */ clear_bit: /* This node is not usable by this job */ bit_clear(node_bitmap, i); if (job_ptr->details->req_node_bitmap && bit_test(job_ptr->details->req_node_bitmap, i)) return SLURM_ERROR; } return SLURM_SUCCESS; } /* * _job_test - does most of the real work for select_p_job_test(), which * includes contiguous selection, load-leveling and max_share logic * * PROCEDURE: * * Step 1: compare nodes in "avail" node_bitmap with current node state data * to find available nodes that match the job request * * Step 2: check resources in "avail" node_bitmap with allocated resources from * higher priority partitions (busy resources are UNavailable) * * Step 3: select resource usage on remaining resources in "avail" node_bitmap * for this job, with the placement influenced by existing * allocations */ static int _job_test(job_record_t *job_ptr, bitstr_t *node_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, int mode, uint16_t cr_type, enum node_cr_state job_node_req, part_res_record_t *cr_part_ptr, node_use_record_t *node_usage, bitstr_t **exc_cores, bool prefer_alloc_nodes, bool qos_preemptor, bool preempt_mode) { int error_code = SLURM_SUCCESS; bitstr_t *orig_node_map, **part_core_map = NULL; bitstr_t **free_cores_tmp = NULL, *node_bitmap_tmp = NULL; bitstr_t **free_cores_tmp2 = NULL, *node_bitmap_tmp2 = NULL; bitstr_t **avail_cores, **free_cores; bool test_only = false, will_run = false; uint32_t sockets_per_node = 1; uint32_t c, j, n, c_alloc = 0, c_size, total_cpus; uint64_t save_mem = 0, avail_mem = 0, needed_mem = 0, lowest_mem = 0; int32_t build_cnt; job_resources_t *job_res; struct job_details *details_ptr = job_ptr->details; part_res_record_t *p_ptr, *jp_ptr; uint16_t *cpu_count; int i, i_first, i_last; avail_res_t **avail_res_array, **avail_res_array_tmp; gres_mc_data_t *tres_mc_ptr = NULL; List *node_gres_list = NULL, *sock_gres_list = NULL; uint32_t *gres_task_limit = NULL; char *nodename = NULL; bitstr_t *exc_core_bitmap = NULL; free_job_resources(&job_ptr->job_resrcs); if (mode == SELECT_MODE_TEST_ONLY) test_only = true; else if (mode == SELECT_MODE_WILL_RUN) will_run = true; /* check node_state and update the node_bitmap as necessary */ if (!test_only) { error_code = _verify_node_state( cr_part_ptr, job_ptr, node_bitmap, cr_type, node_usage, job_node_req, exc_cores, qos_preemptor); if (error_code != SLURM_SUCCESS) { return error_code; } } /* * Ensure sufficient resources to satisfy thread/core/socket * specifications with -O/--overcommit option. */ if (details_ptr->overcommit && (details_ptr->min_cpus == details_ptr->min_nodes)) { struct multi_core_data *mc_ptr = details_ptr->mc_ptr; if ((mc_ptr->threads_per_core != NO_VAL16) && (mc_ptr->threads_per_core > 1)) details_ptr->min_cpus *= mc_ptr->threads_per_core; if ((mc_ptr->cores_per_socket != NO_VAL16) && (mc_ptr->cores_per_socket > 1)) details_ptr->min_cpus *= mc_ptr->cores_per_socket; if ((mc_ptr->sockets_per_node != NO_VAL16) && (mc_ptr->sockets_per_node > 1)) details_ptr->min_cpus *= mc_ptr->sockets_per_node; } if (is_cons_tres) { if (details_ptr->mc_ptr && details_ptr->mc_ptr->sockets_per_node) sockets_per_node = details_ptr->mc_ptr->sockets_per_node; details_ptr->min_gres_cpu = gres_plugin_job_min_cpu_node( sockets_per_node, details_ptr->ntasks_per_node, job_ptr->gres_list); } else if (exc_cores && *exc_cores) exc_core_bitmap = *exc_cores; if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: evaluating %pJ on %u nodes", plugin_type, __func__, job_ptr, bit_set_count(node_bitmap)); } orig_node_map = bit_copy(node_bitmap); avail_cores = common_mark_avail_cores( node_bitmap, job_ptr->details->core_spec); /* * test to make sure that this job can succeed with all avail_cores * if 'no' then return FAIL * if 'yes' then we will seek the optimal placement for this job * within avail_cores */ free_cores = copy_core_array(avail_cores); if (is_cons_tres) tres_mc_ptr = _build_gres_mc_data(job_ptr); avail_res_array = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap, free_cores, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); if (!avail_res_array) { /* job can not fit */ xfree(tres_mc_ptr); FREE_NULL_BITMAP(orig_node_map); free_core_array(&avail_cores); free_core_array(&free_cores); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 0 fail: insufficient resources", plugin_type, __func__); } return SLURM_ERROR; } else if (test_only) { xfree(tres_mc_ptr); FREE_NULL_BITMAP(orig_node_map); free_core_array(&avail_cores); free_core_array(&free_cores); _free_avail_res_array(avail_res_array); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 0 pass: test_only", plugin_type, __func__); } return SLURM_SUCCESS; } else if (!job_ptr->best_switch) { xfree(tres_mc_ptr); FREE_NULL_BITMAP(orig_node_map); free_core_array(&avail_cores); free_core_array(&free_cores); _free_avail_res_array(avail_res_array); if (select_debug_flags & DEBUG_FLAG_CPU_BIND) { info("%s: %s: test 0 fail: waiting for switches", plugin_type, __func__); } return SLURM_ERROR; } if (cr_type == CR_MEMORY) { /* * CR_MEMORY does not care about existing CPU allocations, * so we can jump right to job allocation from here */ goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 0 pass - job fits on given resources", plugin_type, __func__); } _free_avail_res_array(avail_res_array); /* * now that we know that this job can run with the given resources, * let's factor in the existing allocations and seek the optimal set * of resources for this job. Here is the procedure: * * Step 1: Seek idle CPUs across all partitions. If successful then * place job and exit. If not successful, then continue. Two * related items to note: * 1. Jobs that don't share CPUs finish with step 1. * 2. The remaining steps assume sharing or preemption. * * Step 2: Remove resources that are in use by higher-priority * partitions, and test that job can still succeed. If not * then exit. * * Step 3: Seek idle nodes among the partitions with the same * priority as the job's partition. If successful then * goto Step 6. If not then continue: * * Step 4: Seek placement within the job's partition. Search * row-by-row. If no placement if found, then exit. If a row * is found, then continue: * * Step 5: Place job and exit. FIXME! Here is where we need a * placement algorithm that recognizes existing job * boundaries and tries to "overlap jobs" as efficiently * as possible. * * Step 6: Place job and exit. FIXME! here is we use a placement * algorithm similar to Step 5 on jobs from lower-priority * partitions. */ /*** Step 1 ***/ bit_copybits(node_bitmap, orig_node_map); free_core_array(&free_cores); free_cores = copy_core_array(avail_cores); if (exc_core_bitmap && !is_cons_tres) { int exc_core_size = bit_size(exc_core_bitmap); int free_core_size = bit_size(*free_cores); if (exc_core_size != free_core_size) { /* This would indicate that cores were added to or * removed from nodes in this reservation when the * slurmctld daemon restarted with a new slurm.conf * file. This can result in cores being lost from a * reservation. */ error("Bad core_bitmap size for reservation %s " "(%d != %d), ignoring core reservation", job_ptr->resv_name, exc_core_size, free_core_size); exc_cores = NULL; /* Clear local value */ } } if (exc_cores) { #if _DEBUG core_array_log("exclude reserved cores", NULL, exc_cores); #endif core_array_and_not(free_cores, exc_cores); } /* remove all existing allocations from free_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; core_array_and_not(free_cores, p_ptr->row[i].row_bitmap); if (p_ptr->part_ptr != job_ptr->part_ptr) continue; if (part_core_map) { core_array_or(part_core_map, p_ptr->row[i].row_bitmap); } else { part_core_map = copy_core_array( p_ptr->row[i].row_bitmap); } } } if (job_ptr->details->whole_node == 1) _block_whole_nodes(node_bitmap, avail_cores, free_cores); avail_res_array = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap, free_cores, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); if (avail_res_array && job_ptr->best_switch) { /* job fits! We're done. */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 1 pass - idle resources found", plugin_type, __func__); } goto alloc_job; } _free_avail_res_array(avail_res_array); avail_res_array = NULL; if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) { /* * This job CANNOT share CPUs regardless of priority, * so we fail here. Note that Shared=EXCLUSIVE was already * addressed in _verify_node_state() and * job preemption removes jobs from simulated resource * allocation map before this point. */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 1 fail - no idle resources available", plugin_type, __func__); } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 1 fail - not enough idle resources", plugin_type, __func__); } /*** Step 2 ***/ for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) { if (jp_ptr->part_ptr == job_ptr->part_ptr) break; } if (!jp_ptr) { error("%s %s: could not find partition for %pJ", plugin_type, __func__, job_ptr); goto alloc_job; } bit_copybits(node_bitmap, orig_node_map); free_core_array(&free_cores); free_cores = copy_core_array(avail_cores); if (exc_cores) core_array_and_not(free_cores, exc_cores); if (preempt_by_part) { /* * Remove from avail_cores resources allocated to jobs which * this job can not preempt */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: looking for higher-priority or " "PREEMPT_MODE_OFF part's to remove from avail_cores", plugin_type, __func__); } for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if ((p_ptr->part_ptr->priority_tier <= jp_ptr->part_ptr->priority_tier) && (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF)) { if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: continuing on part: %s", plugin_type, __func__, p_ptr->part_ptr->name); } continue; } if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; core_array_and_not(free_cores, p_ptr->row[i].row_bitmap); } } } if (job_ptr->details->whole_node == 1) _block_whole_nodes(node_bitmap, avail_cores, free_cores); /* make these changes permanent */ free_core_array(&avail_cores); avail_cores = copy_core_array(free_cores); avail_res_array = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap, free_cores, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); if (!avail_res_array) { /* * job needs resources that are currently in use by * higher-priority jobs, so fail for now */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 2 fail - resources busy with higher priority jobs", plugin_type, __func__); } goto alloc_job; } _free_avail_res_array(avail_res_array); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 2 pass - available resources for this priority", plugin_type, __func__); } /*** Step 3 ***/ bit_copybits(node_bitmap, orig_node_map); free_core_array(&free_cores); free_cores = copy_core_array(avail_cores); /* * remove existing allocations (jobs) from same-priority partitions * from avail_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->part_ptr->priority_tier != jp_ptr->part_ptr->priority_tier) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; core_array_and_not(free_cores, p_ptr->row[i].row_bitmap); } } if (job_ptr->details->whole_node == 1) _block_whole_nodes(node_bitmap, avail_cores, free_cores); free_cores_tmp = copy_core_array(free_cores); node_bitmap_tmp = bit_copy(node_bitmap); avail_res_array = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap, free_cores, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); if (avail_res_array) { /* * To the extent possible, remove from consideration resources * which are allocated to jobs in lower priority partitions. */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 3 pass - found resources", plugin_type, __func__); } for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->part_ptr->priority_tier >= jp_ptr->part_ptr->priority_tier) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; core_array_and_not(free_cores_tmp, p_ptr->row[i].row_bitmap); } if (job_ptr->details->whole_node == 1) { _block_whole_nodes(node_bitmap_tmp, avail_cores, free_cores_tmp); } free_cores_tmp2 = copy_core_array(free_cores_tmp); node_bitmap_tmp2 = bit_copy(node_bitmap_tmp); avail_res_array_tmp = _select_nodes( job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap_tmp, free_cores_tmp, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); if (!avail_res_array_tmp) { free_core_array(&free_cores_tmp2); FREE_NULL_BITMAP(node_bitmap_tmp2); break; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: remove low-priority partition %s", plugin_type, __func__, p_ptr->part_ptr->name); } free_core_array(&free_cores); free_cores = free_cores_tmp; free_cores_tmp = free_cores_tmp2; free_cores_tmp2 = NULL; bit_copybits(node_bitmap, node_bitmap_tmp); FREE_NULL_BITMAP(node_bitmap_tmp); node_bitmap_tmp = node_bitmap_tmp2; node_bitmap_tmp2 = NULL; _free_avail_res_array(avail_res_array); avail_res_array = avail_res_array_tmp; } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 3 fail - not enough idle resources in same priority", plugin_type, __func__); } /*** Step 4 ***/ /* * try to fit the job into an existing row * * free_cores = core_bitmap to be built * avail_cores = static core_bitmap of all available cores */ if (!jp_ptr || !jp_ptr->row) { /* * there's no existing jobs in this partition, so place * the job in avail_cores. FIXME: still need a good * placement algorithm here that optimizes "job overlap" * between this job (in these idle nodes) and existing * jobs in the other partitions with <= priority to * this partition */ free_core_array(&free_cores); free_cores = copy_core_array(avail_cores); bit_copybits(node_bitmap, orig_node_map); avail_res_array = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap, free_cores, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); if (avail_res_array && (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)) { info("%s: %s: test 4 pass - first row found", plugin_type, __func__); } goto alloc_job; } if ((jp_ptr->num_rows > 1) && !preempt_by_qos) part_data_sort_res(jp_ptr); /* Preserve row order for QOS */ c = jp_ptr->num_rows; if (preempt_by_qos && !qos_preemptor) c--; /* Do not use extra row */ if (preempt_by_qos && (job_node_req != NODE_CR_AVAILABLE)) c = 1; for (i = 0; i < c; i++) { if (!jp_ptr->row[i].row_bitmap) break; free_core_array(&free_cores); free_cores = copy_core_array(avail_cores); core_array_and_not(free_cores, jp_ptr->row[i].row_bitmap); bit_copybits(node_bitmap, orig_node_map); if (job_ptr->details->whole_node == 1) _block_whole_nodes(node_bitmap, avail_cores,free_cores); avail_res_array = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap, free_cores, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); if (avail_res_array) { if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 4 pass - row %i", plugin_type, __func__, i); } break; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 4 fail - row %i", plugin_type, __func__, i); } } if ((i < c) && !jp_ptr->row[i].row_bitmap) { /* we've found an empty row, so use it */ free_core_array(&free_cores); free_cores = copy_core_array(avail_cores); bit_copybits(node_bitmap, orig_node_map); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 4 trying empty row %i", plugin_type, __func__, i); } avail_res_array = _select_nodes(job_ptr, min_nodes, max_nodes, req_nodes, node_bitmap, free_cores, node_usage, cr_type, test_only, will_run, part_core_map, prefer_alloc_nodes, tres_mc_ptr); } if (!avail_res_array) { /* job can't fit into any row, so exit */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: test 4 fail - busy partition", plugin_type, __func__); } goto alloc_job; } /* *** CONSTRUCTION ZONE FOR STEPs 5 AND 6 *** * Note that while the job may have fit into a row, it should * still be run through a good placement algorithm here that * optimizes "job overlap" between this job (in these idle nodes) * and existing jobs in the other partitions with <= priority to * this partition */ alloc_job: /* * at this point we've found a good set of nodes and cores for the job: * - node_bitmap is the set of nodes to allocate * - free_cores is the set of allocated cores * - avail_res_array identifies cores and GRES * * Next steps are to cleanup the worker variables, * create the job_resources struct, * distribute the job on the bits, and exit */ FREE_NULL_BITMAP(orig_node_map); free_core_array(&part_core_map); free_core_array(&free_cores_tmp); FREE_NULL_BITMAP(node_bitmap_tmp); if (!avail_res_array || !job_ptr->best_switch) { /* we were sent here to cleanup and exit */ xfree(tres_mc_ptr); free_core_array(&avail_cores); free_core_array(&free_cores); _free_avail_res_array(avail_res_array); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: exiting with no allocation", plugin_type, __func__); } return SLURM_ERROR; } if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL)) error_code = EINVAL; if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN)) { /* * Set a reasonable value for the number of allocated CPUs. * Without computing task distribution this is only a guess */ job_ptr->total_cpus = MAX(job_ptr->details->min_cpus, job_ptr->details->min_nodes); } /* * Defer checking select mode until we get a correct CPU count. Then * exit if select mode is not SELECT_MODE_RUN_NOW, making sure to free * job_ptr->job_resrcs. */ if (error_code != SLURM_SUCCESS) { xfree(tres_mc_ptr); free_core_array(&avail_cores); free_core_array(&free_cores); _free_avail_res_array(avail_res_array); return error_code; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: distributing %pJ", plugin_type, __func__, job_ptr); } /** create the struct_job_res **/ n = bit_set_count(node_bitmap); cpu_count = xmalloc(sizeof(uint16_t) * n); i_first = bit_ffs(node_bitmap); if (i_first != -1) i_last = bit_fls(node_bitmap); else i_last = -2; for (i = i_first, j = 0; i <= i_last; i++) { if (bit_test(node_bitmap, i) && avail_res_array[i]) cpu_count[j++] = avail_res_array[i]->avail_cpus; } if (j != n) { error("%s: %s: problem building cpu_count array (%d != %d)", plugin_type, __func__, j, n); } job_res = create_job_resources(); job_res->node_bitmap = bit_copy(node_bitmap); job_res->nodes = bitmap2node_name(node_bitmap); job_res->nhosts = n; job_res->ncpus = job_res->nhosts; if (job_ptr->details->ntasks_per_node) job_res->ncpus *= details_ptr->ntasks_per_node; job_res->ncpus = MAX(job_res->ncpus, details_ptr->min_cpus); job_res->ncpus = MAX(job_res->ncpus, (job_res->nhosts * details_ptr->pn_min_cpus)); if (job_ptr->details->mc_ptr) sockets_per_node = job_ptr->details->mc_ptr->sockets_per_node; i = gres_plugin_job_min_cpus(job_res->nhosts, sockets_per_node, job_ptr->details->num_tasks, job_ptr->gres_list); job_res->ncpus = MAX(job_res->ncpus, i); job_res->node_req = job_node_req; job_res->cpus = cpu_count; /* Per node CPU counts */ job_res->cpus_used = xmalloc(job_res->nhosts * sizeof(uint16_t)); job_res->memory_allocated = xmalloc(job_res->nhosts * sizeof(uint64_t)); job_res->memory_used = xmalloc(job_res->nhosts * sizeof(uint64_t)); job_res->whole_node = job_ptr->details->whole_node; /* store the hardware data for the selected nodes */ error_code = build_job_resources(job_res, node_record_table_ptr); if (error_code != SLURM_SUCCESS) { xfree(tres_mc_ptr); _free_avail_res_array(avail_res_array); free_job_resources(&job_res); free_core_array(&avail_cores); free_core_array(&free_cores); return error_code; } /* total up all CPUs and load the core_bitmap */ total_cpus = 0; c = 0; if (job_res->core_bitmap) c_size = bit_size(job_res->core_bitmap); else c_size = 0; i_first = bit_ffs(node_bitmap); for (i = 0, n = i_first; n < select_node_cnt; n++) { int first_core, last_core; bitstr_t *use_free_cores = NULL; if (!bit_test(node_bitmap, n)) continue; if (is_cons_tres) { first_core = 0; last_core = select_node_record[n].tot_cores; use_free_cores = free_cores[n]; } else { first_core = cr_get_coremap_offset(n); last_core = cr_get_coremap_offset(n + 1); use_free_cores = *free_cores; } for (j = first_core; j < last_core; j++, c++) { if (!bit_test(use_free_cores, j)) continue; if (c >= c_size) { error("%s: %s core_bitmap index error on node %s (NODE_INX:%d, C_SIZE:%u)", plugin_type, __func__, select_node_record[n].node_ptr->name, n, c_size); drain_nodes(select_node_record[n].node_ptr->name, "Bad core count", getuid()); _free_avail_res_array(avail_res_array); free_job_resources(&job_res); free_core_array(&free_cores); return SLURM_ERROR; } bit_set(job_res->core_bitmap, c); c_alloc++; } total_cpus += job_res->cpus[i]; i++; } /* * When 'srun --overcommit' is used, ncpus is set to a minimum value * in order to allocate the appropriate number of nodes based on the * job request. * For cons_tres, all available logical processors will be allocated on * each allocated node in order to accommodate the overcommit request. */ if (details_ptr->overcommit && details_ptr->num_tasks) job_res->ncpus = MIN(total_cpus, details_ptr->num_tasks); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %s: %pJ ncpus %u cbits %u/%u nbits %u", plugin_type, __func__, job_ptr, job_res->ncpus, count_core_array_set(free_cores), c_alloc, job_res->nhosts); } free_core_array(&free_cores); /* distribute the tasks, clear unused cores from job_res->core_bitmap */ job_ptr->job_resrcs = job_res; i_first = bit_ffs(job_res->node_bitmap); if (i_first != -1) i_last = bit_fls(job_res->node_bitmap); else i_last = -2; if (is_cons_tres && job_ptr->gres_list && (error_code == SLURM_SUCCESS)) { node_record_t *node_ptr; bool have_gres_per_task, task_limit_set = false; /* * Determine if any job gres_per_task specification here * to avoid calling gres_plugin_get_task_limit unless needed */ have_gres_per_task = gres_plugin_job_tres_per_task( job_ptr->gres_list); if (have_gres_per_task) { gres_task_limit = xcalloc(job_res->nhosts, sizeof(uint32_t)); } node_gres_list = xcalloc(job_res->nhosts, sizeof(List)); sock_gres_list = xcalloc(job_res->nhosts, sizeof(List)); for (i = i_first, j = 0; i <= i_last; i++) { if (!bit_test(job_res->node_bitmap, i)) continue; if (have_gres_per_task) { gres_task_limit[j] = gres_plugin_get_task_limit( avail_res_array[i]-> sock_gres_list); if (gres_task_limit[j] != NO_VAL) task_limit_set = true; } node_ptr = node_record_table_ptr + i; node_gres_list[j] = node_ptr->gres_list; sock_gres_list[j] = avail_res_array[i]->sock_gres_list; j++; } if (!task_limit_set) xfree(gres_task_limit); } error_code = dist_tasks(job_ptr, cr_type, preempt_mode, avail_cores, gres_task_limit); if (is_cons_tres && job_ptr->gres_list && (error_code == SLURM_SUCCESS)) { error_code = gres_plugin_job_core_filter4( sock_gres_list, job_ptr->job_id, job_res, job_ptr->details->overcommit, tres_mc_ptr, node_record_table_ptr); } xfree(gres_task_limit); xfree(node_gres_list); xfree(sock_gres_list); xfree(tres_mc_ptr); _free_avail_res_array(avail_res_array); free_core_array(&avail_cores); if (error_code != SLURM_SUCCESS) { free_job_resources(&job_ptr->job_resrcs); return error_code; } /* translate job_res->cpus array into format with repitition count */ build_cnt = build_job_resources_cpu_array(job_res); if (job_ptr->details->whole_node == 1) { job_ptr->total_cpus = 0; for (i = i_first; i <= i_last; i++) { if (!bit_test(job_res->node_bitmap, i)) continue; /* * This could make the job_res->cpus incorrect. * Don't use job_res->cpus when allocating * whole nodes as the job is finishing to * subtract from the total cpu count or you * will get an incorrect count. */ job_ptr->total_cpus += select_node_record[i].cpus; } } else if (cr_type & CR_SOCKET) { int ci = 0; int s, last_s, sock_cnt = 0; job_ptr->total_cpus = 0; for (i = i_first; i <= i_last; i++) { if (!bit_test(job_res->node_bitmap, i)) continue; sock_cnt = 0; for (s = 0; s < select_node_record[i].tot_sockets; s++){ last_s = -1; for (c = 0; ccore_bitmap, ci)){ if (s != last_s) { sock_cnt++; last_s = s; } } ci++; } } job_ptr->total_cpus += (sock_cnt * select_node_record[i].cores * select_node_record[i].vpus); } } else if (build_cnt >= 0) job_ptr->total_cpus = build_cnt; else job_ptr->total_cpus = total_cpus; /* best guess */ /* * Stop if we aren't trying to start the job right now. We needed to * get to here to have an accurate total_cpus so that accounting limits * checks are accurate later on. */ if (mode != SELECT_MODE_RUN_NOW) { free_job_resources(&job_ptr->job_resrcs); return error_code; } if (!(cr_type & CR_MEMORY)) return error_code; if (is_cons_tres && !(job_ptr->bit_flags & JOB_MEM_SET) && gres_plugin_job_mem_set(job_ptr->gres_list, job_res)) { debug("%pJ memory set via GRES limit", job_ptr); } else { /* load memory allocated array */ save_mem = details_ptr->pn_min_memory; for (i = i_first, j = 0; i <= i_last; i++) { if (!bit_test(job_res->node_bitmap, i)) continue; nodename = select_node_record[i].node_ptr->name; avail_mem = select_node_record[i].real_memory - select_node_record[i].mem_spec_limit; if (save_mem & MEM_PER_CPU) { /* Memory per CPU */ needed_mem = job_res->cpus[j] * (save_mem & (~MEM_PER_CPU)); } else if (save_mem) { /* Memory per node */ needed_mem = save_mem; } else { /* Allocate all node memory */ needed_mem = avail_mem; if (!test_only && (node_usage[i].alloc_memory > 0)) { if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("%s: node %s has already alloc_memory=%"PRIu64". %pJ can't allocate all node memory", __func__, nodename, node_usage[i].alloc_memory, job_ptr); error_code = SLURM_ERROR; break; } if ((j == 0) || (lowest_mem > avail_mem)) lowest_mem = avail_mem; } if (!test_only && save_mem) { if (node_usage[i].alloc_memory > avail_mem) { error("%s: node %s memory is already overallocated (%"PRIu64" > %"PRIu64"). %pJ can't allocate any node memory", __func__, nodename, node_usage[i].alloc_memory, avail_mem, job_ptr); error_code = SLURM_ERROR; break; } avail_mem -= node_usage[i].alloc_memory; } if (needed_mem > avail_mem) { if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("%s: %pJ would overallocate node %s memory (%"PRIu64" > %"PRIu64")", __func__, job_ptr, nodename, needed_mem, avail_mem); } error_code = SLURM_ERROR; break; } job_res->memory_allocated[j] = needed_mem; j++; } if ((error_code != SLURM_ERROR) && (save_mem == 0)) details_ptr->pn_min_memory = lowest_mem; } if (error_code == SLURM_ERROR) free_job_resources(&job_ptr->job_resrcs); return error_code; } /* Determine if a job can ever run */ static int _test_only(job_record_t *job_ptr, bitstr_t *node_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t job_node_req) { int rc; uint16_t tmp_cr_type = cr_type; if (job_ptr->part_ptr->cr_type) { if ((cr_type & CR_SOCKET) || (cr_type & CR_CORE)) { tmp_cr_type &= ~(CR_SOCKET | CR_CORE | CR_MEMORY); tmp_cr_type |= job_ptr->part_ptr->cr_type; } else { info("%s: Can't use Partition SelectType unless " "using CR_Socket or CR_Core", plugin_type); } } rc = _job_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_TEST_ONLY, tmp_cr_type, job_node_req, select_part_record, select_node_usage, NULL, false, false, false); return rc; } static int _wrapper_get_usable_nodes(void *x, void *arg) { job_record_t *job_ptr = (job_record_t *)x; wrapper_rm_job_args_t *wargs = (wrapper_rm_job_args_t *)arg; if ((!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) return 0; wargs->rc += bit_overlap(wargs->node_map, job_ptr->node_bitmap); return 0; } static int _get_usable_nodes(bitstr_t *node_map, job_record_t *job_ptr) { wrapper_rm_job_args_t wargs = { .node_map = node_map }; if (!job_ptr->het_job_list) (void)_wrapper_get_usable_nodes(job_ptr, &wargs); else (void)list_for_each_nobreak(job_ptr->het_job_list, _wrapper_get_usable_nodes, &wargs); return wargs.rc; } static int _wrapper_job_res_rm_job(void *x, void *arg) { job_record_t *job_ptr = (job_record_t *)x; wrapper_rm_job_args_t *wargs = (wrapper_rm_job_args_t *)arg; (void)job_res_rm_job(wargs->part_record_ptr, wargs->node_usage, job_ptr, wargs->action, wargs->job_fini, wargs->node_map); /* * We might not had overlapped the main hetjob component partition, but * we might need these nodes. */ bit_or(wargs->node_map, job_ptr->node_bitmap); return 0; } static int _job_res_rm_job(part_res_record_t *part_record_ptr, node_use_record_t *node_usage, job_record_t *job_ptr, int action, bool job_fini, bitstr_t *node_map) { wrapper_rm_job_args_t wargs = { .action = action, .job_fini = job_fini, .node_usage = node_usage, .part_record_ptr = part_record_ptr, .node_map = node_map }; if (!job_overlap_and_running(node_map, job_ptr)) return 1; if (!job_ptr->het_job_list) (void)_wrapper_job_res_rm_job(job_ptr, &wargs); else (void)list_for_each(job_ptr->het_job_list, _wrapper_job_res_rm_job, &wargs); return 0; } /* * Determine where and when the job at job_ptr can begin execution by updating * a scratch cr_record structure to reflect each job terminating at the * end of its time limit and use this to show where and when the job at job_ptr * will begin execution. Used by Slurm's sched/backfill plugin. */ static int _will_run_test(job_record_t *job_ptr, bitstr_t *node_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t job_node_req, List preemptee_candidates, List *preemptee_job_list, bitstr_t **exc_core_bitmap) { part_res_record_t *future_part; node_use_record_t *future_usage; job_record_t *tmp_job_ptr; List cr_job_list; ListIterator job_iterator, preemptee_iterator; bitstr_t *orig_map; int action, rc = SLURM_ERROR; time_t now = time(NULL); uint16_t tmp_cr_type = cr_type; bool qos_preemptor = false; orig_map = bit_copy(node_bitmap); if (job_ptr->part_ptr->cr_type) { if ((cr_type & CR_SOCKET) || (cr_type & CR_CORE)) { tmp_cr_type &= ~(CR_SOCKET | CR_CORE | CR_MEMORY); tmp_cr_type |= job_ptr->part_ptr->cr_type; } else { info("%s: Can't use Partition SelectType unless " "using CR_Socket or CR_Core", plugin_type); } } /* Try to run with currently available nodes */ rc = _job_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, tmp_cr_type, job_node_req, select_part_record, select_node_usage, exc_core_bitmap, false, false, false); if (rc == SLURM_SUCCESS) { FREE_NULL_BITMAP(orig_map); job_ptr->start_time = now; return SLURM_SUCCESS; } /* * Job is still pending. Simulate termination of jobs one at a time * to determine when and where the job can start. */ future_part = part_data_dup_res(select_part_record, orig_map); if (future_part == NULL) { FREE_NULL_BITMAP(orig_map); return SLURM_ERROR; } future_usage = node_data_dup_use(select_node_usage, orig_map); if (future_usage == NULL) { part_data_destroy_res(future_part); FREE_NULL_BITMAP(orig_map); return SLURM_ERROR; } /* Build list of running and suspended jobs */ cr_job_list = list_create(NULL); job_iterator = list_iterator_create(job_list); while ((tmp_job_ptr = list_next(job_iterator))) { if (!IS_JOB_RUNNING(tmp_job_ptr) && !IS_JOB_SUSPENDED(tmp_job_ptr)) continue; if (tmp_job_ptr->end_time == 0) { error("%s: %s: Active %pJ has zero end_time", plugin_type, __func__, tmp_job_ptr); continue; } if (tmp_job_ptr->node_bitmap == NULL) { /* * This should indicate a requeued job was cancelled * while NHC was running */ error("%s: %s: %pJ has NULL node_bitmap", plugin_type, __func__, tmp_job_ptr); continue; } if (!_is_preemptable(tmp_job_ptr, preemptee_candidates)) { /* Queue job for later removal from data structures */ list_append(cr_job_list, tmp_job_ptr); } else { uint16_t mode = slurm_job_preempt_mode(tmp_job_ptr); if (mode == PREEMPT_MODE_OFF) continue; if (mode == PREEMPT_MODE_SUSPEND) { action = 2; /* remove cores, keep memory */ if (preempt_by_qos) qos_preemptor = true; } else action = 0; /* remove cores and memory */ /* Remove preemptable job now */ _job_res_rm_job(future_part, future_usage, tmp_job_ptr, action, false, orig_map); } } list_iterator_destroy(job_iterator); /* Test with all preemptable jobs gone */ if (preemptee_candidates) { bit_or(node_bitmap, orig_map); rc = _job_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, tmp_cr_type, job_node_req, future_part, future_usage, exc_core_bitmap, false, qos_preemptor, true); if (rc == SLURM_SUCCESS) { /* * Actual start time will actually be later than "now", * but return "now" for backfill scheduler to * initiate preemption. */ job_ptr->start_time = now; } } /* * Remove the running jobs from exp_node_cr and try scheduling the * pending job after each one (or a few jobs that end close in time). */ if ((rc != SLURM_SUCCESS) && ((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) { int time_window = 30; time_t end_time = 0; bool more_jobs = true; DEF_TIMERS; list_sort(cr_job_list, _cr_job_list_sort); START_TIMER; job_iterator = list_iterator_create(cr_job_list); while (more_jobs) { job_record_t *last_job_ptr = NULL; job_record_t *next_job_ptr = NULL; int overlap, rm_job_cnt = 0; while (true) { tmp_job_ptr = list_next(job_iterator); if (!tmp_job_ptr) { more_jobs = false; break; } bit_or(node_bitmap, orig_map); overlap = bit_overlap(node_bitmap, tmp_job_ptr->node_bitmap); if (overlap == 0) /* job has no usable nodes */ continue; /* skip it */ debug2("%s: %s, %pJ: overlap=%d", plugin_type, __func__, tmp_job_ptr, overlap); if (!end_time) { time_t delta = 0; /* * align all time windows on a * time_window barrier from the original * first job evaluated, this prevents * data in the running set from skewing * changing the results between * scheduling evaluations */ delta = tmp_job_ptr->end_time % time_window; end_time = tmp_job_ptr->end_time + (time_window - delta); } last_job_ptr = tmp_job_ptr; (void) job_res_rm_job( future_part, future_usage, tmp_job_ptr, 0, false, orig_map); next_job_ptr = list_peek_next(job_iterator); if (!next_job_ptr) { more_jobs = false; break; } else if (next_job_ptr->end_time > (end_time + time_window)) { break; } if (rm_job_cnt++ > 200) goto timer_check; } if (!last_job_ptr) /* Should never happen */ break; do { if (bf_window_scale) time_window += bf_window_scale; else time_window *= 2; } while (next_job_ptr && next_job_ptr->end_time > (end_time + time_window)); rc = _job_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, tmp_cr_type, job_node_req, future_part, future_usage, exc_core_bitmap, backfill_busy_nodes, qos_preemptor, true); if (rc == SLURM_SUCCESS) { if (last_job_ptr->end_time <= now) { job_ptr->start_time = _guess_job_end(last_job_ptr, now); } else { job_ptr->start_time = last_job_ptr->end_time; } break; } timer_check: END_TIMER; if (DELTA_TIMER >= 2000000) break; /* Quit after 2 seconds wall time */ } list_iterator_destroy(job_iterator); } if ((rc == SLURM_SUCCESS) && preemptee_job_list && preemptee_candidates) { /* * Build list of preemptee jobs whose resources are * actually used. List returned even if not killed * in selected plugin, but by Moab or something else. */ if (*preemptee_job_list == NULL) { *preemptee_job_list = list_create(NULL); } preemptee_iterator =list_iterator_create(preemptee_candidates); while ((tmp_job_ptr = list_next(preemptee_iterator))) { if (!bit_overlap_any(node_bitmap, tmp_job_ptr->node_bitmap)) continue; list_append(*preemptee_job_list, tmp_job_ptr); } list_iterator_destroy(preemptee_iterator); } FREE_NULL_LIST(cr_job_list); part_data_destroy_res(future_part); node_data_destroy(future_usage, NULL); FREE_NULL_BITMAP(orig_map); return rc; } /* Allocate resources for a job now, if possible */ static int _run_now(job_record_t *job_ptr, bitstr_t *node_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t job_node_req, List preemptee_candidates, List *preemptee_job_list, bitstr_t **exc_cores) { int rc; bitstr_t *orig_node_map = NULL, *save_node_map; job_record_t *tmp_job_ptr = NULL; ListIterator job_iterator, preemptee_iterator; part_res_record_t *future_part; node_use_record_t *future_usage; bool remove_some_jobs = false; uint16_t pass_count = 0; uint16_t mode = NO_VAL16; uint16_t tmp_cr_type = cr_type; bool preempt_mode = false; save_node_map = bit_copy(node_bitmap); top: orig_node_map = bit_copy(save_node_map); if (job_ptr->part_ptr->cr_type) { if ((cr_type & CR_SOCKET) || (cr_type & CR_CORE)) { tmp_cr_type &= ~(CR_SOCKET | CR_CORE | CR_MEMORY); tmp_cr_type |= job_ptr->part_ptr->cr_type; } else { info("%s: Can't use Partition SelectType unless " "using CR_Socket or CR_Core", plugin_type); } } rc = _job_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_RUN_NOW, tmp_cr_type, job_node_req, select_part_record, select_node_usage, exc_cores, false, false, preempt_mode); if ((rc != SLURM_SUCCESS) && preemptee_candidates && preempt_by_qos) { /* Determine QOS preempt mode of first job */ job_iterator = list_iterator_create(preemptee_candidates); if ((tmp_job_ptr = list_next(job_iterator))) { mode = slurm_job_preempt_mode(tmp_job_ptr); } list_iterator_destroy(job_iterator); } if ((rc != SLURM_SUCCESS) && preemptee_candidates && preempt_by_qos && (mode == PREEMPT_MODE_SUSPEND) && (job_ptr->priority != 0)) { /* Job can be held by bad allocate */ /* Try to schedule job using extra row of core bitmap */ bit_or(node_bitmap, orig_node_map); rc = _job_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_RUN_NOW, tmp_cr_type, job_node_req, select_part_record, select_node_usage, exc_cores, false, true, preempt_mode); } else if ((rc != SLURM_SUCCESS) && preemptee_candidates) { int preemptee_cand_cnt = list_count(preemptee_candidates); /* Remove preemptable jobs from simulated environment */ preempt_mode = true; future_part = part_data_dup_res(select_part_record, orig_node_map); if (future_part == NULL) { FREE_NULL_BITMAP(orig_node_map); FREE_NULL_BITMAP(save_node_map); return SLURM_ERROR; } future_usage = node_data_dup_use(select_node_usage, orig_node_map); if (future_usage == NULL) { part_data_destroy_res(future_part); FREE_NULL_BITMAP(orig_node_map); FREE_NULL_BITMAP(save_node_map); return SLURM_ERROR; } job_iterator = list_iterator_create(preemptee_candidates); while ((tmp_job_ptr = list_next(job_iterator))) { mode = slurm_job_preempt_mode(tmp_job_ptr); if ((mode != PREEMPT_MODE_REQUEUE) && (mode != PREEMPT_MODE_CANCEL)) continue; /* can't remove job */ /* Remove preemptable job now */ if(_job_res_rm_job(future_part, future_usage, tmp_job_ptr, 0, false, orig_node_map)) continue; bit_or(node_bitmap, orig_node_map); rc = _job_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, tmp_cr_type, job_node_req, future_part, future_usage, exc_cores, false, false, preempt_mode); tmp_job_ptr->details->usable_nodes = 0; if (rc != SLURM_SUCCESS) continue; if ((pass_count++ > preempt_reorder_cnt) || (preemptee_cand_cnt <= pass_count)) { /* * Ignore remaining jobs, but keep in the list * since the code can get called multiple times * for different node/feature sets -- * _get_req_features(). */ while ((tmp_job_ptr = list_next(job_iterator))) { tmp_job_ptr->details->usable_nodes = 1; } break; } /* * Reorder preemption candidates to minimize number * of preempted jobs and their priorities. */ if (preempt_strict_order) { /* * Move last preempted job to top of preemption * candidate list, preserving order of other * jobs. */ tmp_job_ptr = list_remove(job_iterator); list_prepend(preemptee_candidates, tmp_job_ptr); } else { /* * Set the last job's usable count to a large * value and re-sort preempted jobs. usable_nodes * count set to zero above to eliminate values * previously set to 99999. Note: usable_count * is only used for sorting purposes. */ tmp_job_ptr->details->usable_nodes = 99999; list_iterator_reset(job_iterator); while ((tmp_job_ptr = list_next(job_iterator))) { if (tmp_job_ptr->details->usable_nodes == 99999) break; tmp_job_ptr->details->usable_nodes = _get_usable_nodes(node_bitmap, tmp_job_ptr); } while ((tmp_job_ptr = list_next(job_iterator))) { tmp_job_ptr->details->usable_nodes = 0; } list_sort(preemptee_candidates, (ListCmpF)_sort_usable_nodes_dec); } FREE_NULL_BITMAP(orig_node_map); list_iterator_destroy(job_iterator); part_data_destroy_res(future_part); node_data_destroy(future_usage, NULL); goto top; } list_iterator_destroy(job_iterator); if ((rc == SLURM_SUCCESS) && preemptee_job_list && preemptee_candidates) { /* * Build list of preemptee jobs whose resources are * actually used */ if (*preemptee_job_list == NULL) { *preemptee_job_list = list_create(NULL); } preemptee_iterator = list_iterator_create( preemptee_candidates); while ((tmp_job_ptr = list_next(preemptee_iterator))) { mode = slurm_job_preempt_mode(tmp_job_ptr); if ((mode != PREEMPT_MODE_REQUEUE) && (mode != PREEMPT_MODE_CANCEL)) continue; if (!job_overlap_and_running( node_bitmap, tmp_job_ptr)) continue; if (tmp_job_ptr->details->usable_nodes) break; list_append(*preemptee_job_list, tmp_job_ptr); remove_some_jobs = true; } list_iterator_destroy(preemptee_iterator); if (!remove_some_jobs) { FREE_NULL_LIST(*preemptee_job_list); } } part_data_destroy_res(future_part); node_data_destroy(future_usage, NULL); } FREE_NULL_BITMAP(orig_node_map); FREE_NULL_BITMAP(save_node_map); return rc; } /* * common_job_test - Given a specification of scheduling requirements, * identify the nodes which "best" satisfy the request. * "best" is defined as either a minimal number of consecutive nodes * or if sharing resources then sharing them with a job of similar size. * IN/OUT job_ptr - pointer to job being considered for initiation, * set's start_time when job expected to start * IN/OUT bitmap - usable nodes are set on input, nodes not required to * satisfy the request are cleared, other left set * IN min_nodes - minimum count of nodes * IN req_nodes - requested (or desired) count of nodes * IN max_nodes - maximum count of nodes (0==don't care) * IN mode - SELECT_MODE_RUN_NOW (0): try to schedule job now * SELECT_MODE_TEST_ONLY (1): test if job can ever run * SELECT_MODE_WILL_RUN (2): determine when and where job can run * IN preemptee_candidates - List of pointers to jobs which can be preempted. * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the * jobs to be preempted to initiate the pending job. Not set * if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL. * IN exc_cores - Cores to be excluded for use (in advanced reservation) * RET zero on success, EINVAL otherwise * globals (passed via select_p_node_init): * node_record_count - count of nodes configured * node_record_table_ptr - pointer to global node table * NOTE: the job information that is considered for scheduling includes: * req_node_bitmap: bitmap of specific nodes required by the job * contiguous: allocated nodes must be sequentially located * num_cpus: minimum number of processors required by the job * NOTE: bitmap must be a superset of req_nodes at the time that * select_p_job_test is called */ extern int common_job_test(job_record_t *job_ptr, bitstr_t *node_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t mode, List preemptee_candidates, List *preemptee_job_list, bitstr_t **exc_cores) { int rc = EINVAL; uint16_t job_node_req; if (!(slurmctld_conf.conf_flags & CTL_CONF_ASRU)) job_ptr->details->core_spec = NO_VAL16; if ((job_ptr->details->core_spec != NO_VAL16) && (job_ptr->details->whole_node != 1)) { info("%s: %s: Setting Exclusive mode for %pJ with CoreSpec=%u", plugin_type, __func__, job_ptr, job_ptr->details->core_spec); job_ptr->details->whole_node = 1; } if (!job_ptr->details->mc_ptr) job_ptr->details->mc_ptr = _create_default_mc(); job_node_req = _get_job_node_req(job_ptr); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { char *node_mode = "Unknown", *alloc_mode = "Unknown"; if (job_node_req == NODE_CR_RESERVED) node_mode = "Exclusive"; else if (job_node_req == NODE_CR_AVAILABLE) node_mode = "OverCommit"; else if (job_node_req == NODE_CR_ONE_ROW) node_mode = "Normal"; if (mode == SELECT_MODE_WILL_RUN) alloc_mode = "Will_Run"; else if (mode == SELECT_MODE_TEST_ONLY) alloc_mode = "Test_Only"; else if (mode == SELECT_MODE_RUN_NOW) alloc_mode = "Run_Now"; info("%s: %s: %pJ node_mode:%s alloc_mode:%s", plugin_type, __func__, job_ptr, node_mode, alloc_mode); core_array_log("node_list & exc_cores", node_bitmap, exc_cores); info("%s: %s: nodes: min:%u max:%u requested:%u avail:%u", plugin_type, __func__, min_nodes, max_nodes, req_nodes, bit_set_count(node_bitmap)); node_data_dump(); } if (mode == SELECT_MODE_WILL_RUN) { rc = _will_run_test(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, job_node_req, preemptee_candidates, preemptee_job_list, exc_cores); } else if (mode == SELECT_MODE_TEST_ONLY) { rc = _test_only(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, job_node_req); } else if (mode == SELECT_MODE_RUN_NOW) { rc = _run_now(job_ptr, node_bitmap, min_nodes, max_nodes, req_nodes, job_node_req, preemptee_candidates, preemptee_job_list, exc_cores); } else { /* Should never get here */ error("%s: %s: Mode %d is invalid", plugin_type, __func__, mode); return EINVAL; } if ((select_debug_flags & DEBUG_FLAG_CPU_BIND) || (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)) { if (job_ptr->job_resrcs) { if (rc != SLURM_SUCCESS) { info("%s: %s: error:%s", plugin_type, __func__, slurm_strerror(rc)); } log_job_resources(job_ptr); if (is_cons_tres) gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id); } else { info("%s: %s: no job_resources info for %pJ rc=%d", plugin_type, __func__, job_ptr, rc); } } return rc; }