/*****************************************************************************\
 *  select_linear.c - node selection plugin for simple one-dimensional
 *  address space. Selects nodes for a job so as to minimize the number
 *  of sets of consecutive nodes using a best-fit algorithm.
 *****************************************************************************
 *  Copyright (C) 2004-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
 *  Portions Copyright (C) 2010 SchedMD <https://www.schedmd.com>.
 *  Copyright (C) 2014 Silicon Graphics International Corp. All rights reserved.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include "config.h"

#define _GNU_SOURCE

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/common/slurm_xlator.h"    /* Must be first */
#include "src/common/assoc_mgr.h"
#include "src/common/gres.h"
#include "src/common/job_resources.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_topology.h"
#include "src/common/slurm_resource_info.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/preempt.h"
#include "src/slurmctld/proc_req.h"
#include "src/plugins/select/linear/select_linear.h"

#define NO_SHARE_LIMIT  0xfffe
#define NODEINFO_MAGIC  0x82ad
#define RUN_JOB_INCR    16
#define SELECT_DEBUG    0

/* These are defined here so when we link with something other than
 * the slurmctld we will have these symbols defined. They will get
 * overwritten when linking with the slurmctld.
*/ #if defined (__APPLE__) extern slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import)); extern node_record_t *node_record_table_ptr __attribute__((weak_import)); extern List part_list __attribute__((weak_import)); extern List job_list __attribute__((weak_import)); extern int node_record_count __attribute__((weak_import)); extern time_t last_node_update __attribute__((weak_import)); extern switch_record_t *switch_record_table __attribute__((weak_import)); extern int switch_record_cnt __attribute__((weak_import)); extern slurmctld_config_t slurmctld_config __attribute__((weak_import)); extern int hypercube_dimensions __attribute__((weak_import)); extern struct hypercube_switch *hypercube_switch_table __attribute__((weak_import)); extern int hypercube_switch_cnt __attribute__((weak_import)); extern struct hypercube_switch ***hypercube_switches __attribute__((weak_import)); #else slurm_ctl_conf_t slurmctld_conf; node_record_t *node_record_table_ptr; List part_list; List job_list; int node_record_count; time_t last_node_update; switch_record_t *switch_record_table; int switch_record_cnt; slurmctld_config_t slurmctld_config; int hypercube_dimensions; struct hypercube_switch *hypercube_switch_table; int hypercube_switch_cnt; struct hypercube_switch ***hypercube_switches; #endif struct select_nodeinfo { uint16_t magic; /* magic number */ uint16_t alloc_cpus; uint64_t alloc_memory; char *tres_alloc_fmt_str; /* formatted str of allocated tres */ double tres_alloc_weighted; /* weighted number of tres allocated. */ }; static uint16_t priority_flags = 0; static int _add_job_to_nodes(struct cr_record *cr_ptr, job_record_t *job_ptr, char *pre_err, int suspended); static void _add_run_job(struct cr_record *cr_ptr, uint32_t job_id); static void _add_tot_job(struct cr_record *cr_ptr, uint32_t job_id); static void _build_select_struct(job_record_t *job_ptr, bitstr_t *bitmap); static int _cr_job_list_sort(void *x, void *y); static job_resources_t *_create_job_resources(int node_cnt); static int _decr_node_job_cnt(int node_inx, job_record_t *job_ptr, char *pre_err); static void _dump_node_cr(struct cr_record *cr_ptr); static struct cr_record *_dup_cr(struct cr_record *cr_ptr); static int _find_job_mate(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes); static void _free_cr(struct cr_record *cr_ptr); static int _get_avail_cpus(job_record_t *job_ptr, int index); static uint16_t _get_total_cpus(int index); static void _init_node_cr(void); static int _job_count_bitmap(struct cr_record *cr_ptr, job_record_t *job_ptr, bitstr_t * bitmap, bitstr_t * jobmap, int run_job_cnt, int tot_job_cnt, uint16_t mode); static int _job_expand(job_record_t *from_job_ptr, job_record_t *to_job_ptr); static int _job_test(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes); static int _job_test_dfly(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes); static int _job_test_hypercube(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes); static int _job_test_topo(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes); static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id); static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id); static int _rm_job_from_nodes(struct cr_record *cr_ptr, job_record_t *job_ptr, char *pre_err, bool remove_all, bool job_fini); static int 
_rm_job_from_one_node(job_record_t *job_ptr,
                                 node_record_t *node_ptr, char *pre_err);
static int _run_now(job_record_t *job_ptr, bitstr_t *bitmap,
                    uint32_t min_nodes, uint32_t max_nodes,
                    int max_share, uint32_t req_nodes,
                    List preemptee_candidates,
                    List *preemptee_job_list);
static int _sort_usable_nodes_dec(void *, void *);
static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id);
static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
static int _test_only(job_record_t *job_ptr, bitstr_t *bitmap,
                      uint32_t min_nodes, uint32_t max_nodes,
                      uint32_t req_nodes, int max_share);
static int _will_run_test(job_record_t *job_ptr, bitstr_t *bitmap,
                          uint32_t min_nodes, uint32_t max_nodes,
                          int max_share, uint32_t req_nodes,
                          List preemptee_candidates,
                          List *preemptee_job_list);

extern select_nodeinfo_t *select_p_select_nodeinfo_alloc();
extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo);

/*
 * These variables are required by the generic plugin interface. If they
 * are not found in the plugin, the plugin loader will ignore it.
 *
 * plugin_name - a string giving a human-readable description of the
 * plugin. There is no maximum length, but the symbol must refer to
 * a valid string.
 *
 * plugin_type - a string suggesting the type of the plugin or its
 * applicability to a particular form of data or method of data handling.
 * If the low-level plugin API is used, the contents of this string are
 * unimportant and may be anything. Slurm uses the higher-level plugin
 * interface which requires this string to be of the form
 *
 *      <application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "select" for Slurm node selection) and <method>
 * is a description of how this plugin satisfies that application. Slurm will
 * only load select plugins if the plugin_type string has a
 * prefix of "select/".
 *
 * plugin_version - an unsigned 32-bit integer containing the Slurm version
 * (major.minor.micro combined into a single number).
 */
const char plugin_name[] = "Linear node selection plugin";
const char plugin_type[] = "select/linear";
const uint32_t plugin_id = SELECT_PLUGIN_LINEAR;
const uint32_t plugin_version = SLURM_VERSION_NUMBER;

static node_record_t *select_node_ptr = NULL;
static int select_node_cnt = 0;
static uint16_t cr_type;
static bool have_dragonfly = false;
static bool topo_optional = false;

/* Record of resources consumed on each node including job details */
static struct cr_record *cr_ptr = NULL;
static pthread_mutex_t cr_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Add job id to record of jobs running on this node */
static void _add_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
    int i;

    if (cr_ptr->run_job_ids == NULL) {  /* create new array */
        cr_ptr->run_job_len = RUN_JOB_INCR;
        cr_ptr->run_job_ids = xcalloc(cr_ptr->run_job_len,
                                      sizeof(uint32_t));
        cr_ptr->run_job_ids[0] = job_id;
        return;
    }

    for (i = 0; i < cr_ptr->run_job_len; i++) {
        if (cr_ptr->run_job_ids[i])
            continue;
        /* fill in hole */
        cr_ptr->run_job_ids[i] = job_id;
        return;
    }

    /* expand array and add to end */
    cr_ptr->run_job_len += RUN_JOB_INCR;
    xrealloc(cr_ptr->run_job_ids, sizeof(uint32_t) * cr_ptr->run_job_len);
    cr_ptr->run_job_ids[i] = job_id;
}

/* Add job id to record of jobs running or suspended on this node */
static void _add_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
    int i;

    if (cr_ptr->tot_job_ids == NULL) {  /* create new array */
        cr_ptr->tot_job_len = RUN_JOB_INCR;
        cr_ptr->tot_job_ids = xcalloc(cr_ptr->tot_job_len,
                                      sizeof(uint32_t));
        cr_ptr->tot_job_ids[0] = job_id;
        return;
    }

    for (i = 0; i < cr_ptr->tot_job_len; i++) {
        if (cr_ptr->tot_job_ids[i])
            continue;
        /* fill in hole */
        cr_ptr->tot_job_ids[i] = job_id;
        return;
    }

    /* expand array and add to end */
    cr_ptr->tot_job_len += RUN_JOB_INCR;
    xrealloc(cr_ptr->tot_job_ids, sizeof(uint32_t) * cr_ptr->tot_job_len);
    cr_ptr->tot_job_ids[i] = job_id;
}

static bool _ck_run_job(struct cr_record *cr_ptr, uint32_t job_id,
                        bool clear_it)
{
    int i;
    bool rc = false;

    if ((cr_ptr->run_job_ids == NULL) || (cr_ptr->run_job_len == 0))
        return rc;

    for (i = 0; i < cr_ptr->run_job_len; i++) {
        if (cr_ptr->run_job_ids[i] != job_id)
            continue;
        if (clear_it)
            cr_ptr->run_job_ids[i] = 0;
        rc = true;
    }
    return rc;
}

/* Remove job id from record of jobs running,
 * RET true if successful, false if the job was not running */
static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
    return _ck_run_job(cr_ptr, job_id, true);
}

/* Test for job id in record of jobs running,
 * RET true if successful, false if the job was not running */
static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
    return _ck_run_job(cr_ptr, job_id, false);
}

static bool _ck_tot_job(struct cr_record *cr_ptr, uint32_t job_id,
                        bool clear_it)
{
    int i;
    bool rc = false;

    if ((cr_ptr->tot_job_ids == NULL) || (cr_ptr->tot_job_len == 0))
        return rc;

    for (i = 0; i < cr_ptr->tot_job_len; i++) {
        if (cr_ptr->tot_job_ids[i] != job_id)
            continue;
        if (clear_it)
            cr_ptr->tot_job_ids[i] = 0;
        rc = true;
    }
    return rc;
}

/* Remove job id from record of jobs running or suspended,
 * RET true if successful, false if the job was not found */
static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
    return _ck_tot_job(cr_ptr, job_id, true);
}

/* Test for job id in record of jobs running or suspended,
 * RET true if successful, false if the job was not found */
static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
    return _ck_tot_job(cr_ptr, job_id, false);
}

static bool _enough_nodes(int avail_nodes, int rem_nodes,
                          uint32_t min_nodes, uint32_t req_nodes)
{
    int needed_nodes;

    if (req_nodes > min_nodes)
        needed_nodes = rem_nodes + min_nodes - req_nodes;
    else
        needed_nodes = rem_nodes;

    return (avail_nodes >= needed_nodes);
}
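/*
 * Worked example for _enough_nodes() above (illustrative numbers only):
 * with min_nodes=4 and req_nodes=6, rem_nodes starts at 6 before any node
 * is selected, so needed_nodes = 6 + 4 - 6 = 4. A consecutive set offering
 * only 4 available nodes can therefore still satisfy the job's minimum,
 * even though 6 nodes were requested. When req_nodes <= min_nodes there is
 * no such slack and all rem_nodes must be available.
 */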
/*
 * _get_avail_cpus - Get the number of "available" cpus on a node
 *	given the number of cpus_per_task and the
 *	maximum sockets, cores, threads. Note that the value of
 *	cpus is the lowest-level logical processor (LLLP).
 * IN job_ptr - pointer to job being scheduled
 * IN index - index of node's configuration information in select_node_ptr
 */
static int _get_avail_cpus(job_record_t *job_ptr, int index)
{
    node_record_t *node_ptr;
    int avail_cpus;
    uint16_t boards_per_node, sockets_per_board;
    uint16_t cores_per_socket, thread_per_core;
    uint16_t cpus_per_node, cpus_per_task = 1;
    uint16_t ntasks_per_node = 0, ntasks_per_core;
    uint32_t nppcu, total_cores;
    multi_core_data_t *mc_ptr = NULL;

    if (job_ptr->details == NULL)
        return (uint16_t) 0;

    if (job_ptr->details->cpus_per_task)
        cpus_per_task = job_ptr->details->cpus_per_task;
    if (job_ptr->details->ntasks_per_node)
        ntasks_per_node = job_ptr->details->ntasks_per_node;
    if ((mc_ptr = job_ptr->details->mc_ptr))
        ntasks_per_core = mc_ptr->ntasks_per_core;
    else
        ntasks_per_core = 0;

    node_ptr = select_node_ptr + index;
    cpus_per_node     = node_ptr->config_ptr->cpus;
    boards_per_node   = node_ptr->config_ptr->boards;
    sockets_per_board = node_ptr->config_ptr->sockets;
    cores_per_socket  = node_ptr->config_ptr->cores;
    thread_per_core   = node_ptr->config_ptr->threads;

#if SELECT_DEBUG
    info("host:%s HW_ cpus_per_node:%u boards_per_node:%u "
         "sockets_per_board:%u cores_per_socket:%u thread_per_core:%u ",
         node_ptr->name, cpus_per_node, boards_per_node,
         sockets_per_board, cores_per_socket, thread_per_core);
#endif

    /* pick defaults for any unspecified items */
    if (cpus_per_task <= 0)
        cpus_per_task = 1;
    if (thread_per_core <= 0)
        thread_per_core = 1;
    if (cores_per_socket <= 0)
        cores_per_socket = 1;
    if (boards_per_node <= 0)
        boards_per_node = 1;
    if (sockets_per_board <= 0) {
        sockets_per_board = cpus_per_node / boards_per_node /
                            cores_per_socket / thread_per_core;
    }

    nppcu = ntasks_per_core;
    total_cores = boards_per_node * sockets_per_board * cores_per_socket;
    avail_cpus = adjust_cpus_nppcu(nppcu, cpus_per_task, total_cores,
                                   cpus_per_node);

    if (ntasks_per_node > 0)
        avail_cpus = MIN(avail_cpus, ntasks_per_node * cpus_per_task);

#if SELECT_DEBUG
    debug("avail_cpus index %d = %u (out of boards_per_node:%u "
          "sockets_per_board:%u cores_per_socket:%u thread_per_core:%u)",
          index, avail_cpus, boards_per_node, sockets_per_board,
          cores_per_socket, thread_per_core);
#endif
    return (avail_cpus);
}

/*
 * _get_total_cpus - Get the total number of cpus on a node
 *	Note that the value of cpus is the lowest-level logical
 *	processor (LLLP).
* IN index - index of node's configuration information in select_node_ptr */ static uint16_t _get_total_cpus(int index) { node_record_t *node_ptr = &(select_node_ptr[index]); return node_ptr->config_ptr->cpus; } static job_resources_t *_create_job_resources(int node_cnt) { job_resources_t *job_resrcs_ptr; job_resrcs_ptr = create_job_resources(); job_resrcs_ptr->cpu_array_reps = xcalloc(node_cnt, sizeof(uint32_t)); job_resrcs_ptr->cpu_array_value = xcalloc(node_cnt, sizeof(uint16_t)); job_resrcs_ptr->cpus = xcalloc(node_cnt, sizeof(uint16_t)); job_resrcs_ptr->cpus_used = xcalloc(node_cnt, sizeof(uint16_t)); job_resrcs_ptr->memory_allocated = xcalloc(node_cnt, sizeof(uint64_t)); job_resrcs_ptr->memory_used = xcalloc(node_cnt, sizeof(uint64_t)); job_resrcs_ptr->nhosts = node_cnt; return job_resrcs_ptr; } /* Build the full job_resources_t *structure for a job based upon the nodes * allocated to it (the bitmap) and the job's memory requirement */ static void _build_select_struct(job_record_t *job_ptr, bitstr_t *bitmap) { int i, j, k; int first_bit, last_bit; uint32_t node_cpus, total_cpus = 0, node_cnt; uint64_t job_memory_cpu = 0, job_memory_node = 0; job_resources_t *job_resrcs_ptr; if (job_ptr->details->pn_min_memory && (cr_type & CR_MEMORY)) { if (job_ptr->details->pn_min_memory & MEM_PER_CPU) job_memory_cpu = job_ptr->details->pn_min_memory & (~MEM_PER_CPU); else job_memory_node = job_ptr->details->pn_min_memory; } if (job_ptr->job_resrcs) /* Old struct due to job requeue */ free_job_resources(&job_ptr->job_resrcs); node_cnt = bit_set_count(bitmap); job_ptr->job_resrcs = job_resrcs_ptr = _create_job_resources(node_cnt); job_resrcs_ptr->node_bitmap = bit_copy(bitmap); job_resrcs_ptr->nodes = bitmap2node_name(bitmap); job_resrcs_ptr->ncpus = job_ptr->total_cpus; if (build_job_resources(job_resrcs_ptr, (void *)select_node_ptr)) error("_build_select_struct: build_job_resources: %m"); first_bit = bit_ffs(bitmap); last_bit = bit_fls(bitmap); if (last_bit == -1) last_bit = -2; /* no bits set */ for (i = first_bit, j = 0, k = -1; i <= last_bit; i++) { if (!bit_test(bitmap, i)) continue; node_cpus = _get_total_cpus(i); job_resrcs_ptr->cpus[j] = node_cpus; if ((k == -1) || (job_resrcs_ptr->cpu_array_value[k] != node_cpus)) { job_resrcs_ptr->cpu_array_cnt++; job_resrcs_ptr->cpu_array_reps[++k] = 1; job_resrcs_ptr->cpu_array_value[k] = node_cpus; } else job_resrcs_ptr->cpu_array_reps[k]++; total_cpus += node_cpus; if (job_memory_node) { job_resrcs_ptr->memory_allocated[j] = job_memory_node; } else if (job_memory_cpu) { job_resrcs_ptr->memory_allocated[j] = job_memory_cpu * node_cpus; } if (set_job_resources_node(job_resrcs_ptr, j)) { error("_build_select_struct: set_job_resources_node: " "%m"); } j++; } if (job_resrcs_ptr->ncpus != total_cpus) { error("_build_select_struct: ncpus mismatch %u != %u", job_resrcs_ptr->ncpus, total_cpus); } } /* * Set the bits in 'jobmap' that correspond to bits in the 'bitmap' * that are running 'run_job_cnt' jobs or less, and clear the rest. 
*/ static int _job_count_bitmap(struct cr_record *cr_ptr, job_record_t *job_ptr, bitstr_t * bitmap, bitstr_t * jobmap, int run_job_cnt, int tot_job_cnt, uint16_t mode) { int i, i_first, i_last; int count = 0, total_jobs, total_run_jobs; struct part_cr_record *part_cr_ptr; node_record_t *node_ptr; uint64_t job_memory_cpu = 0, job_memory_node = 0; uint64_t alloc_mem = 0, job_mem = 0, avail_mem = 0; uint32_t cpu_cnt, gres_cpus, gres_cores; int core_start_bit, core_end_bit, cpus_per_core; List gres_list; bool use_total_gres = true; xassert(cr_ptr); xassert(cr_ptr->nodes); if (mode != SELECT_MODE_TEST_ONLY) { use_total_gres = false; if (job_ptr->details->pn_min_memory && (cr_type & CR_MEMORY)) { if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { job_memory_cpu = job_ptr->details->pn_min_memory & (~MEM_PER_CPU); } else { job_memory_node = job_ptr->details-> pn_min_memory; } } } i_first = bit_ffs(bitmap); i_last = bit_fls(bitmap); if (i_first == -1) /* job has no nodes */ i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(bitmap, i)) { bit_clear(jobmap, i); continue; } node_ptr = node_record_table_ptr + i; cpu_cnt = node_ptr->config_ptr->cpus; if (cr_ptr->nodes[i].gres_list) gres_list = cr_ptr->nodes[i].gres_list; else gres_list = node_ptr->gres_list; core_start_bit = cr_get_coremap_offset(i); core_end_bit = cr_get_coremap_offset(i+1) - 1; cpus_per_core = cpu_cnt / (core_end_bit - core_start_bit + 1); gres_cores = gres_plugin_job_test(job_ptr->gres_list, gres_list, use_total_gres, NULL, core_start_bit, core_end_bit, job_ptr->job_id, node_ptr->name, false); gres_cpus = gres_cores; if (gres_cpus != NO_VAL) { gres_cpus *= cpus_per_core; if ((gres_cpus < cpu_cnt) || (gres_cpus < job_ptr->details->ntasks_per_node) || ((job_ptr->details->cpus_per_task > 1) && (gres_cpus < job_ptr->details->cpus_per_task))) { bit_clear(jobmap, i); continue; } } if (mode == SELECT_MODE_TEST_ONLY) { bit_set(jobmap, i); count++; continue; /* No need to test other resources */ } if (job_memory_cpu || job_memory_node) { alloc_mem = cr_ptr->nodes[i].alloc_memory; avail_mem = node_ptr->config_ptr->real_memory; if (job_memory_cpu) job_mem = job_memory_cpu * cpu_cnt; else job_mem = job_memory_node; avail_mem -= node_ptr->mem_spec_limit; if ((alloc_mem + job_mem) > avail_mem) { bit_clear(jobmap, i); continue; } } if ((mode != SELECT_MODE_TEST_ONLY) && (cr_ptr->nodes[i].exclusive_cnt != 0)) { /* already reserved by some exclusive job */ bit_clear(jobmap, i); continue; } total_jobs = 0; total_run_jobs = 0; part_cr_ptr = cr_ptr->nodes[i].parts; while (part_cr_ptr) { total_run_jobs += part_cr_ptr->run_job_cnt; total_jobs += part_cr_ptr->tot_job_cnt; part_cr_ptr = part_cr_ptr->next; } if ((total_run_jobs <= run_job_cnt) && (total_jobs <= tot_job_cnt)) { bit_set(jobmap, i); count++; } else { bit_clear(jobmap, i); } } return count; } /* _find_job_mate - does most of the real work for select_p_job_test(), * in trying to find a suitable job to mate this one with. This is * a pretty simple algorithm now, but could try to match the job * with multiple jobs that add up to the proper size or a single * job plus a few idle nodes. 
 */
static int _find_job_mate(job_record_t *job_ptr, bitstr_t *bitmap,
                          uint32_t min_nodes, uint32_t max_nodes,
                          uint32_t req_nodes)
{
    ListIterator job_iterator;
    job_record_t *job_scan_ptr;
    int rc = EINVAL;

    job_iterator = list_iterator_create(job_list);
    while ((job_scan_ptr = list_next(job_iterator))) {
        if ((!IS_JOB_RUNNING(job_scan_ptr)) ||
            (job_scan_ptr->node_cnt != req_nodes) ||
            (job_scan_ptr->total_cpus < job_ptr->details->min_cpus) ||
            (!bit_super_set(job_scan_ptr->node_bitmap, bitmap)))
            continue;
        if (job_scan_ptr->details && job_ptr->details &&
            (job_scan_ptr->details->contiguous !=
             job_ptr->details->contiguous))
            continue;

        if (job_ptr->details->req_node_bitmap &&
            (!bit_super_set(job_ptr->details->req_node_bitmap,
                            job_scan_ptr->node_bitmap)))
            continue;   /* Required nodes missing from job */

        if (job_ptr->details->exc_node_bitmap &&
            bit_overlap_any(job_ptr->details->exc_node_bitmap,
                            job_scan_ptr->node_bitmap))
            continue;   /* Excluded nodes in this job */

        bit_and(bitmap, job_scan_ptr->node_bitmap);
        job_ptr->total_cpus = job_scan_ptr->total_cpus;
        rc = SLURM_SUCCESS;
        break;
    }
    list_iterator_destroy(job_iterator);
    return rc;
}

/* _job_test - does most of the real work for select_p_job_test(), which
 *	pretty much just handles load-leveling and max_share logic */
static int _job_test(job_record_t *job_ptr, bitstr_t *bitmap,
                     uint32_t min_nodes, uint32_t max_nodes,
                     uint32_t req_nodes)
{
    int i, error_code = EINVAL, sufficient;
    int *consec_nodes; /* how many nodes we can add from this
                        * consecutive set of nodes */
    int *consec_cpus;  /* how many CPUs we can add from this
                        * consecutive set of nodes */
    int *consec_start; /* where this consecutive set starts (index) */
    int *consec_end;   /* where this consecutive set ends (index) */
    int *consec_req;   /* are nodes from this set required
                        * (in req_bitmap) */
    int consec_index, consec_size;
    int rem_cpus, rem_nodes; /* remaining resources desired */
    int best_fit_nodes, best_fit_cpus, best_fit_req;
    int best_fit_location = 0, best_fit_sufficient;
    int avail_cpus, total_cpus = 0;
    int *avail_cpu_cnt; /* Available CPU count on each node */
    int first_cpu_cnt = 0, total_node_cnt = 0, low_cpu_cnt = 99999;
    bool heterogeneous = false; /* Nodes have heterogeneous CPU counts */

    if (bit_set_count(bitmap) < min_nodes)
        return error_code;

    if ((job_ptr->details->req_node_bitmap) &&
        (!bit_super_set(job_ptr->details->req_node_bitmap, bitmap)))
        return error_code;

    if (hypercube_switch_table && hypercube_switch_cnt) {
        /* Optimized resource selection based on hypercube topology */
        return _job_test_hypercube(job_ptr, bitmap,
                                   min_nodes, max_nodes, req_nodes);
    }

    if (switch_record_cnt && switch_record_table &&
        ((topo_optional == false) || job_ptr->req_switch)) {
        /* Perform optimized resource selection based upon topology */
        if (have_dragonfly) {
            return _job_test_dfly(job_ptr, bitmap,
                                  min_nodes, max_nodes, req_nodes);
        } else {
            return _job_test_topo(job_ptr, bitmap,
                                  min_nodes, max_nodes, req_nodes);
        }
    }

    consec_index = 0;
    consec_size  = 50; /* start allocation for 50 sets of
                        * consecutive nodes */
    consec_cpus  = xcalloc(consec_size, sizeof(int));
    consec_nodes = xcalloc(consec_size, sizeof(int));
    consec_start = xcalloc(consec_size, sizeof(int));
    consec_end   = xcalloc(consec_size, sizeof(int));
    consec_req   = xcalloc(consec_size, sizeof(int));

    /* Build table with information about sets of consecutive nodes */
    consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
    consec_req[consec_index] = -1; /* no required nodes here by default */
    rem_cpus = job_ptr->details->min_cpus;
    if (req_nodes > min_nodes)
rem_nodes = req_nodes; else rem_nodes = min_nodes; avail_cpu_cnt = xcalloc(select_node_cnt, sizeof(int)); for (i = 0; i < select_node_cnt; i++) { if (bit_test(bitmap, i)) { avail_cpu_cnt[i] = _get_avail_cpus(job_ptr, i); if (++total_node_cnt == 1) first_cpu_cnt = avail_cpu_cnt[i]; else if (first_cpu_cnt != avail_cpu_cnt[i]) heterogeneous = true; low_cpu_cnt = MIN(low_cpu_cnt, avail_cpu_cnt[i]); if (consec_nodes[consec_index] == 0) consec_start[consec_index] = i; avail_cpus = avail_cpu_cnt[i]; if (job_ptr->details->req_node_bitmap && (max_nodes > 0) && bit_test(job_ptr->details->req_node_bitmap, i)) { if (consec_req[consec_index] == -1) { /* first required node in set */ consec_req[consec_index] = i; } rem_nodes--; max_nodes--; rem_cpus -= avail_cpus; total_cpus += _get_total_cpus(i); } else { /* node not required (yet) */ bit_clear(bitmap, i); consec_cpus[consec_index] += avail_cpus; consec_nodes[consec_index]++; } } else if (consec_nodes[consec_index] == 0) { consec_req[consec_index] = -1; /* already picked up any required nodes */ /* re-use this record */ } else { consec_end[consec_index] = i - 1; if (++consec_index >= consec_size) { consec_size *= 2; xrealloc(consec_cpus, sizeof(int) * consec_size); xrealloc(consec_nodes, sizeof(int) * consec_size); xrealloc(consec_start, sizeof(int) * consec_size); xrealloc(consec_end, sizeof(int) * consec_size); xrealloc(consec_req, sizeof(int) * consec_size); } consec_cpus[consec_index] = 0; consec_nodes[consec_index] = 0; consec_req[consec_index] = -1; } } if (consec_nodes[consec_index] != 0) consec_end[consec_index++] = i - 1; #if SELECT_DEBUG /* don't compile this, it slows things down too much */ debug3("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes); for (i = 0; i < consec_index; i++) { if (consec_req[i] != -1) debug3("start=%s, end=%s, nodes=%d, cpus=%d, req=%s", select_node_ptr[consec_start[i]].name, select_node_ptr[consec_end[i]].name, consec_nodes[i], consec_cpus[i], select_node_ptr[consec_req[i]].name); else debug3("start=%s, end=%s, nodes=%d, cpus=%d", select_node_ptr[consec_start[i]].name, select_node_ptr[consec_end[i]].name, consec_nodes[i], consec_cpus[i]); } #endif if (heterogeneous && (rem_cpus > (low_cpu_cnt * rem_nodes))) { while ((max_nodes > 0) && ((rem_nodes > 0) || (rem_cpus > 0))) { int high_cpu_cnt = 0, high_cpu_inx = -1; for (i = 0; i < select_node_cnt; i++) { if (high_cpu_cnt > avail_cpu_cnt[i]) continue; if (bit_test(bitmap, i)) continue; high_cpu_cnt = avail_cpu_cnt[i]; high_cpu_inx = i; } if (high_cpu_inx == -1) break; /* No more available CPUs */ bit_set(bitmap, high_cpu_inx); rem_nodes--; max_nodes--; avail_cpus = avail_cpu_cnt[high_cpu_inx]; rem_cpus -= avail_cpus; total_cpus +=_get_total_cpus(high_cpu_inx); avail_cpu_cnt[high_cpu_inx] = 0; } } else { heterogeneous = false; } /* accumulate nodes from these sets of consecutive nodes until */ /* sufficient resources have been accumulated */ while (consec_index && (max_nodes > 0) && !heterogeneous) { best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0; best_fit_req = -1; /* first required node, -1 if none */ for (i = 0; i < consec_index; i++) { if (consec_nodes[i] == 0) continue; /* no usable nodes here */ if (job_ptr->details->contiguous && job_ptr->details->req_node_bitmap && (consec_req[i] == -1)) continue; /* no required nodes here */ sufficient = (consec_cpus[i] >= rem_cpus) && _enough_nodes(consec_nodes[i], rem_nodes, min_nodes, req_nodes); /* if first possibility OR */ /* contains required nodes OR */ /* first set large enough for request OR */ /* tightest 
fit (less resource waste) OR */ /* nothing yet large enough, but this is biggest */ if ((best_fit_nodes == 0) || ((best_fit_req == -1) && (consec_req[i] != -1)) || (sufficient && (best_fit_sufficient == 0)) || (sufficient && (consec_cpus[i] < best_fit_cpus)) || ((sufficient == 0) && (consec_cpus[i] > best_fit_cpus))) { best_fit_cpus = consec_cpus[i]; best_fit_nodes = consec_nodes[i]; best_fit_location = i; best_fit_req = consec_req[i]; best_fit_sufficient = sufficient; } if (job_ptr->details->contiguous && job_ptr->details->req_node_bitmap) { /* Must wait for all required nodes to be * in a single consecutive block */ int j, other_blocks = 0; for (j = (i+1); j < consec_index; j++) { if (consec_req[j] != -1) { other_blocks = 1; break; } } if (other_blocks) { best_fit_nodes = 0; break; } } } if (best_fit_nodes == 0) break; if (job_ptr->details->contiguous && ((best_fit_cpus < rem_cpus) || (!_enough_nodes(best_fit_nodes, rem_nodes, min_nodes, req_nodes)))) break; /* no hole large enough */ if (best_fit_req != -1) { /* This collection of nodes includes required ones * select nodes from this set, first working up * then down from the required nodes */ for (i = best_fit_req; i <= consec_end[best_fit_location]; i++) { if ((max_nodes <= 0) || ((rem_nodes <= 0) && (rem_cpus <= 0))) break; if (bit_test(bitmap, i)) continue; bit_set(bitmap, i); rem_nodes--; max_nodes--; avail_cpus = avail_cpu_cnt[i]; rem_cpus -= avail_cpus; total_cpus += _get_total_cpus(i); } for (i = (best_fit_req - 1); i >= consec_start[best_fit_location]; i--) { if ((max_nodes <= 0) || ((rem_nodes <= 0) && (rem_cpus <= 0))) break; if (bit_test(bitmap, i)) continue; bit_set(bitmap, i); rem_nodes--; max_nodes--; avail_cpus = avail_cpu_cnt[i]; rem_cpus -= avail_cpus; total_cpus += _get_total_cpus(i); } } else { for (i = consec_start[best_fit_location]; i <= consec_end[best_fit_location]; i++) { if ((max_nodes <= 0) || ((rem_nodes <= 0) && (rem_cpus <= 0))) break; if (bit_test(bitmap, i)) continue; bit_set(bitmap, i); rem_nodes--; max_nodes--; avail_cpus = avail_cpu_cnt[i]; rem_cpus -= avail_cpus; total_cpus += _get_total_cpus(i); } } if (job_ptr->details->contiguous || ((rem_nodes <= 0) && (rem_cpus <= 0))) { error_code = SLURM_SUCCESS; break; } consec_cpus[best_fit_location] = 0; consec_nodes[best_fit_location] = 0; } if (error_code && (rem_cpus <= 0) && _enough_nodes(0, rem_nodes, min_nodes, req_nodes)) { error_code = SLURM_SUCCESS; } if (error_code == SLURM_SUCCESS) { /* job's total_cpus is needed for SELECT_MODE_WILL_RUN */ job_ptr->total_cpus = total_cpus; } xfree(avail_cpu_cnt); xfree(consec_cpus); xfree(consec_nodes); xfree(consec_start); xfree(consec_end); xfree(consec_req); return error_code; } /* * Compute the variance on the passed sufficient cluster and see if it's the * best we've seen so far */ static void _hypercube_update_variance( int dim, int dir, int start_index, int end_index, int node_count, int max_nodes, int leftover_nodes, int64_t summed_squares, int64_t squared_sums, int * min_curve, int * min_direction, int * min_start_index, int * min_neighbors, int * min_extra_nodes, int64_t * min_variance) { //XXX use actual node count? 
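    /*
     * Note on the formula below (cf. the variance equation in the block
     * comment preceding _explore_hypercube() further down): summed_squares
     * holds sum(x^2) and, despite its name, squared_sums holds sum(x), so
     * the expression computes sum(x^2) - sum(x)^2/n, which is n times the
     * variance of the node distances. node_count is the job's requested
     * node count and is the same for every candidate cluster (see the XXX
     * above), so comparing these values ranks clusters by variance.
     * Hypothetical example: distances {2, 2, 4} give sum(x) = 8 and
     * sum(x^2) = 24, so 24 - 8*8/3 = 2.67 (3 with the integer arithmetic
     * used here).
     */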
    int64_t variance = summed_squares -
                       squared_sums * squared_sums / node_count;

    /* Don't calculate if we've used too many nodes */
    if (0 > max_nodes)
        return;

    if ((variance < *min_variance) ||
        ((variance == *min_variance) &&
         (leftover_nodes <= *min_extra_nodes))) {
        int begin = start_index - dir;
        int end = end_index + dir;
        int neighbors = 0;

        if (0 > begin) {
            begin = hypercube_switch_cnt - 1;
        } else if (hypercube_switch_cnt >= begin) {
            begin = 0;
        }

        if (0 > end) {
            end = hypercube_switch_cnt - 1;
        } else if (hypercube_switch_cnt >= end) {
            end = 0;
        }

        if (begin != end_index) {
            neighbors += hypercube_switches[dim][begin]->avail_cnt;
        }
        if (end != start_index && begin != end) {
            neighbors += hypercube_switches[dim][end]->avail_cnt;
        }

        /*
         * Update if the variance is the lowest found so far, or if it
         * ties the lowest variance but leaves fewer extra nodes on the
         * switch, or if variance and extra nodes tie but there are
         * fewer neighboring nodes
         */
        if ((variance < *min_variance) ||
            ((variance == *min_variance) &&
             (leftover_nodes < *min_extra_nodes)) ||
            ((variance == *min_variance) &&
             (leftover_nodes == *min_extra_nodes) &&
             (neighbors < *min_neighbors))) {
            *min_variance = variance;
            *min_start_index = start_index;
            *min_extra_nodes = leftover_nodes;
            *min_neighbors = neighbors;
            *min_direction = dir;
            *min_curve = dim;
        }
    }
}

/*
 * We're passed a cluster (start_index, end_index) and asked to add another
 * switch with its nodes if we don't already have enough. As an experiment we
 * try adding switches to the left, but we otherwise add to the right.
 */
static void _hypercube_add_nodes(
    job_record_t *job_ptr, bitstr_t *avail_bitmap,
    int dim, int32_t start_index, int32_t *end_index, int node_count,
    int32_t max_nodes, int32_t rem_nodes, int32_t rem_cpus,
    int leftover_nodes, bitstr_t *bitmap, int64_t *distance_offset,
    int64_t summed_squares, int64_t squared_sums, int *min_curve,
    int *min_direction, int *min_start_index, int32_t *min_neighbors,
    int32_t *min_extra_nodes, int64_t *min_variance)
{
    bitstr_t *tmp_bitmap;
    int32_t l_start_index = *end_index;
    int32_t l_end_index = start_index;
    int32_t l_temp_max_nodes = max_nodes;
    int32_t l_temp_rem_nodes = rem_nodes;
    int32_t l_temp_rem_cpus = rem_cpus;
    int64_t l_summed_squares = summed_squares;
    int64_t l_squared_sums = squared_sums;
    int32_t l_leftover_nodes = 0;
    int32_t l_distance_offset = 0;
    int32_t l_distance;

    /* Don't need to add any more nodes */
    if (leftover_nodes ||
        (0 >= rem_nodes && 0 >= rem_cpus)) {
        return;
    }

    tmp_bitmap = bit_copy(bitmap);

    /*
     * Create a temporary right-sided cluster and try sliding the left edge
     * further left until we have enough nodes.
     */
    while (0 <= l_temp_max_nodes &&
           (0 < l_temp_rem_nodes || 0 < l_temp_rem_cpus)) {
        int cnt, n, new_nodes = 0;

        l_end_index--;
        if (l_end_index < 0) { /* Handle wrap-around */
            l_end_index = hypercube_switch_cnt - 1;
            l_distance_offset =
                -1 * hypercube_switches[dim]
                [hypercube_switch_cnt - 1]->distance[dim];
        }

        /* Add nodes from the switch until we've hit our limits */
        cnt = hypercube_switches[dim][l_end_index]->node_cnt;
        for (n = 0; n < cnt; n++) {
            int node = hypercube_switches[dim]
                [l_end_index]->node_index[n];

            if (!bit_test(avail_bitmap, node) ||
                bit_test(tmp_bitmap, node)) {
                continue;
            }

            /* The node is unused and available, add it */
            new_nodes++;
            bit_set(tmp_bitmap, node);
            l_temp_max_nodes--;
            l_temp_rem_nodes--;
            l_temp_rem_cpus -= _get_avail_cpus(job_ptr, node);

            /* Have we hit our limits?
             */
            if ((0 > l_temp_max_nodes) ||
                (0 >= l_temp_rem_nodes && 0 >= l_temp_rem_cpus)) {
                break;
            }
        }

        /* Factor in the distance for the new nodes */
        l_distance = l_distance_offset +
            hypercube_switches[dim][l_end_index]->distance[dim];
        l_summed_squares += new_nodes * l_distance * l_distance;
        l_squared_sums += new_nodes * l_distance;
        l_leftover_nodes =
            hypercube_switches[dim][l_end_index]->avail_cnt -
            new_nodes;
    }

    bit_free(tmp_bitmap);

    /* Let's see how good this right-sided cluster is */
    _hypercube_update_variance(
        dim, -1, l_start_index, l_end_index, node_count,
        l_temp_max_nodes, l_leftover_nodes, l_summed_squares,
        l_squared_sums, min_curve, min_direction, min_start_index,
        min_neighbors, min_extra_nodes, min_variance);

    /*
     * OK, we're back to working on the left-sided cluster. Move the right
     * side further right to pick up a new switch and add more of the
     * needed nodes.
     */
    (*end_index)++;
    if (*end_index == hypercube_switch_cnt) { /* Handle wrap-around */
        *end_index = 0;
        *distance_offset =
            hypercube_switches[dim][hypercube_switch_cnt - 1]->
            distance[dim];
    }
}

/* Variance based best-fit cluster algorithm:
 * Loop through all of the Hilbert Curves that were created.
 * Each Hilbert Curve is essentially a particular ordering of all the
 * switches in the network. For each Hilbert Curve, the algorithm loops
 * through all clusters of neighboring nodes, but only tests clusters that
 * either have their leftmost switch completely saturated (all available
 * nodes for the switch are in the cluster) or their rightmost switch
 * completely saturated, also called left-saturated clusters and
 * right-saturated clusters. The algorithm starts at the left (top) of
 * the table and works its way to the right (down).
 * The algorithm starts by adding nodes from the switch at the top of
 * the table to the cluster. If, after the nodes are added, the cluster
 * still needs more nodes, the algorithm continues down (right) the table,
 * adding the number of nodes the next switch has available to the
 * cluster. It continues adding nodes until it has enough for the cluster.
 * If the cluster only needs 4 more nodes and the next rightmost switch
 * has 8 nodes available, the cluster only adds the 4 needed nodes; this
 * is called adding a partial. When the algorithm moves to the next
 * cluster, it will pick up where it left off and will add the remaining 4
 * nodes on the switch before moving to the next switch in the table.
 * Once the algorithm has added enough nodes to the cluster, it
 * computes the variance for the cluster of nodes. If this cluster is
 * the best-fit cluster found so far, it saves the cluster's information.
 * To move on to testing the next cluster, first it removes all the nodes
 * from the leftmost switch. Then the algorithm repeats the process of
 * adding nodes to the cluster from the rightmost switch until it has
 * enough. If the right side of the cluster reaches the bottom of the
 * table, then it loops back around to the topmost switch in the table and
 * continues. The algorithm continues until the leftmost switch of the
 * cluster has reached the end of the table, at which point it knows
 * that it has tested all possible left-saturated clusters.
 * Although this algorithm could be run in reverse order on the table
 * in order to test all right-saturated clusters, it would result in
 * redundant calculations. Instead, all right-saturated clusters are
 * tested during the node adding process of the left-saturated clusters
 * algorithm.
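 *
 * Illustration (hypothetical switch table for one Hilbert Curve): switches
 * A, B, C and D have 4, 2, 8 and 3 available nodes and a job needs 9
 * nodes. The first left-saturated cluster takes all of A (4), all of B (2)
 * and a partial of 3 from C, leaving 5 leftover nodes on C. The next
 * cluster drops A's 4 nodes and resumes on C, adding 4 of its remaining 5.
 * The cluster after that drops B and takes C's last node; C is then fully
 * included but the cluster is still one node short, so a temporary
 * right-saturated cluster is evaluated from C back up the table before
 * nodes from D are added.
 *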
 * While running the left-saturated clusters algorithm
 * described above, anytime nodes are added from the rightmost switch
 * resulting in all the available nodes from that switch being in the
 * cluster and the cluster still needs more nodes, then create a temporary
 * cluster equal to the current cluster.
 * Use this temporary cluster to test the right-saturated cluster
 * starting at the rightmost switch in the cluster and moving left (up the
 * table). Since the temporary cluster needs more nodes, add nodes by
 * moving up/left in the table (rather than right, like is done for the
 * left-saturated clusters). Once the cluster has enough nodes, calculate
 * its variance and remember it if it is the best fit cluster found so far.
 * Then erase the temporary cluster and continue with the original cluster
 * where the algorithm left off. By doing this right-saturated cluster
 * calculation every time the rightmost switch of a cluster is fully added,
 * the algorithm tests every possible right-saturated cluster.
 *
 * Equation used to calculate the variance of a cluster:
 * Variance = sum(x^2) - sum(x)^2/num(x), where sum(x) is the sum of all
 * values of x, and num(x) is the number of x values summed
 *
 * *** Important Note: There isn't actually a 'cluster' struct, but rather
 * a cluster is described by its necessary characteristics including:
 * start_index, end_index, summed_squares, squared_sums, and rem_nodes ***
 */
static void _explore_hypercube(job_record_t *job_ptr, bitstr_t *avail_bitmap,
                               const int64_t *req_summed_squares,
                               const int64_t *req_squared_sums,
                               const int max_nodes, const int rem_nodes,
                               const int rem_cpus, const int node_count,
                               int *min_start_index, int *min_direction,
                               int *min_curve)
{
    bitstr_t *tmp_bitmap = bit_alloc(bit_size(avail_bitmap));
    int64_t min_variance = INT64_MAX;
    int32_t min_extra_nodes = INT32_MAX;
    int32_t min_neighbors = INT32_MAX;
    int dim;

    /* Check each dimension for the best cluster of nodes */
    for (dim = 0; dim < hypercube_dimensions; dim++) {
        int64_t summed_squares = req_summed_squares[dim];
        int64_t squared_sums = req_squared_sums[dim];
        int64_t distance, distance_offset = 0;
        int32_t start_index = 0, end_index;
        int32_t temp_rem_nodes = rem_nodes;
        int32_t temp_rem_cpus = rem_cpus;
        int32_t temp_max_nodes = max_nodes;

        /* If this curve wasn't set up then skip it */
        if (hypercube_switch_table[0].distance[dim] == 0) {
            continue;
        }

        /* Move to first switch with available nodes */
        while (hypercube_switches[dim][start_index]->avail_cnt == 0) {
            start_index++;
        }
        end_index = start_index;
        bit_clear_all(tmp_bitmap);

        /* Test every switch to see if it's the best starting point */
        while ((start_index < hypercube_switch_cnt) &&
               (start_index >= 0)) {
            int leftover_nodes = 0;

            /*
             * Add new nodes to the cluster. If the next switch has
             * more nodes than needed, only add the nodes needed.
             * This is called adding a partial.
             */
            while ((0 <= temp_max_nodes) &&
                   (0 < temp_rem_nodes || 0 < temp_rem_cpus)) {
                int cnt = hypercube_switches[dim][end_index]->
                    node_cnt;
                int fn = hypercube_switches[dim][end_index]->
                    node_index[0];
                int ln = hypercube_switches[dim][end_index]->
                    node_index[cnt - 1];
                int new_nodes = 0;
                int n;

                /* Add free nodes from the switch */
                for (n = 0; n < cnt; n++) {
                    int node = hypercube_switches[dim]
                        [end_index]->node_index[n];

                    if (!bit_test(avail_bitmap, node) ||
                        bit_test(tmp_bitmap, node)) {
                        continue;
                    }

                    /* Unused and available, add it */
                    new_nodes++;
                    bit_set(tmp_bitmap, node);
                    temp_max_nodes--;
                    temp_rem_nodes--;
                    temp_rem_cpus -= _get_avail_cpus(
                        job_ptr, node);

                    /* Do we have enough resources?
*/ if ((0 > temp_max_nodes) || (0 >= temp_rem_nodes && 0 >= temp_rem_cpus)) { break; } } /* * Calculate the inputs to the variance for the * current cluster */ distance = hypercube_switches[dim] [end_index]->distance[dim] + distance_offset; summed_squares += new_nodes * distance * distance; squared_sums += new_nodes * distance; leftover_nodes = hypercube_switches[dim][end_index]-> avail_cnt - bit_set_count_range(tmp_bitmap, fn, ln + 1); /* Add nodes from an additional switch */ _hypercube_add_nodes( job_ptr, avail_bitmap, dim, start_index, &end_index, node_count, temp_max_nodes, temp_rem_nodes, temp_rem_cpus, leftover_nodes, tmp_bitmap, &distance_offset, summed_squares, squared_sums, min_curve, min_direction, min_start_index, &min_neighbors, &min_extra_nodes, &min_variance); } /* Check to see if this is the lowest variance so far */ _hypercube_update_variance( dim, 1, start_index, end_index, node_count, temp_max_nodes, leftover_nodes, summed_squares, squared_sums, min_curve, min_direction, min_start_index, &min_neighbors, &min_extra_nodes, &min_variance); /* * We're updating our indices to slide right and have a * new leftmost switch. Remove the nodes from the current * leftmost switch. */ while ((temp_rem_nodes <= 0) && (start_index < hypercube_switch_cnt) && (start_index >= 0)) { int cnt = hypercube_switches[dim][start_index]-> node_cnt; int used = MIN( rem_nodes, hypercube_switches[dim][start_index]-> avail_cnt); int n; if (hypercube_switches[dim][start_index]-> avail_cnt == 0) { if (start_index == end_index) end_index++; start_index++; continue; } distance = hypercube_switches[dim][start_index]-> distance[dim]; summed_squares -= distance * distance * used; squared_sums -= distance * used; /* Free the nodes we added on this switch */ for (n = 0; n < cnt; n++) { int node = hypercube_switches[dim] [start_index]->node_index[n]; if (!bit_test(tmp_bitmap, node)) continue; bit_clear(tmp_bitmap, node); temp_max_nodes++; temp_rem_nodes++; temp_rem_cpus += _get_avail_cpus(job_ptr, node); } if (start_index == end_index) end_index++; start_index++; } /* * If the cluster had holes with switches * completely allocated to other jobs, keep sliding * right until we find a switch with free nodes */ while ((start_index < hypercube_switch_cnt) && (hypercube_switches[dim][start_index]-> avail_cnt == 0)) { if (start_index == end_index) end_index++; start_index++; } } } bit_free(tmp_bitmap); } /* a hypercube topology version of _job_test - * does most of the real work for select_p_job_test(), which * pretty much just handles load-leveling and max_share logic */ static int _job_test_hypercube(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes) { int i, rc = EINVAL; int32_t rem_cpus, rem_nodes, node_count = 0, total_cpus = 0; int32_t alloc_nodes = 0; int64_t *req_summed_squares = xcalloc(hypercube_dimensions, sizeof(int64_t)); int64_t *req_squared_sums = xcalloc(hypercube_dimensions, sizeof(int64_t)); bitstr_t *req_nodes_bitmap = NULL; bitstr_t *avail_bitmap = NULL; int32_t cur_node_index = -1, node_counter = 0, switch_index; int32_t min_start_index = -1, min_direction = 1234, min_curve = 4321; for (i = 0; i < hypercube_dimensions; i++) { req_summed_squares[i] = 0; req_squared_sums[i] = 0; } rem_cpus = job_ptr->details->min_cpus; node_count = rem_nodes = MAX(req_nodes, min_nodes); /* Give up now if there aren't enough hosts */ if (bit_set_count(bitmap) < rem_nodes) goto fini; /* Grab all of the required nodes if there are any */ if 
(job_ptr->details->req_node_bitmap) {
        req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);

        // set avail_bitmap to all available nodes except the required nodes
        // set bitmap to just the required nodes
        avail_bitmap = bit_copy(req_nodes_bitmap);
        bit_not(avail_bitmap);
        bit_and(avail_bitmap, bitmap);
        bit_copybits(bitmap, req_nodes_bitmap);

        i = bit_set_count(req_nodes_bitmap);
        if (i > (int)max_nodes) {
            info("%pJ requires more nodes than currently available (%d>%u)",
                 job_ptr, i, max_nodes);
            FREE_NULL_BITMAP(req_nodes_bitmap);
            FREE_NULL_BITMAP(avail_bitmap);
            xfree(req_squared_sums);
            xfree(req_summed_squares);
            return EINVAL;
        }
        rem_nodes -= i;
        alloc_nodes += i;
    } else {
        // if there are no required nodes, update bitmaps accordingly
        avail_bitmap = bit_copy(bitmap);
        bit_nclear(bitmap, 0, node_record_count - 1);
    }

    /* Calculate node availability for each switch */
    for (i = 0; i < hypercube_switch_cnt; i++) {
        const int node_idx = hypercube_switch_table[i].node_index[0];
        const int cnt = hypercube_switch_table[i].node_cnt;

        /* Add all the nodes on this switch */
        hypercube_switch_table[i].avail_cnt = bit_set_count_range(
            avail_bitmap, node_idx, node_idx + cnt);

        /* If the switch has nodes that are required, loop through them */
        if (req_nodes_bitmap &&
            (hypercube_switch_table[i].avail_cnt != 0) &&
            (bit_set_count_range(
                req_nodes_bitmap, node_idx, node_idx + cnt) > 0)) {
            int j;

            /* Check each node on the switch */
            for (j = 0; j < cnt; j++) {
                int idx = hypercube_switch_table[i].node_index[j];

                /* If the node is required, add its CPUs and
                 * distance to the calculations */
                if (bit_test(req_nodes_bitmap, idx)) {
                    int k;

                    rem_cpus -= _get_avail_cpus(job_ptr, idx);
                    total_cpus += _get_total_cpus(idx);

                    /*
                     * Add the required nodes data to the
                     * variance calculations
                     */
                    for (k = 0; k < hypercube_dimensions; k++) {
                        int distance = hypercube_switch_table[i].
                            distance[k];

                        req_summed_squares[k] += distance *
                            distance;
                        req_squared_sums[k] += distance;
                    }
                }
            }
        }
    }

    // check to see if no more nodes need to be added to the job
    if ((alloc_nodes >= max_nodes) ||
        ((rem_nodes <= 0) && (rem_cpus <= 0))) {
        goto fini;
    }

    /* Find the best starting switch and traversal path to get nodes from */
    if (alloc_nodes < max_nodes)
        i = max_nodes - alloc_nodes;
    else
        i = 0;
    _explore_hypercube(job_ptr, avail_bitmap, req_summed_squares,
                       req_squared_sums, i, rem_nodes, rem_cpus,
                       node_count, &min_start_index, &min_direction,
                       &min_curve);
    if (-1 == min_start_index)
        goto fini;

    /*
     * Assigns nodes from the best cluster to the job.
Starts at the start * index switch and keeps adding available nodes until it has as many * as it needs */ switch_index = min_start_index; node_counter = 0; while ((alloc_nodes < max_nodes) && ((rem_nodes > 0) || (rem_cpus > 0))) { int node_index; /* If we used up all the nodes in a switch, move to the next */ if (node_counter == hypercube_switches[min_curve][switch_index]->avail_cnt) { node_counter = 0; cur_node_index = -1; do { // min_direction == 1 moves up the table // min_direction == -1 moves down the table switch_index += min_direction; if (switch_index == hypercube_switch_cnt) { switch_index = 0; } else if (switch_index == -1) { switch_index = hypercube_switch_cnt - 1; } else if (switch_index == min_start_index) { goto fini; } } while (hypercube_switches[min_curve][switch_index]-> avail_cnt == 0); } /* Find the next usable node in the switch */ do { cur_node_index++; node_index = hypercube_switches[min_curve][switch_index]-> node_index[cur_node_index]; } while (false == bit_test(avail_bitmap, node_index)); /* Allocate the CPUs from the node */ bit_set(bitmap, node_index); rem_cpus -= _get_avail_cpus(job_ptr, node_index); total_cpus += _get_total_cpus(node_index); rem_nodes--; alloc_nodes++; node_counter++; } fini: /* If we allocated sufficient CPUs and nodes, we were successful */ if ((rem_cpus <= 0) && (bit_set_count(bitmap) >= min_nodes)) { rc = SLURM_SUCCESS; /* Job's total_cpus is needed for SELECT_MODE_WILL_RUN */ job_ptr->total_cpus = total_cpus; } else { rc = EINVAL; if (alloc_nodes > max_nodes) { info("%pJ requires more nodes than allowed", job_ptr); } } xfree(req_squared_sums); xfree(req_summed_squares); FREE_NULL_BITMAP(req_nodes_bitmap); FREE_NULL_BITMAP(avail_bitmap); return rc; } /* * _job_test_dfly - A dragonfly topology aware version of _job_test() * NOTE: The logic here is almost identical to that of _eval_nodes_dfly() in * select/cons_res/job_test.c. Any bug found here is probably also there. 
*/ static int _job_test_dfly(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes) { bitstr_t **switches_bitmap; /* nodes on this switch */ int *switches_cpu_cnt; /* total CPUs on switch */ uint32_t *switches_node_cnt; /* total nodes on switch */ uint32_t *switches_node_use; /* nodes from switch used */ bitstr_t *req_nodes_bitmap = NULL; int rem_cpus; /* remaining resources desired */ int avail_cpus, total_cpus = 0; uint32_t want_nodes, alloc_nodes = 0; int i, j, rc = SLURM_SUCCESS; int best_fit_inx, first, last; int best_fit_nodes, best_fit_cpus; int best_fit_location = 0; bool sufficient; long time_waiting = 0; int leaf_switch_count = 0; /* Count of leaf node switches used */ if (job_ptr->req_switch > 1) { /* Maximum leaf switch count >1 probably makes no sense */ info("%s: Resetting %pJ leaf switch count from %u to 0", __func__, job_ptr, job_ptr->req_switch); job_ptr->req_switch = 0; } if (job_ptr->req_switch) { time_t time_now; time_now = time(NULL); if (job_ptr->wait4switch_start == 0) job_ptr->wait4switch_start = time_now; time_waiting = time_now - job_ptr->wait4switch_start; } rem_cpus = job_ptr->details->min_cpus; if (req_nodes > min_nodes) want_nodes = req_nodes; else want_nodes = min_nodes; /* Construct a set of switch array entries, * use the same indexes as switch_record_table in slurmctld */ switches_bitmap = xcalloc(switch_record_cnt, sizeof(bitstr_t *)); switches_cpu_cnt = xcalloc(switch_record_cnt, sizeof(int)); switches_node_cnt = xcalloc(switch_record_cnt, sizeof(uint32_t)); switches_node_use = xcalloc(switch_record_cnt, sizeof(int)); if (job_ptr->details->req_node_bitmap) { req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap); i = bit_set_count(req_nodes_bitmap); if (i > (int)max_nodes) { info("%pJ requires more nodes than currently available (%u>%u)", job_ptr, i, max_nodes); rc = EINVAL; goto fini; } } /* phase 1: make availability bitmaps for switches */ sufficient = false; for (i = 0; i < switch_record_cnt; i++) { switches_bitmap[i] = bit_copy(switch_record_table[i]. 
node_bitmap); bit_and(switches_bitmap[i], bitmap); if (req_nodes_bitmap && !bit_super_set(req_nodes_bitmap, switches_bitmap[i])) switches_node_cnt[i] = 0; else { switches_node_cnt[i] = bit_set_count(switches_bitmap[i]); sufficient = true; } } bit_nclear(bitmap, 0, node_record_count - 1); #if SELECT_DEBUG /* Don't compile this, it slows things down too much */ for (i = 0; i < switch_record_cnt; i++) { char *node_names = NULL; if (switches_node_cnt[i]) node_names = bitmap2node_name(switches_bitmap[i]); debug("switch=%s nodes=%u:%s speed=%u", switch_record_table[i].name, switches_node_cnt[i], node_names, switch_record_table[i].link_speed); xfree(node_names); } #endif /* check if requested nodes are available */ if (!sufficient) { info("%pJ requires nodes not available on any switch", job_ptr); rc = EINVAL; goto fini; } /* phase 2: calculate CPU resources for each switch */ for (i = 0; i < node_record_count; i++) { avail_cpus = _get_avail_cpus(job_ptr, i); for (j = 0; j < switch_record_cnt; j++) { if (bit_test(switches_bitmap[j], i)) { switches_cpu_cnt[j] += avail_cpus; } } } /* phase 3 */ /* Determine lowest level switch satisfying request with best fit */ best_fit_inx = -1; for (j = 0; j < switch_record_cnt; j++) { #if SELECT_DEBUG debug5("checking switch %d: nodes %u cpus %d", j, switches_node_cnt[j], switches_cpu_cnt[j]); #endif if ((switches_cpu_cnt[j] < rem_cpus) || (switches_node_cnt[j] < min_nodes)) continue; if ((best_fit_inx == -1) || (switch_record_table[j].level < switch_record_table[best_fit_inx].level) || ((switch_record_table[j].level == switch_record_table[best_fit_inx].level) && (switches_node_cnt[j] < switches_node_cnt[best_fit_inx]))) best_fit_inx = j; } if (best_fit_inx == -1) { debug("%s: could not find resources for %pJ", __func__, job_ptr); rc = EINVAL; goto fini; } /* phase 4: select resources from already allocated leaves */ /* Identify usable leafs (within higher switch having best fit) */ for (j = 0; j < switch_record_cnt; j++) { if ((switch_record_table[j].level > 0) || (!bit_super_set(switches_bitmap[j], switches_bitmap[best_fit_inx]))) { switches_node_cnt[j] = 0; } else if (req_nodes_bitmap) { /* we have subnodes count zeroed yet so count them */ switches_node_cnt[j] = bit_set_count(switches_bitmap[j]); } } /* set already allocated nodes and gather additional resources */ if (req_nodes_bitmap) { /* Accumulate specific required resources, if any */ for (j = 0; j < switch_record_cnt; j++) { if (alloc_nodes > max_nodes) break; if (switches_node_cnt[j] == 0 || bit_overlap_any(req_nodes_bitmap, switches_bitmap[j]) == 0) continue; /* Use nodes from this leaf */ first = bit_ffs(switches_bitmap[j]); if (first < 0) { switches_node_cnt[j] = 0; continue; } last = bit_fls(switches_bitmap[j]); for (i = first; i <= last; i++) { if (!bit_test(switches_bitmap[j], i)) continue; if (!bit_test(req_nodes_bitmap, i)) { /* node wasn't requested */ continue; } bit_clear(switches_bitmap[j], i); switches_node_cnt[j]--; avail_cpus = _get_avail_cpus(job_ptr, i); switches_cpu_cnt[j] -= avail_cpus; if (bit_test(bitmap, i)) { /* node on multiple leaf switches * and already selected */ continue; } switches_node_use[j]++; bit_set(bitmap, i); alloc_nodes++; rem_cpus -= avail_cpus; total_cpus += _get_total_cpus(i); } } } /* phase 5 */ /* Select resources from leafs on a best-fit or round-robin basis */ while ((alloc_nodes <= max_nodes) && ((alloc_nodes < want_nodes) || (rem_cpus > 0))) { best_fit_cpus = best_fit_nodes = 0; for (j = 0; j < switch_record_cnt; j++) { if (switches_node_cnt[j] == 0) 
continue; /* If multiple leaf switches must be used, prefer use * of leaf switches with fewest number of idle CPUs. * This results in more leaf switches being used and * achieves better network bandwidth. */ if ((best_fit_nodes == 0) || (switches_node_use[best_fit_location] > switches_node_use[j]) || ((switches_node_use[best_fit_location] == switches_node_use[j]) && (switches_cpu_cnt[j] < best_fit_cpus))) { best_fit_cpus = switches_cpu_cnt[j]; best_fit_nodes = switches_node_cnt[j]; best_fit_location = j; } } #if SELECT_DEBUG info("%s: found switch %d for allocation: nodes %d cpus %d " "allocated %u", __func__, best_fit_location, best_fit_nodes, best_fit_cpus, alloc_nodes); #endif if (best_fit_nodes == 0) break; /* Use select nodes from this leaf */ first = bit_ffs(switches_bitmap[best_fit_location]); if (first < 0) { switches_node_cnt[best_fit_location] = 0; continue; } last = bit_fls(switches_bitmap[best_fit_location]); for (i = first; i <= last; i++) { if (!bit_test(switches_bitmap[best_fit_location], i)) continue; if (bit_test(bitmap, i)) { /* node on multiple leaf switches * and already selected */ continue; } bit_clear(switches_bitmap[best_fit_location], i); switches_node_cnt[best_fit_location]--; switches_node_use[best_fit_location]++; bit_set(bitmap, i); alloc_nodes++; j = _get_avail_cpus(job_ptr, i); switches_cpu_cnt[best_fit_location] -= j; rem_cpus -= j; total_cpus += _get_total_cpus(i); break; } leaf_switch_count++; if (job_ptr->req_switch > 0) { if (time_waiting >= job_ptr->wait4switch) { job_ptr->best_switch = true; debug3("%pJ Waited %ld sec for switches use=%d", job_ptr, time_waiting, leaf_switch_count); } else if (leaf_switch_count > job_ptr->req_switch) { /* Allocation is for more than requested number * of switches */ job_ptr->best_switch = false; debug3("%pJ waited %ld sec for switches=%u found=%d wait %u", job_ptr, time_waiting, job_ptr->req_switch, leaf_switch_count, job_ptr->wait4switch); } else { job_ptr->best_switch = true; } } } if ((alloc_nodes <= max_nodes) && (rem_cpus <= 0) && (alloc_nodes >= min_nodes)) { rc = SLURM_SUCCESS; } else rc = EINVAL; fini: if (rc == SLURM_SUCCESS) { /* Job's total_cpus is needed for SELECT_MODE_WILL_RUN */ job_ptr->total_cpus = total_cpus; } else if (alloc_nodes > max_nodes) info("%pJ requires more nodes than allowed", job_ptr); FREE_NULL_BITMAP(req_nodes_bitmap); for (i = 0; i < switch_record_cnt; i++) FREE_NULL_BITMAP(switches_bitmap[i]); xfree(switches_bitmap); xfree(switches_cpu_cnt); xfree(switches_node_cnt); xfree(switches_node_use); return rc; } /* * _job_test_topo - A topology aware version of _job_test() * NOTE: The logic here is almost identical to that of _eval_nodes_topo() in * select/cons_res/job_test.c. Any bug found here is probably also there. 
 */
static int _job_test_topo(job_record_t *job_ptr, bitstr_t *bitmap,
			  uint32_t min_nodes, uint32_t max_nodes,
			  uint32_t req_nodes)
{
	bitstr_t **switches_bitmap;		/* nodes on this switch */
	int       *switches_cpu_cnt;		/* total CPUs on switch */
	uint32_t  *switches_node_cnt;		/* total nodes on switch */
	int       *switches_required;		/* set if has required node */

	bitstr_t *req_nodes_bitmap = NULL;
	int rem_cpus;			/* remaining resources desired */
	int avail_cpus, total_cpus = 0;
	uint32_t want_nodes, alloc_nodes = 0;
	int i, j, rc = SLURM_SUCCESS;
	int best_fit_inx, first, last;
	int best_fit_nodes, best_fit_cpus;
	int best_fit_location = 0;
	bool sufficient;
	long time_waiting = 0;
	int leaf_switch_count = 0;	/* Count of leaf node switches used */

	if (job_ptr->req_switch) {
		time_t time_now;
		time_now = time(NULL);
		if (job_ptr->wait4switch_start == 0)
			job_ptr->wait4switch_start = time_now;
		time_waiting = time_now - job_ptr->wait4switch_start;
	}

	rem_cpus = job_ptr->details->min_cpus;
	if (req_nodes > min_nodes)
		want_nodes = req_nodes;
	else
		want_nodes = min_nodes;

	/* Construct a set of switch array entries,
	 * use the same indexes as switch_record_table in slurmctld */
	switches_bitmap   = xcalloc(switch_record_cnt, sizeof(bitstr_t *));
	switches_cpu_cnt  = xcalloc(switch_record_cnt, sizeof(int));
	switches_node_cnt = xcalloc(switch_record_cnt, sizeof(uint32_t));
	switches_required = xcalloc(switch_record_cnt, sizeof(int));

	if (job_ptr->details->req_node_bitmap) {
		req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
		i = bit_set_count(req_nodes_bitmap);
		if (i > (int)max_nodes) {
			info("%pJ requires more nodes than currently available (%u>%u)",
			     job_ptr, i, max_nodes);
			rc = EINVAL;
			goto fini;
		}
	}

	/* phase 1: make availability bitmaps for switches */
	sufficient = false;
	for (i = 0; i < switch_record_cnt; i++) {
		switches_bitmap[i] = bit_copy(switch_record_table[i].
					      node_bitmap);
		bit_and(switches_bitmap[i], bitmap);
		if (req_nodes_bitmap &&
		    !bit_super_set(req_nodes_bitmap, switches_bitmap[i]))
			switches_node_cnt[i] = 0;
		else {
			switches_node_cnt[i] =
				bit_set_count(switches_bitmap[i]);
			sufficient = true;
		}
	}
	bit_nclear(bitmap, 0, node_record_count - 1);

#if SELECT_DEBUG
	/* Don't compile this, it slows things down too much */
	for (i = 0; i < switch_record_cnt; i++) {
		char *node_names = NULL;
		if (switches_node_cnt[i])
			node_names = bitmap2node_name(switches_bitmap[i]);
		debug("switch=%s nodes=%u:%s speed=%u",
		      switch_record_table[i].name, switches_node_cnt[i],
		      node_names, switch_record_table[i].link_speed);
		xfree(node_names);
	}
#endif

	/* check if requested nodes are available */
	if (!sufficient) {
		info("%pJ requires nodes not available on any switch",
		     job_ptr);
		rc = EINVAL;
		goto fini;
	}

	/* phase 2: calculate CPU resources for each switch */
	for (i = 0; i < node_record_count; i++) {
		avail_cpus = _get_avail_cpus(job_ptr, i);
		for (j = 0; j < switch_record_cnt; j++) {
			if (bit_test(switches_bitmap[j], i)) {
				switches_cpu_cnt[j] += avail_cpus;
			}
		}
	}

	/* phase 3 */
	/* Determine lowest level switch satisfying request with best fit */
	best_fit_inx = -1;
	for (j = 0; j < switch_record_cnt; j++) {
		if ((switches_cpu_cnt[j] < rem_cpus) ||
		    (switches_node_cnt[j] < min_nodes))
			continue;
		if ((best_fit_inx == -1) ||
		    (switch_record_table[j].level <
		     switch_record_table[best_fit_inx].level) ||
		    ((switch_record_table[j].level ==
		      switch_record_table[best_fit_inx].level) &&
		     (switches_node_cnt[j] < switches_node_cnt[best_fit_inx])))
			best_fit_inx = j;
	}
	if (best_fit_inx == -1) {
		debug("%s: could not find resources for %pJ",
		      __func__, job_ptr);
		rc = EINVAL;
		goto fini;
	}

	/* phase 4: select resources from already allocated leaves */
	/* Identify usable leafs (within higher switch having best fit) */
	for (j = 0; j < switch_record_cnt; j++) {
		if ((switch_record_table[j].level > 0) ||
		    (!bit_super_set(switches_bitmap[j],
				    switches_bitmap[best_fit_inx]))) {
			switches_node_cnt[j] = 0;
		} else if (req_nodes_bitmap) {
			/* the node counts were zeroed above,
			 * so recount them here */
			switches_node_cnt[j] =
				bit_set_count(switches_bitmap[j]);
		}
	}

	/* set already allocated nodes and gather additional resources */
	if (req_nodes_bitmap) {
		/* Accumulate specific required resources, if any */
		for (j = 0; j < switch_record_cnt; j++) {
			if (alloc_nodes > max_nodes)
				break;
			if (switches_node_cnt[j] == 0 ||
			    bit_overlap_any(req_nodes_bitmap,
					    switches_bitmap[j]) == 0)
				continue;

			/* Use nodes from this leaf */
			first = bit_ffs(switches_bitmap[j]);
			if (first < 0) {
				switches_node_cnt[j] = 0;
				continue;
			}
			last = bit_fls(switches_bitmap[j]);
			for (i = first; i <= last; i++) {
				if (!bit_test(switches_bitmap[j], i))
					continue;
				if (!bit_test(req_nodes_bitmap, i)) {
					/* node wasn't requested */
					continue;
				}
				bit_clear(switches_bitmap[j], i);
				switches_node_cnt[j]--;
				avail_cpus = _get_avail_cpus(job_ptr, i);
				switches_cpu_cnt[j] -= avail_cpus;
				if (bit_test(bitmap, i)) {
					/* node on multiple leaf switches
					 * and already selected */
					continue;
				}
				switches_required[j] = 1;
				bit_set(bitmap, i);
				alloc_nodes++;
				rem_cpus -= avail_cpus;
				total_cpus += _get_total_cpus(i);
			}
		}
		/* Accumulate additional resources from leafs that
		 * contain required nodes */
		for (j = 0; j < switch_record_cnt; j++) {
			if ((alloc_nodes > max_nodes) ||
			    ((alloc_nodes >= want_nodes) &&
			     (rem_cpus <= 0)))
				break;
			if (switches_required[j] == 0)
				continue;

			/* Use nodes from this leaf */
			first = bit_ffs(switches_bitmap[j]);
			if (first < 0) {
				switches_node_cnt[j] = 0;
				continue;
			}
			last = bit_fls(switches_bitmap[j]);
			for (i = first; i <= last; i++) {
				if (!bit_test(switches_bitmap[j], i))
					continue;
				/* No need to reset any counters for switch j
				 * here; it is disabled after this loop by
				 * setting switches_node_cnt[j] to 0 */
				if (bit_test(bitmap, i)) {
					/* node on multiple leaf switches
					 * and already selected */
					continue;
				}
				bit_set(bitmap, i);
				alloc_nodes++;
				rem_cpus -= _get_avail_cpus(job_ptr, i);
				total_cpus += _get_total_cpus(i);
				if ((alloc_nodes > max_nodes) ||
				    ((alloc_nodes >= want_nodes) &&
				     (rem_cpus <= 0)))
					break;
			}
			switches_node_cnt[j] = 0;	/* it's used up */
		}
	}
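	/*
	 * Illustrative sketch of phases 3-5 (hypothetical numbers, not part
	 * of the original source): suppose the job still needs 5 nodes and
	 * 20 CPUs, and one top-level switch sits above three leaf switches:
	 *
	 *	leaf	usable nodes	available CPUs
	 *	s0	2		8
	 *	s1	4		16
	 *	s2	3		12
	 *
	 * No single leaf can hold the job, so phase 3 picks the top-level
	 * switch as best_fit_inx and phase 4 keeps every leaf below it
	 * usable.  Phase 5 below then repeatedly drains the usable leaf
	 * with the fewest available CPUs: first s0 (2 nodes, 8 CPUs), then
	 * s2 (3 nodes, 12 CPUs), satisfying the request with two leaf
	 * switches and leaving s1 untouched.
	 */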
	/* phase 5 */
	/* Select resources from these leafs on a best-fit basis */
	/* Compute best-switch nodes available array */
	while ((alloc_nodes <= max_nodes) &&
	       ((alloc_nodes < want_nodes) || (rem_cpus > 0))) {
		best_fit_cpus = best_fit_nodes = 0;
		for (j = 0; j < switch_record_cnt; j++) {
			if (switches_node_cnt[j] == 0)
				continue;
			/* If multiple leaf switches must be used, prefer
			 * the leaf switch with the fewest available CPUs */
			if ((best_fit_nodes == 0) ||
			    (switches_cpu_cnt[j] < best_fit_cpus)) {
				best_fit_cpus  = switches_cpu_cnt[j];
				best_fit_nodes = switches_node_cnt[j];
				best_fit_location = j;
			}
		}
		if (best_fit_nodes == 0)
			break;

		/* Use select nodes from this leaf */
		first = bit_ffs(switches_bitmap[best_fit_location]);
		if (first < 0) {
			switches_node_cnt[best_fit_location] = 0;
			continue;
		}
		last = bit_fls(switches_bitmap[best_fit_location]);
		for (i = first; i <= last; i++) {
			if (!bit_test(switches_bitmap[best_fit_location], i))
				continue;
			if (bit_test(bitmap, i)) {
				/* node on multiple leaf switches
				 * and already selected */
				continue;
			}
			bit_set(bitmap, i);
			alloc_nodes++;
			rem_cpus -= _get_avail_cpus(job_ptr, i);
			total_cpus += _get_total_cpus(i);
			if ((alloc_nodes > max_nodes) ||
			    ((alloc_nodes >= want_nodes) &&
			     (rem_cpus <= 0)))
				break;
		}
		switches_node_cnt[best_fit_location] = 0;
		leaf_switch_count++;
		if (job_ptr->req_switch > 0) {
			if (time_waiting >= job_ptr->wait4switch) {
				job_ptr->best_switch = true;
				debug3("%pJ Waited %ld sec for switches use=%d",
				       job_ptr, time_waiting,
				       leaf_switch_count);
			} else if (leaf_switch_count > job_ptr->req_switch) {
				/* Allocation is for more than the requested
				 * number of switches */
				job_ptr->best_switch = false;
				debug3("%pJ waited %ld sec for switches=%u found=%d wait %u",
				       job_ptr, time_waiting,
				       job_ptr->req_switch,
				       leaf_switch_count,
				       job_ptr->wait4switch);
			} else {
				job_ptr->best_switch = true;
			}
		}
	}
	if ((alloc_nodes <= max_nodes) && (rem_cpus <= 0) &&
	    (alloc_nodes >= min_nodes)) {
		rc = SLURM_SUCCESS;
	} else
		rc = EINVAL;

fini:	if (rc == SLURM_SUCCESS) {
		/* Job's total_cpus is needed for SELECT_MODE_WILL_RUN */
		job_ptr->total_cpus = total_cpus;
	} else if (alloc_nodes > max_nodes)
		info("%pJ requires more nodes than allowed", job_ptr);
	FREE_NULL_BITMAP(req_nodes_bitmap);
	for (i = 0; i < switch_record_cnt; i++)
		FREE_NULL_BITMAP(switches_bitmap[i]);
	xfree(switches_bitmap);
	xfree(switches_cpu_cnt);
	xfree(switches_node_cnt);
	xfree(switches_required);

	return rc;
}

/*
 * Deallocate resources that were assigned to this job.
 *
 * if remove_all = false: the job has been suspended, so just deallocate CPUs
 * if remove_all = true: deallocate all resources
 */
static int _rm_job_from_nodes(struct cr_record *cr_ptr,
			      job_record_t *job_ptr, char *pre_err,
			      bool remove_all, bool job_fini)
{
	int i, i_first, i_last, node_offset, rc = SLURM_SUCCESS;
	struct part_cr_record *part_cr_ptr;
	job_resources_t *job_resrcs_ptr;
	uint64_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
	uint16_t cpu_cnt;
	node_record_t *node_ptr;
	bool exclusive, is_job_running;
	List gres_list;
	bool old_job = false;

	if (cr_ptr == NULL) {
		error("%s: cr_ptr not initialized", pre_err);
		return SLURM_ERROR;
	}

	if (_test_tot_job(cr_ptr, job_ptr->job_id) == 0) {
		info("%s: %pJ has no resources allocated",
		     plugin_type, job_ptr);
		return SLURM_ERROR;
	}

	if (job_ptr->start_time < slurmctld_config.boot_time)
		old_job = true;

	if (remove_all && job_ptr->details &&
	    job_ptr->details->pn_min_memory && (cr_type & CR_MEMORY)) {
		if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
			job_memory_cpu = job_ptr->details->pn_min_memory &
				(~MEM_PER_CPU);
		} else
			job_memory_node = job_ptr->details->pn_min_memory;
	}

	if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
		error("%pJ lacks a job_resources struct", job_ptr);
		return SLURM_ERROR;
	}

	is_job_running = _rem_run_job(cr_ptr, job_ptr->job_id);
	exclusive = (job_ptr->details->share_res == 0);
	i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
	i_last  = bit_fls(job_resrcs_ptr->node_bitmap);
	if (i_first == -1)	/* job has no nodes */
		i_last = -2;
	node_offset = -1;
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(job_resrcs_ptr->node_bitmap, i))
			continue;
		node_offset++;
		if (!job_ptr->node_bitmap || !bit_test(job_ptr->node_bitmap, i))
			continue;

		node_ptr = node_record_table_ptr + i;
		cpu_cnt = node_ptr->config_ptr->cpus;
		if (job_memory_cpu)
			job_memory = job_memory_cpu * cpu_cnt;
		else
			job_memory = job_memory_node;
		if (cr_ptr->nodes[i].alloc_memory >= job_memory)
			cr_ptr->nodes[i].alloc_memory -= job_memory;
		else {
			debug("%s: memory underflow for node %s",
			      pre_err, node_ptr->name);
			cr_ptr->nodes[i].alloc_memory = 0;
		}

		if (remove_all) {
			if (cr_ptr->nodes[i].gres_list)
				gres_list = cr_ptr->nodes[i].gres_list;
			else
				gres_list = node_ptr->gres_list;
			gres_plugin_job_dealloc(job_ptr->gres_list, gres_list,
						node_offset, job_ptr->job_id,
						node_ptr->name, old_job,
						job_ptr->user_id, job_fini);
			gres_plugin_node_state_log(gres_list, node_ptr->name);
		}

		if (exclusive) {
			if (cr_ptr->nodes[i].exclusive_cnt)
				cr_ptr->nodes[i].exclusive_cnt--;
			else {
				error("%s: 
exclusive_cnt underflow for " "node %s", pre_err, node_ptr->name); } } part_cr_ptr = cr_ptr->nodes[i].parts; while (part_cr_ptr) { if (part_cr_ptr->part_ptr != job_ptr->part_ptr) { part_cr_ptr = part_cr_ptr->next; continue; } if (!is_job_running) /* cancelled job already suspended */; else if (part_cr_ptr->run_job_cnt > 0) part_cr_ptr->run_job_cnt--; else { error("%s: run_job_cnt underflow for node %s", pre_err, node_ptr->name); } if (remove_all) { if (part_cr_ptr->tot_job_cnt > 0) part_cr_ptr->tot_job_cnt--; else { error("%s: tot_job_cnt underflow " "for node %s", pre_err, node_ptr->name); } if ((part_cr_ptr->tot_job_cnt == 0) && (part_cr_ptr->run_job_cnt)) { part_cr_ptr->run_job_cnt = 0; error("%s: run_job_cnt out of sync " "for node %s", pre_err, node_ptr->name); } } break; } if (part_cr_ptr == NULL) { if (job_ptr->part_nodes_missing) { ; } else if (job_ptr->part_ptr) { info("%s: %pJ and its partition %s no longer contain node %s", pre_err, job_ptr, job_ptr->partition, node_ptr->name); } else { info("%s: %pJ has no pointer to partition %s and node %s", pre_err, job_ptr, job_ptr->partition, node_ptr->name); } job_ptr->part_nodes_missing = true; rc = SLURM_ERROR; } } return rc; } /* Move all resources from one job to another */ static int _job_expand(job_record_t *from_job_ptr, job_record_t *to_job_ptr) { int i, node_cnt, rc = SLURM_SUCCESS; node_record_t *node_ptr; job_resources_t *from_job_resrcs_ptr, *to_job_resrcs_ptr, *new_job_resrcs_ptr; bool from_node_used, to_node_used; int from_node_offset, to_node_offset, new_node_offset; int first_bit, last_bit; bitstr_t *tmp_bitmap, *tmp_bitmap2; xassert(from_job_ptr); xassert(to_job_ptr); if (cr_ptr == NULL) { error("%s: cr_ptr not initialized", plugin_type); return SLURM_ERROR; } if (from_job_ptr->job_id == to_job_ptr->job_id) { error("%s: attempt to merge %pJ with self", plugin_type, from_job_ptr); return SLURM_ERROR; } if (_test_tot_job(cr_ptr, from_job_ptr->job_id) == 0) { info("%s: %pJ has no resources allocated", plugin_type, from_job_ptr); return SLURM_ERROR; } if (_test_tot_job(cr_ptr, to_job_ptr->job_id) == 0) { info("%s: %pJ has no resources allocated", plugin_type, to_job_ptr); return SLURM_ERROR; } from_job_resrcs_ptr = from_job_ptr->job_resrcs; if ((from_job_resrcs_ptr == NULL) || (from_job_resrcs_ptr->cpus == NULL) || (from_job_resrcs_ptr->node_bitmap == NULL)) { error("%s: %pJ lacks a job_resources struct", plugin_type, from_job_ptr); return SLURM_ERROR; } to_job_resrcs_ptr = to_job_ptr->job_resrcs; if ((to_job_resrcs_ptr == NULL) || (to_job_resrcs_ptr->cpus == NULL) || (to_job_resrcs_ptr->node_bitmap == NULL)) { error("%s: %pJ lacks a job_resources struct", plugin_type, to_job_ptr); return SLURM_ERROR; } (void) _rm_job_from_nodes(cr_ptr, from_job_ptr, "select_p_job_expand", true, true); (void) _rm_job_from_nodes(cr_ptr, to_job_ptr, "select_p_job_expand", true, true); if (to_job_resrcs_ptr->core_bitmap_used) { i = bit_size(to_job_resrcs_ptr->core_bitmap_used); bit_nclear(to_job_resrcs_ptr->core_bitmap_used, 0, i-1); } tmp_bitmap = bit_copy(to_job_resrcs_ptr->node_bitmap); bit_or(tmp_bitmap, from_job_resrcs_ptr->node_bitmap); tmp_bitmap2 = bit_copy(to_job_ptr->node_bitmap); bit_or(tmp_bitmap2, from_job_ptr->node_bitmap); bit_and(tmp_bitmap, tmp_bitmap2); bit_free(tmp_bitmap2); node_cnt = bit_set_count(tmp_bitmap); new_job_resrcs_ptr = _create_job_resources(node_cnt); new_job_resrcs_ptr->ncpus = from_job_resrcs_ptr->ncpus + to_job_resrcs_ptr->ncpus; new_job_resrcs_ptr->node_req = to_job_resrcs_ptr->node_req; 
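	/*
	 * Worked illustration of the merge above (hypothetical node indices,
	 * not from the original source):
	 *	tmp_bitmap = (from_resrcs | to_resrcs) & (from_job | to_job)
	 * If the "from" job was allocated nodes {0,1} but node 1 has since
	 * been downed (cleared from its job bitmap), and the "to" job holds
	 * nodes {2,3}, then tmp_bitmap = {0,1,2,3} & {0,2,3} = {0,2,3}:
	 * the union of both allocations, restricted to nodes still in
	 * active use by at least one of the two jobs.
	 */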
new_job_resrcs_ptr->node_bitmap = tmp_bitmap; new_job_resrcs_ptr->nodes = bitmap2node_name(new_job_resrcs_ptr-> node_bitmap); build_job_resources(new_job_resrcs_ptr, node_record_table_ptr); xfree(to_job_ptr->node_addr); to_job_ptr->node_addr = xcalloc(node_cnt, sizeof(slurm_addr_t)); to_job_ptr->total_cpus = 0; first_bit = MIN(bit_ffs(from_job_resrcs_ptr->node_bitmap), bit_ffs(to_job_resrcs_ptr->node_bitmap)); last_bit = MAX(bit_fls(from_job_resrcs_ptr->node_bitmap), bit_fls(to_job_resrcs_ptr->node_bitmap)); from_node_offset = to_node_offset = new_node_offset = -1; for (i = first_bit; i <= last_bit; i++) { from_node_used = to_node_used = false; if (bit_test(from_job_resrcs_ptr->node_bitmap, i)) { from_node_used = bit_test(from_job_ptr->node_bitmap,i); from_node_offset++; } if (bit_test(to_job_resrcs_ptr->node_bitmap, i)) { to_node_used = bit_test(to_job_ptr->node_bitmap, i); to_node_offset++; } if (!from_node_used && !to_node_used) continue; new_node_offset++; node_ptr = node_record_table_ptr + i; memcpy(&to_job_ptr->node_addr[new_node_offset], &node_ptr->slurm_addr, sizeof(slurm_addr_t)); if (from_node_used) { /* Merge alloc info from both "from" and "to" jobs, * leave "from" job with no allocated CPUs or memory */ new_job_resrcs_ptr->cpus[new_node_offset] = from_job_resrcs_ptr->cpus[from_node_offset]; from_job_resrcs_ptr->cpus[from_node_offset] = 0; /* new_job_resrcs_ptr->cpus_used[new_node_offset] = from_job_resrcs_ptr-> cpus_used[from_node_offset]; Should be 0 */ new_job_resrcs_ptr->memory_allocated[new_node_offset] = from_job_resrcs_ptr-> memory_allocated[from_node_offset]; /* new_job_resrcs_ptr->memory_used[new_node_offset] = from_job_resrcs_ptr-> memory_used[from_node_offset]; Should be 0 */ job_resources_bits_copy(new_job_resrcs_ptr, new_node_offset, from_job_resrcs_ptr, from_node_offset); } if (to_node_used) { /* Merge alloc info from both "from" and "to" jobs */ /* DO NOT double count the allocated CPUs in partition * with Shared nodes */ new_job_resrcs_ptr->cpus[new_node_offset] = to_job_resrcs_ptr->cpus[to_node_offset]; new_job_resrcs_ptr->cpus_used[new_node_offset] += to_job_resrcs_ptr->cpus_used[to_node_offset]; new_job_resrcs_ptr->memory_allocated[new_node_offset]+= to_job_resrcs_ptr-> memory_allocated[to_node_offset]; new_job_resrcs_ptr->memory_used[new_node_offset] += to_job_resrcs_ptr->memory_used[to_node_offset]; job_resources_bits_copy(new_job_resrcs_ptr, new_node_offset, to_job_resrcs_ptr, to_node_offset); } to_job_ptr->total_cpus += new_job_resrcs_ptr-> cpus[new_node_offset]; } build_job_resources_cpu_array(new_job_resrcs_ptr); gres_plugin_job_merge(from_job_ptr->gres_list, from_job_resrcs_ptr->node_bitmap, to_job_ptr->gres_list, to_job_resrcs_ptr->node_bitmap); /* Now swap data: "new" -> "to" and clear "from" */ free_job_resources(&to_job_ptr->job_resrcs); to_job_ptr->job_resrcs = new_job_resrcs_ptr; to_job_ptr->cpu_cnt = to_job_ptr->total_cpus; if (to_job_ptr->details) { to_job_ptr->details->min_cpus = to_job_ptr->total_cpus; to_job_ptr->details->max_cpus = to_job_ptr->total_cpus; } from_job_ptr->total_cpus = 0; from_job_resrcs_ptr->ncpus = 0; if (from_job_ptr->details) { from_job_ptr->details->min_cpus = 0; from_job_ptr->details->max_cpus = 0; } from_job_ptr->total_nodes = 0; from_job_resrcs_ptr->nhosts = 0; from_job_ptr->node_cnt = 0; if (from_job_ptr->details) from_job_ptr->details->min_nodes = 0; to_job_ptr->total_nodes = new_job_resrcs_ptr->nhosts; to_job_ptr->node_cnt = new_job_resrcs_ptr->nhosts; bit_or(to_job_ptr->node_bitmap, from_job_ptr->node_bitmap); 
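	/*
	 * Hedged usage note (not from the original source): at this point
	 * the "to" job owns the union of both node bitmaps and the "from"
	 * job's CPU/node counters have been zeroed; the code below also
	 * clears the "from" job's bitmaps and node-name strings.  This path
	 * is typically reached when one job is merged into another, for
	 * example a job submitted with --dependency=expand:<jobid>, assuming
	 * that expansion workflow is in use.
	 */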
	bit_nclear(from_job_ptr->node_bitmap, 0, (node_record_count - 1));
	bit_nclear(from_job_resrcs_ptr->node_bitmap, 0,
		   (node_record_count - 1));

	xfree(to_job_ptr->nodes);
	to_job_ptr->nodes = xstrdup(new_job_resrcs_ptr->nodes);
	xfree(from_job_ptr->nodes);
	from_job_ptr->nodes = xstrdup("");
	xfree(from_job_resrcs_ptr->nodes);
	from_job_resrcs_ptr->nodes = xstrdup("");

	_add_job_to_nodes(cr_ptr, to_job_ptr, "select_p_job_expand", 1);

	return rc;
}

/* Decrement a partition's running and total job counts as needed to enforce
 * the limit of jobs per node per partition (the partition's Shared=#
 * parameter) */
static int _decr_node_job_cnt(int node_inx, job_record_t *job_ptr,
			      char *pre_err)
{
	node_record_t *node_ptr = node_record_table_ptr + node_inx;
	struct part_cr_record *part_cr_ptr;
	bool exclusive = false, is_job_running;

	if (job_ptr->details)
		exclusive = (job_ptr->details->share_res == 0);
	if (exclusive) {
		if (cr_ptr->nodes[node_inx].exclusive_cnt)
			cr_ptr->nodes[node_inx].exclusive_cnt--;
		else {
			error("%s: exclusive_cnt underflow for node %s",
			      pre_err, node_ptr->name);
		}
	}

	is_job_running = _test_run_job(cr_ptr, job_ptr->job_id);
	part_cr_ptr = cr_ptr->nodes[node_inx].parts;
	while (part_cr_ptr) {
		if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
			part_cr_ptr = part_cr_ptr->next;
			continue;
		}
		if (!is_job_running)
			/* cancelled job already suspended */;
		else if (part_cr_ptr->run_job_cnt > 0)
			part_cr_ptr->run_job_cnt--;
		else {
			error("%s: run_job_cnt underflow for node %s",
			      pre_err, node_ptr->name);
		}
		if (part_cr_ptr->tot_job_cnt > 0)
			part_cr_ptr->tot_job_cnt--;
		else {
			error("%s: tot_job_cnt underflow for node %s",
			      pre_err, node_ptr->name);
		}
		if ((part_cr_ptr->tot_job_cnt == 0) &&
		    (part_cr_ptr->run_job_cnt)) {
			part_cr_ptr->run_job_cnt = 0;
			error("%s: run_job_cnt out of sync for node %s",
			      pre_err, node_ptr->name);
		}
		return SLURM_SUCCESS;
	}

	if (job_ptr->part_ptr) {
		error("%s: Could not find partition %s for node %s",
		      pre_err, job_ptr->part_ptr->name, node_ptr->name);
	} else {
		error("%s: no partition ptr given for %pJ and node %s",
		      pre_err, job_ptr, node_ptr->name);
	}
	return SLURM_ERROR;
}

/*
 * deallocate resources that were assigned to this job on one node
 */
static int _rm_job_from_one_node(job_record_t *job_ptr,
				 node_record_t *node_ptr, char *pre_err)
{
	int i, node_inx, node_offset;
	job_resources_t *job_resrcs_ptr;
	uint64_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
	int first_bit;
	uint16_t cpu_cnt;
	List gres_list;
	bool old_job = false;

	if (cr_ptr == NULL) {
		error("%s: cr_ptr not initialized", pre_err);
		return SLURM_ERROR;
	}

	if (_test_tot_job(cr_ptr, job_ptr->job_id) == 0) {
		info("%s: %pJ has no resources allocated",
		     plugin_type, job_ptr);
		return SLURM_ERROR;
	}

	if (job_ptr->details && job_ptr->details->pn_min_memory &&
	    (cr_type & CR_MEMORY)) {
		if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
			job_memory_cpu = job_ptr->details->pn_min_memory &
				(~MEM_PER_CPU);
		} else
			job_memory_node = job_ptr->details->pn_min_memory;
	}
	if ((job_ptr->job_resrcs == NULL) ||
	    (job_ptr->job_resrcs->cpus == NULL)) {
		error("%pJ lacks a job_resources struct", job_ptr);
		return SLURM_ERROR;
	}
	job_resrcs_ptr = job_ptr->job_resrcs;
	node_inx = node_ptr - node_record_table_ptr;
	if (!bit_test(job_resrcs_ptr->node_bitmap, node_inx)) {
		error("%pJ allocated nodes (%s) which have been removed from slurm.conf",
		      job_ptr, node_ptr->name);
		return SLURM_ERROR;
	}
	first_bit = bit_ffs(job_resrcs_ptr->node_bitmap);
	node_offset = -1;
	for (i = first_bit; i <= node_inx; i++) {
		if (!bit_test(job_resrcs_ptr->node_bitmap, i))
			continue;
		node_offset++;
} if (job_resrcs_ptr->cpus[node_offset] == 0) { error("duplicate relinquish of node %s by %pJ", node_ptr->name, job_ptr); return SLURM_ERROR; } extract_job_resources_node(job_resrcs_ptr, node_offset); if (job_ptr->start_time < slurmctld_config.boot_time) old_job = true; cpu_cnt = node_ptr->config_ptr->cpus; if (job_memory_cpu) job_memory = job_memory_cpu * cpu_cnt; else job_memory = job_memory_node; if (cr_ptr->nodes[node_inx].alloc_memory >= job_memory) cr_ptr->nodes[node_inx].alloc_memory -= job_memory; else { cr_ptr->nodes[node_inx].alloc_memory = 0; error("%s: memory underflow for node %s", pre_err, node_ptr->name); } if (cr_ptr->nodes[node_inx].gres_list) gres_list = cr_ptr->nodes[node_inx].gres_list; else gres_list = node_ptr->gres_list; gres_plugin_job_dealloc(job_ptr->gres_list, gres_list, node_offset, job_ptr->job_id, node_ptr->name, old_job, job_ptr->user_id, true); gres_plugin_node_state_log(gres_list, node_ptr->name); return _decr_node_job_cnt(node_inx, job_ptr, pre_err); } /* * allocate resources to the given job * * if alloc_all = 0: the job has been suspended, so just re-allocate CPUs * if alloc_all = 1: allocate all resources (CPUs and memory) */ static int _add_job_to_nodes(struct cr_record *cr_ptr, job_record_t *job_ptr, char *pre_err, int alloc_all) { int i, i_first, i_last, node_cnt, node_offset, rc = SLURM_SUCCESS; bool exclusive; struct part_cr_record *part_cr_ptr; job_resources_t *job_resrcs_ptr; uint64_t job_memory_cpu = 0, job_memory_node = 0; uint16_t cpu_cnt; node_record_t *node_ptr; List gres_list; if (cr_ptr == NULL) { error("%s: cr_ptr not initialized", pre_err); return SLURM_ERROR; } if (alloc_all && job_ptr->details && job_ptr->details->pn_min_memory && (cr_type & CR_MEMORY)) { if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { job_memory_cpu = job_ptr->details->pn_min_memory & (~MEM_PER_CPU); } else job_memory_node = job_ptr->details->pn_min_memory; } if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) { error("%pJ lacks a job_resources struct", job_ptr); return SLURM_ERROR; } exclusive = (job_ptr->details->share_res == 0); if (alloc_all) _add_run_job(cr_ptr, job_ptr->job_id); _add_tot_job(cr_ptr, job_ptr->job_id); i_first = bit_ffs(job_resrcs_ptr->node_bitmap); i_last = bit_fls(job_resrcs_ptr->node_bitmap); node_cnt = bit_set_count(job_resrcs_ptr->node_bitmap); if (i_first == -1) /* job has no nodes */ i_last = -2; node_offset = -1; for (i = i_first; i <= i_last; i++) { if (!bit_test(job_resrcs_ptr->node_bitmap, i)) continue; node_offset++; if (!bit_test(job_ptr->node_bitmap, i)) continue; node_ptr = node_record_table_ptr + i; cpu_cnt = node_ptr->config_ptr->cpus; if (job_memory_cpu) { cr_ptr->nodes[i].alloc_memory += job_memory_cpu * cpu_cnt; } else cr_ptr->nodes[i].alloc_memory += job_memory_node; if (alloc_all) { if (cr_ptr->nodes[i].gres_list) gres_list = cr_ptr->nodes[i].gres_list; else gres_list = node_ptr->gres_list; gres_plugin_job_alloc(job_ptr->gres_list, gres_list, node_cnt, i, node_offset, job_ptr->job_id, node_ptr->name, NULL, job_ptr->user_id); gres_plugin_node_state_log(gres_list, node_ptr->name); } if (exclusive) cr_ptr->nodes[i].exclusive_cnt++; part_cr_ptr = cr_ptr->nodes[i].parts; while (part_cr_ptr) { if (part_cr_ptr->part_ptr != job_ptr->part_ptr) { part_cr_ptr = part_cr_ptr->next; continue; } if (alloc_all) part_cr_ptr->run_job_cnt++; part_cr_ptr->tot_job_cnt++; break; } if (part_cr_ptr == NULL) { info("%s: %pJ could not find partition %s for node %s", pre_err, job_ptr, job_ptr->partition, node_ptr->name); 
job_ptr->part_nodes_missing = true; rc = SLURM_ERROR; } } if (alloc_all) { gres_build_job_details(job_ptr->gres_list, &job_ptr->gres_detail_cnt, &job_ptr->gres_detail_str, &job_ptr->gres_used); } return rc; } static void _free_cr(struct cr_record *cr_ptr) { int i; struct part_cr_record *part_cr_ptr1, *part_cr_ptr2; if (cr_ptr == NULL) return; for (i = 0; i < select_node_cnt; i++) { part_cr_ptr1 = cr_ptr->nodes[i].parts; while (part_cr_ptr1) { part_cr_ptr2 = part_cr_ptr1->next; xfree(part_cr_ptr1); part_cr_ptr1 = part_cr_ptr2; } FREE_NULL_LIST(cr_ptr->nodes[i].gres_list); } xfree(cr_ptr->nodes); xfree(cr_ptr->run_job_ids); xfree(cr_ptr->tot_job_ids); xfree(cr_ptr); } static void _dump_node_cr(struct cr_record *cr_ptr) { #if SELECT_DEBUG int i; struct part_cr_record *part_cr_ptr; node_record_t *node_ptr; List gres_list; if ((cr_ptr == NULL) || (cr_ptr->nodes == NULL)) return; for (i = 0; i < cr_ptr->run_job_len; i++) { if (cr_ptr->run_job_ids[i]) info("Running JobId=%u", cr_ptr->run_job_ids[i]); } for (i = 0; i < cr_ptr->tot_job_len; i++) { if (cr_ptr->tot_job_ids[i]) info("Alloc JobId=%u", cr_ptr->tot_job_ids[i]); } for (i = 0; i < select_node_cnt; i++) { node_ptr = node_record_table_ptr + i; info("Node:%s exclusive_cnt:%u alloc_mem:%"PRIu64"", node_ptr->name, cr_ptr->nodes[i].exclusive_cnt, cr_ptr->nodes[i].alloc_memory); part_cr_ptr = cr_ptr->nodes[i].parts; while (part_cr_ptr) { info(" Part:%s run:%u tot:%u", part_cr_ptr->part_ptr->name, part_cr_ptr->run_job_cnt, part_cr_ptr->tot_job_cnt); part_cr_ptr = part_cr_ptr->next; } if (cr_ptr->nodes[i].gres_list) gres_list = cr_ptr->nodes[i].gres_list; else gres_list = node_ptr->gres_list; if (gres_list) gres_plugin_node_state_log(gres_list, node_ptr->name); } #endif } static struct cr_record *_dup_cr(struct cr_record *cr_ptr) { int i; struct cr_record *new_cr_ptr; struct part_cr_record *part_cr_ptr, *new_part_cr_ptr; node_record_t *node_ptr; List gres_list; if (cr_ptr == NULL) return NULL; new_cr_ptr = xmalloc(sizeof(struct cr_record)); new_cr_ptr->run_job_len = cr_ptr->run_job_len; i = sizeof(uint32_t) * cr_ptr->run_job_len; new_cr_ptr->run_job_ids = xmalloc(i); memcpy(new_cr_ptr->run_job_ids, cr_ptr->run_job_ids, i); new_cr_ptr->tot_job_len = cr_ptr->tot_job_len; i = sizeof(uint32_t) * cr_ptr->tot_job_len; new_cr_ptr->tot_job_ids = xmalloc(i); memcpy(new_cr_ptr->tot_job_ids, cr_ptr->tot_job_ids, i); new_cr_ptr->nodes = xcalloc(select_node_cnt, sizeof(struct node_cr_record)); for (i = 0; i < select_node_cnt; i++) { node_ptr = node_record_table_ptr + i; new_cr_ptr->nodes[i].alloc_memory = cr_ptr->nodes[i]. alloc_memory; new_cr_ptr->nodes[i].exclusive_cnt = cr_ptr->nodes[i]. exclusive_cnt; part_cr_ptr = cr_ptr->nodes[i].parts; while (part_cr_ptr) { new_part_cr_ptr = xmalloc(sizeof(struct part_cr_record)); new_part_cr_ptr->part_ptr = part_cr_ptr->part_ptr; new_part_cr_ptr->run_job_cnt = part_cr_ptr->run_job_cnt; new_part_cr_ptr->tot_job_cnt = part_cr_ptr->tot_job_cnt; new_part_cr_ptr->next = new_cr_ptr->nodes[i]. 
parts; new_cr_ptr->nodes[i].parts = new_part_cr_ptr; part_cr_ptr = part_cr_ptr->next; } if (cr_ptr->nodes[i].gres_list) gres_list = cr_ptr->nodes[i].gres_list; else gres_list = node_ptr->gres_list; new_cr_ptr->nodes[i].gres_list = gres_plugin_node_state_dup(gres_list); } return new_cr_ptr; } static void _init_node_cr(void) { part_record_t *part_ptr; struct part_cr_record *part_cr_ptr; job_resources_t *job_resrcs_ptr; node_record_t *node_ptr; ListIterator part_iterator; job_record_t *job_ptr; ListIterator job_iterator; uint64_t job_memory_cpu, job_memory_node; int exclusive, i, i_first, i_last, node_offset; if (cr_ptr) return; cr_ptr = xmalloc(sizeof(struct cr_record)); cr_ptr->nodes = xcalloc(select_node_cnt, sizeof(struct node_cr_record)); /* build partition records */ part_iterator = list_iterator_create(part_list); while ((part_ptr = list_next(part_iterator))) { for (i = 0; i < select_node_cnt; i++) { if (part_ptr->node_bitmap == NULL) break; if (!bit_test(part_ptr->node_bitmap, i)) continue; part_cr_ptr = xmalloc(sizeof(struct part_cr_record)); part_cr_ptr->next = cr_ptr->nodes[i].parts; part_cr_ptr->part_ptr = part_ptr; cr_ptr->nodes[i].parts = part_cr_ptr; } } list_iterator_destroy(part_iterator); /* Clear existing node Gres allocations */ for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count; i++, node_ptr++) { gres_plugin_node_state_dealloc_all(node_ptr->gres_list); } /* record running and suspended jobs in node_cr_records */ job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) continue; if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) { error("%pJ lacks a job_resources struct", job_ptr); continue; } if (IS_JOB_RUNNING(job_ptr) || (IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority != 0))) _add_run_job(cr_ptr, job_ptr->job_id); _add_tot_job(cr_ptr, job_ptr->job_id); job_memory_cpu = 0; job_memory_node = 0; if (job_ptr->details && job_ptr->details->pn_min_memory && (cr_type & CR_MEMORY)) { if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { job_memory_cpu = job_ptr->details-> pn_min_memory & (~MEM_PER_CPU); } else { job_memory_node = job_ptr->details-> pn_min_memory; } } /* Use job_resrcs_ptr->node_bitmap rather than * job_ptr->node_bitmap which can have DOWN nodes * cleared from the bitmap */ if (job_resrcs_ptr->node_bitmap == NULL) continue; if (job_ptr->details) exclusive = (job_ptr->details->share_res == 0); else exclusive = 0; node_offset = -1; i_first = bit_ffs(job_resrcs_ptr->node_bitmap); i_last = bit_fls(job_resrcs_ptr->node_bitmap); if (i_first == -1) i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(job_resrcs_ptr->node_bitmap, i)) continue; node_offset++; if (!bit_test(job_ptr->node_bitmap, i)) continue; /* node already released */ node_ptr = node_record_table_ptr + i; if (exclusive) cr_ptr->nodes[i].exclusive_cnt++; if (job_memory_cpu == 0) { cr_ptr->nodes[i].alloc_memory += job_memory_node; } else { cr_ptr->nodes[i].alloc_memory += job_memory_cpu * node_record_table_ptr[i]. 
				config_ptr->cpus;
			}
			if (bit_test(job_ptr->node_bitmap, i)) {
				gres_plugin_job_alloc(job_ptr->gres_list,
						      node_ptr->gres_list,
						      job_resrcs_ptr->nhosts,
						      i, node_offset,
						      job_ptr->job_id,
						      node_ptr->name, NULL,
						      job_ptr->user_id);
			}
			part_cr_ptr = cr_ptr->nodes[i].parts;
			while (part_cr_ptr) {
				if (part_cr_ptr->part_ptr !=
				    job_ptr->part_ptr) {
					part_cr_ptr = part_cr_ptr->next;
					continue;
				}
				if (IS_JOB_RUNNING(job_ptr) ||
				    (IS_JOB_SUSPENDED(job_ptr) &&
				     (job_ptr->priority != 0))) {
					/* Running or being gang scheduled */
					part_cr_ptr->run_job_cnt++;
				}
				part_cr_ptr->tot_job_cnt++;
				break;
			}
			if (part_cr_ptr == NULL) {
				info("%s: %pJ could not find partition %s for node %s",
				     __func__, job_ptr, job_ptr->partition,
				     node_ptr->name);
				job_ptr->part_nodes_missing = true;
			}
		}
	}
	list_iterator_destroy(job_iterator);
	_dump_node_cr(cr_ptr);
}

static int _find_job(void *x, void *key)
{
	job_record_t *job_ptr = (job_record_t *) x;

	if (job_ptr == (job_record_t *) key)
		return 1;
	return 0;
}

static bool _is_preemptable(job_record_t *job_ptr, List preemptee_candidates)
{
	if (!preemptee_candidates)
		return false;
	if (list_find_first(preemptee_candidates, _find_job, job_ptr))
		return true;
	return false;
}

/* Determine if a job can ever run */
static int _test_only(job_record_t *job_ptr, bitstr_t *bitmap,
		      uint32_t min_nodes, uint32_t max_nodes,
		      uint32_t req_nodes, int max_share)
{
	bitstr_t *orig_map;
	int i, rc = SLURM_ERROR;
	uint64_t save_mem;

	orig_map = bit_copy(bitmap);

	/* Try to run with currently available nodes */
	i = _job_count_bitmap(cr_ptr, job_ptr, orig_map, bitmap,
			      NO_SHARE_LIMIT, NO_SHARE_LIMIT,
			      SELECT_MODE_TEST_ONLY);
	if (i >= min_nodes) {
		save_mem = job_ptr->details->pn_min_memory;
		job_ptr->details->pn_min_memory = 0;
		rc = _job_test(job_ptr, bitmap, min_nodes,
			       max_nodes, req_nodes);
		job_ptr->details->pn_min_memory = save_mem;
	}
	FREE_NULL_BITMAP(orig_map);

	return rc;
}

/*
 * Sort the usable_node element to put jobs in the correct
 * preemption order.
 */
static int _sort_usable_nodes_dec(void *j1, void *j2)
{
	job_record_t *job_a = *(job_record_t **)j1;
	job_record_t *job_b = *(job_record_t **)j2;

	if (job_a->details->usable_nodes > job_b->details->usable_nodes)
		return -1;
	else if (job_a->details->usable_nodes < job_b->details->usable_nodes)
		return 1;
	return 0;
}

/* Allocate resources for a job now, if possible */
static int _run_now(job_record_t *job_ptr, bitstr_t *bitmap,
		    uint32_t min_nodes, uint32_t max_nodes,
		    int max_share, uint32_t req_nodes,
		    List preemptee_candidates, List *preemptee_job_list)
{
	bitstr_t *orig_map;
	int max_run_job, j, sus_jobs, rc = EINVAL, prev_cnt = -1;
	job_record_t *tmp_job_ptr;
	ListIterator job_iterator, preemptee_iterator;
	struct cr_record *exp_cr;
	uint16_t pass_count = 0;

	orig_map = bit_copy(bitmap);

	for (max_run_job = 0;
	     ((max_run_job < max_share) && (rc != SLURM_SUCCESS));
	     max_run_job++) {
		bool last_iteration = (max_run_job == (max_share - 1));
		for (sus_jobs = 0;
		     ((sus_jobs < 5) && (rc != SLURM_SUCCESS));
		     sus_jobs += 4) {
			if (last_iteration)
				sus_jobs = NO_SHARE_LIMIT;
			j = _job_count_bitmap(cr_ptr, job_ptr,
					      orig_map, bitmap,
					      max_run_job,
					      max_run_job + sus_jobs,
					      SELECT_MODE_RUN_NOW);
			if (j == prev_cnt)	/* no change */
				continue;
			prev_cnt = j;
			if (j < min_nodes)
				continue;
			if (max_run_job > 0) {
				/* We need to share. 
Try to find * suitable job to share nodes with */ rc = _find_job_mate(job_ptr, bitmap, min_nodes, max_nodes, req_nodes); if (rc == SLURM_SUCCESS) break; } rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes); } } top: if ((rc != SLURM_SUCCESS) && preemptee_candidates && (exp_cr = _dup_cr(cr_ptr))) { /* Remove all preemptable jobs from simulated environment */ job_iterator = list_iterator_create(preemptee_candidates); while ((tmp_job_ptr = list_next(job_iterator))) { bool remove_all = false; uint16_t mode; if (!IS_JOB_RUNNING(tmp_job_ptr) && !IS_JOB_SUSPENDED(tmp_job_ptr)) continue; mode = slurm_job_preempt_mode(tmp_job_ptr); if ((mode == PREEMPT_MODE_REQUEUE) || (mode == PREEMPT_MODE_CANCEL)) remove_all = true; /* Remove preemptable job now */ _rm_job_from_nodes(exp_cr, tmp_job_ptr, "_run_now", remove_all, false); j = _job_count_bitmap(exp_cr, job_ptr, orig_map, bitmap, (max_share - 1), NO_SHARE_LIMIT, SELECT_MODE_RUN_NOW); tmp_job_ptr->details->usable_nodes = bit_overlap(bitmap, tmp_job_ptr->node_bitmap); if (j < min_nodes) continue; rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes); /* * If successful, set the last job's usable count to a * large value so that it will be first after sorting. * Note: usable_count is only used for sorting purposes */ if (rc == SLURM_SUCCESS) { if (pass_count++ || (list_count(preemptee_candidates) == 1)) break; tmp_job_ptr->details->usable_nodes = 9999; while ((tmp_job_ptr = list_next(job_iterator))) { tmp_job_ptr->details->usable_nodes = 0; } list_sort(preemptee_candidates, (ListCmpF)_sort_usable_nodes_dec); rc = EINVAL; list_iterator_destroy(job_iterator); _free_cr(exp_cr); goto top; } } list_iterator_destroy(job_iterator); if ((rc == SLURM_SUCCESS) && preemptee_job_list && preemptee_candidates) { /* Build list of preemptee jobs whose resources are * actually used */ if (*preemptee_job_list == NULL) { *preemptee_job_list = list_create(NULL); } preemptee_iterator = list_iterator_create( preemptee_candidates); while ((tmp_job_ptr = list_next(preemptee_iterator))) { if (bit_overlap_any(bitmap, tmp_job_ptr-> node_bitmap) == 0) continue; if (tmp_job_ptr->details->usable_nodes == 0) continue; list_append(*preemptee_job_list, tmp_job_ptr); } list_iterator_destroy(preemptee_iterator); } _free_cr(exp_cr); } if (rc == SLURM_SUCCESS) _build_select_struct(job_ptr, bitmap); FREE_NULL_BITMAP(orig_map); return rc; } /* * Determine where and when the job at job_ptr can begin execution by updating * a scratch cr_record structure to reflect each job terminating at the * end of its time limit and use this to show where and when the job at job_ptr * will begin execution. Used by Slurm's sched/backfill plugin. 
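 * For example (an illustrative addition, not part of the original comment):
 * if running jobs A (end_time 10:00) and B (end_time 11:00) block the
 * pending job, the sorted loop below first simulates A's completion and
 * retests; if the job still cannot start it also simulates B's completion,
 * so the reported start_time is the end_time of the last job whose removal
 * allowed the test to succeed.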
*/ static int _will_run_test(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, int max_share, uint32_t req_nodes, List preemptee_candidates, List *preemptee_job_list) { struct cr_record *exp_cr; job_record_t *tmp_job_ptr; List cr_job_list; ListIterator job_iterator, preemptee_iterator; bitstr_t *orig_map; int i, max_run_jobs, rc = SLURM_ERROR; time_t now = time(NULL); max_run_jobs = MAX((max_share - 1), 1); /* exclude this job */ orig_map = bit_copy(bitmap); /* Try to run with currently available nodes */ i = _job_count_bitmap(cr_ptr, job_ptr, orig_map, bitmap, max_run_jobs, NO_SHARE_LIMIT, SELECT_MODE_WILL_RUN); if (i >= min_nodes) { rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes); if (rc == SLURM_SUCCESS) { FREE_NULL_BITMAP(orig_map); job_ptr->start_time = time(NULL); return SLURM_SUCCESS; } } /* Job is still pending. Simulate termination of jobs one at a time * to determine when and where the job can start. */ exp_cr = _dup_cr(cr_ptr); if (exp_cr == NULL) { FREE_NULL_BITMAP(orig_map); return SLURM_ERROR; } /* Build list of running and suspended jobs */ cr_job_list = list_create(NULL); job_iterator = list_iterator_create(job_list); while ((tmp_job_ptr = list_next(job_iterator))) { if (!IS_JOB_RUNNING(tmp_job_ptr) && !IS_JOB_SUSPENDED(tmp_job_ptr)) continue; if (tmp_job_ptr->end_time == 0) { error("%s: Active %pJ has zero end_time", __func__, tmp_job_ptr); continue; } if (tmp_job_ptr->node_bitmap == NULL) { error("%s: %pJ has NULL node_bitmap", __func__, tmp_job_ptr); continue; } if (!_is_preemptable(tmp_job_ptr, preemptee_candidates)) { /* Queue job for later removal from data structures */ list_append(cr_job_list, tmp_job_ptr); } else { uint16_t mode = slurm_job_preempt_mode(tmp_job_ptr); bool remove_all = false; if ((mode == PREEMPT_MODE_REQUEUE) || (mode == PREEMPT_MODE_CANCEL)) remove_all = true; /* Remove preemptable job now */ _rm_job_from_nodes(exp_cr, tmp_job_ptr, "_will_run_test", remove_all, false); } } list_iterator_destroy(job_iterator); /* Test with all preemptable jobs gone */ if (preemptee_candidates) { i = _job_count_bitmap(exp_cr, job_ptr, orig_map, bitmap, max_run_jobs, NO_SHARE_LIMIT, SELECT_MODE_RUN_NOW); if (i >= min_nodes) { rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes); if (rc == SLURM_SUCCESS) { /* Actual start time will actually be later * than "now", but return "now" for backfill * scheduler to initiate preemption. */ job_ptr->start_time = now; } } } /* Remove the running jobs one at a time from exp_node_cr and try * scheduling the pending job after each one */ if ((rc != SLURM_SUCCESS) && ((job_ptr->bit_flags & TEST_NOW_ONLY) == 0)) { list_sort(cr_job_list, _cr_job_list_sort); job_iterator = list_iterator_create(cr_job_list); while ((tmp_job_ptr = list_next(job_iterator))) { _rm_job_from_nodes(exp_cr, tmp_job_ptr, "_will_run_test", true, false); i = _job_count_bitmap(exp_cr, job_ptr, orig_map, bitmap, max_run_jobs, NO_SHARE_LIMIT, SELECT_MODE_RUN_NOW); if (i < min_nodes) continue; rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes, req_nodes); if (rc != SLURM_SUCCESS) continue; if (tmp_job_ptr->end_time <= now) job_ptr->start_time = now + 1; else job_ptr->start_time = tmp_job_ptr->end_time; break; } list_iterator_destroy(job_iterator); } if ((rc == SLURM_SUCCESS) && preemptee_job_list && preemptee_candidates) { /* Build list of preemptee jobs whose resources are * actually used. List returned even if not killed * in selected plugin, but by Moab or something else. 
 */
		if (*preemptee_job_list == NULL) {
			*preemptee_job_list = list_create(NULL);
		}
		preemptee_iterator = list_iterator_create(preemptee_candidates);
		while ((tmp_job_ptr = list_next(preemptee_iterator))) {
			if (bit_overlap_any(bitmap,
					    tmp_job_ptr->node_bitmap) == 0)
				continue;
			list_append(*preemptee_job_list, tmp_job_ptr);
		}
		list_iterator_destroy(preemptee_iterator);
	}

	FREE_NULL_LIST(cr_job_list);
	_free_cr(exp_cr);
	FREE_NULL_BITMAP(orig_map);
	return rc;
}

static int _cr_job_list_sort(void *x, void *y)
{
	job_record_t *job1_ptr = *(job_record_t **) x;
	job_record_t *job2_ptr = *(job_record_t **) y;

	return (int) SLURM_DIFFTIME(job1_ptr->end_time, job2_ptr->end_time);
}

/*
 * init() is called when the plugin is loaded, before any other functions
 * are called.  Put global initialization here.
 */
extern int init(void)
{
	char *topo_param;
	int rc = SLURM_SUCCESS;

	cr_type = slurmctld_conf.select_type_param;
	if (cr_type)
		verbose("%s loaded with argument %u", plugin_name, cr_type);

	topo_param = slurm_get_topology_param();
	if (topo_param) {
		if (xstrcasestr(topo_param, "dragonfly"))
			have_dragonfly = true;
		if (xstrcasestr(topo_param, "TopoOptional"))
			topo_optional = true;
		xfree(topo_param);
	}

	priority_flags = slurm_get_priority_flags();

	return rc;
}

extern int fini(void)
{
	int rc = SLURM_SUCCESS;

	cr_fini_global_core_data();
	slurm_mutex_lock(&cr_mutex);
	_free_cr(cr_ptr);
	cr_ptr = NULL;
	slurm_mutex_unlock(&cr_mutex);

	return rc;
}

/*
 * The remainder of this file implements the standard Slurm
 * node selection API.
 */

extern int select_p_state_save(char *dir_name)
{
	return SLURM_SUCCESS;
}

extern int select_p_state_restore(char *dir_name)
{
	return SLURM_SUCCESS;
}

/*
 * Note the initialization of job records, issued upon restart of
 * slurmctld and used to synchronize any job state.
 */
extern int select_p_job_init(List job_list_arg)
{
	return SLURM_SUCCESS;
}

extern bool select_p_node_ranking(node_record_t *node_ptr, int node_cnt)
{
	return false;
}

extern int select_p_node_init(node_record_t *node_ptr, int node_cnt)
{
	if (node_ptr == NULL) {
		error("select_p_node_init: node_ptr == NULL");
		return SLURM_ERROR;
	}
	if (node_cnt < 0) {
		error("select_p_node_init: node_cnt < 0");
		return SLURM_ERROR;
	}

	/* NOTE: We free the consumable resources info here, but
	 * can't rebuild it since the partition and node structures
	 * have not yet had node bitmaps reset. */
	slurm_mutex_lock(&cr_mutex);
	_free_cr(cr_ptr);
	cr_ptr = NULL;

	select_node_ptr = node_ptr;
	select_node_cnt = node_cnt;
	cr_init_global_core_data(node_ptr, node_cnt);
	slurm_mutex_unlock(&cr_mutex);

	return SLURM_SUCCESS;
}

/*
 * select_p_job_test - Given a specification of scheduling requirements,
 *	identify the nodes which "best" satisfy the request.
 *	"best" is defined as either a single set of consecutive nodes
 *	satisfying the request and leaving the minimum number of unused
 *	nodes OR the fewest number of consecutive node sets
 * IN/OUT job_ptr - pointer to job being considered for initiation,
 *	sets start_time when job is expected to start
 * IN/OUT bitmap - usable nodes are set on input, nodes not required to
 *	satisfy the request are cleared, others left set
 * IN min_nodes - minimum count of nodes
 * IN req_nodes - requested (or desired) count of nodes
 * IN max_nodes - maximum count of nodes
 * IN mode - SELECT_MODE_RUN_NOW: try to schedule job now
 *	SELECT_MODE_TEST_ONLY: test if job can ever run
 *	SELECT_MODE_WILL_RUN: determine when and where job can run
 * IN preemptee_candidates - List of pointers to jobs which can be preempted.
* IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the * jobs to be preempted to initiate the pending job. Not set * if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL. * IN exc_core_bitmap - bitmap of cores being reserved. * RET zero on success, EINVAL otherwise * globals (passed via select_p_node_init): * node_record_count - count of nodes configured * node_record_table_ptr - pointer to global node table * NOTE: the job information that is considered for scheduling includes: * req_node_bitmap: bitmap of specific nodes required by the job * contiguous: allocated nodes must be sequentially located * num_cpus: minimum number of processors required by the job * NOTE: bitmap must be a superset of the job's required at the time that * select_p_job_test is called */ extern int select_p_job_test(job_record_t *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, uint16_t mode, List preemptee_candidates, List *preemptee_job_list, bitstr_t *exc_core_bitmap) { int max_share = 0, rc = EINVAL; xassert(bitmap); if (job_ptr->details == NULL) return EINVAL; slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) { _init_node_cr(); if (cr_ptr == NULL) { slurm_mutex_unlock(&cr_mutex); error("select_p_job_test: cr_ptr not initialized"); return SLURM_ERROR; } } if (bit_set_count(bitmap) < min_nodes) { slurm_mutex_unlock(&cr_mutex); return EINVAL; } if (job_ptr->details->core_spec != NO_VAL16) { verbose("%s: %pJ core_spec(%u) not supported", plugin_type, job_ptr, job_ptr->details->core_spec); job_ptr->details->core_spec = NO_VAL16; } if (job_ptr->details->share_res) max_share = job_ptr->part_ptr->max_share & ~SHARED_FORCE; else /* ((shared == 0) || (shared == NO_VAL16)) */ max_share = 1; if (mode == SELECT_MODE_WILL_RUN) { rc = _will_run_test(job_ptr, bitmap, min_nodes, max_nodes, max_share, req_nodes, preemptee_candidates, preemptee_job_list); if (!job_ptr->best_switch) rc = SLURM_ERROR; } else if (mode == SELECT_MODE_TEST_ONLY) { rc = _test_only(job_ptr, bitmap, min_nodes, max_nodes, req_nodes, max_share); } else if (mode == SELECT_MODE_RUN_NOW) { rc = _run_now(job_ptr, bitmap, min_nodes, max_nodes, max_share, req_nodes, preemptee_candidates, preemptee_job_list); if (!job_ptr->best_switch) rc = SLURM_ERROR; } else fatal("select_p_job_test: Mode %d is invalid", mode); slurm_mutex_unlock(&cr_mutex); return rc; } /* * Note initiation of job is about to begin. Called immediately * after select_p_job_test(). Executed from slurmctld. 
* IN job_ptr - pointer to job being initiated */ extern int select_p_job_begin(job_record_t *job_ptr) { int rc = SLURM_SUCCESS; slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) _init_node_cr(); if (rc == SLURM_SUCCESS) rc = _add_job_to_nodes(cr_ptr, job_ptr, "select_p_job_begin", 1); gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id); slurm_mutex_unlock(&cr_mutex); return rc; } /* * Determine if allocated nodes are usable (powered up) * IN job_ptr - pointer to job being tested * RET -1 on error, 1 if ready to execute, 0 otherwise */ extern int select_p_job_ready(job_record_t *job_ptr) { int i, i_first, i_last; node_record_t *node_ptr; if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) { /* Gang scheduling might suspend job immediately */ return 0; } if ((job_ptr->node_bitmap == NULL) || ((i_first = bit_ffs(job_ptr->node_bitmap)) == -1)) return READY_NODE_STATE; i_last = bit_fls(job_ptr->node_bitmap); for (i = i_first; i <= i_last; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; node_ptr = node_record_table_ptr + i; if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr)) return 0; } return READY_NODE_STATE; } extern int select_p_job_expand(job_record_t *from_job_ptr, job_record_t *to_job_ptr) { int rc; slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) _init_node_cr(); rc = _job_expand(from_job_ptr, to_job_ptr); slurm_mutex_unlock(&cr_mutex); return rc; } /* * Modify internal data structures for a job that has changed size * Only support jobs shrinking now. * RET: 0 or an error code */ extern int select_p_job_resized(job_record_t *job_ptr, node_record_t *node_ptr) { int rc = SLURM_SUCCESS; slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) _init_node_cr(); _rm_job_from_one_node(job_ptr, node_ptr, "select_p_job_resized"); slurm_mutex_unlock(&cr_mutex); return rc; } extern int select_p_job_signal(job_record_t *job_ptr, int signal) { return SLURM_SUCCESS; } extern int select_p_job_mem_confirm(job_record_t *job_ptr) { return SLURM_SUCCESS; } /* * Note termination of job is starting. Executed from slurmctld. * IN job_ptr - pointer to job being terminated */ extern int select_p_job_fini(job_record_t *job_ptr) { int rc = SLURM_SUCCESS; slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) _init_node_cr(); if (_rm_job_from_nodes(cr_ptr, job_ptr, "select_p_job_fini", true, true) != SLURM_SUCCESS) rc = SLURM_ERROR; slurm_mutex_unlock(&cr_mutex); return rc; } /* * Suspend a job. Executed from slurmctld. * IN job_ptr - pointer to job being suspended * IN indf_susp - set if job is being suspended indefinitely by user * or admin, otherwise suspended for gang scheduling * RET SLURM_SUCCESS or error code */ extern int select_p_job_suspend(job_record_t *job_ptr, bool indf_susp) { int rc; if (!indf_susp) return SLURM_SUCCESS; slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) _init_node_cr(); rc = _rm_job_from_nodes(cr_ptr, job_ptr, "select_p_job_suspend", false, false); slurm_mutex_unlock(&cr_mutex); return rc; } /* * Resume a job. Executed from slurmctld. 
* IN job_ptr - pointer to job being resumed * IN indf_susp - set if job is being resumed from indefinite suspend by user * or admin, otherwise resume from gang scheduling * RET SLURM_SUCCESS or error code */ extern int select_p_job_resume(job_record_t *job_ptr, bool indf_susp) { int rc; if (!indf_susp) return SLURM_SUCCESS; slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) _init_node_cr(); rc = _add_job_to_nodes(cr_ptr, job_ptr, "select_p_job_resume", 0); slurm_mutex_unlock(&cr_mutex); return rc; } extern bitstr_t *select_p_step_pick_nodes(job_record_t *job_ptr, select_jobinfo_t *jobinfo, uint32_t node_count, bitstr_t **avail_nodes) { return NULL; } extern int select_p_step_start(step_record_t *step_ptr) { return SLURM_SUCCESS; } extern int select_p_step_finish(step_record_t *step_ptr, bool killing_step) { return SLURM_SUCCESS; } extern int select_p_select_nodeinfo_pack(select_nodeinfo_t *nodeinfo, Buf buffer, uint16_t protocol_version) { select_nodeinfo_t *nodeinfo_empty = NULL; if (!nodeinfo) { /* * We should never get here, * but avoid abort with bad data structures */ error("%s: nodeinfo is NULL", __func__); nodeinfo_empty = xmalloc(sizeof(select_nodeinfo_t)); nodeinfo = nodeinfo_empty; } if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { pack16(nodeinfo->alloc_cpus, buffer); pack64(nodeinfo->alloc_memory, buffer); packstr(nodeinfo->tres_alloc_fmt_str, buffer); packdouble(nodeinfo->tres_alloc_weighted, buffer); } xfree(nodeinfo_empty); return SLURM_SUCCESS; } extern int select_p_select_nodeinfo_unpack(select_nodeinfo_t **nodeinfo, Buf buffer, uint16_t protocol_version) { uint32_t uint32_tmp; select_nodeinfo_t *nodeinfo_ptr = NULL; nodeinfo_ptr = select_p_select_nodeinfo_alloc(); *nodeinfo = nodeinfo_ptr; if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { safe_unpack16(&nodeinfo_ptr->alloc_cpus, buffer); safe_unpack64(&nodeinfo_ptr->alloc_memory, buffer); safe_unpackstr_xmalloc(&nodeinfo_ptr->tres_alloc_fmt_str, &uint32_tmp, buffer); safe_unpackdouble(&nodeinfo_ptr->tres_alloc_weighted, buffer); } return SLURM_SUCCESS; unpack_error: error("select_nodeinfo_unpack: error unpacking here"); select_p_select_nodeinfo_free(nodeinfo_ptr); *nodeinfo = NULL; return SLURM_ERROR; } extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(void) { select_nodeinfo_t *nodeinfo = xmalloc(sizeof(struct select_nodeinfo)); nodeinfo->magic = NODEINFO_MAGIC; return nodeinfo; } extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo) { if (nodeinfo) { if (nodeinfo->magic != NODEINFO_MAGIC) { error("select_p_select_nodeinfo_free: " "nodeinfo magic bad"); return EINVAL; } nodeinfo->magic = 0; xfree(nodeinfo->tres_alloc_fmt_str); xfree(nodeinfo); } return SLURM_SUCCESS; } extern int select_p_select_nodeinfo_set_all(void) { node_record_t *node_ptr = NULL; int n; static time_t last_set_all = 0; /* only set this once when the last_node_update is newer than * the last time we set things up. */ if (last_set_all && (last_node_update < last_set_all)) { debug2("Node select info for set all hasn't " "changed since %ld", (long)last_set_all); return SLURM_NO_CHANGE_IN_DATA; } last_set_all = last_node_update; for (n = 0, node_ptr = node_record_table_ptr; n < select_node_cnt; n++, node_ptr++) { select_nodeinfo_t *nodeinfo = NULL; /* We have to use the '_g_' here to make sure we get the * correct data to work on. i.e. cray calls this plugin * from within select/cray which has it's own struct. 
*/ select_g_select_nodeinfo_get(node_ptr->select_nodeinfo, SELECT_NODEDATA_PTR, 0, (void *)&nodeinfo); if (!nodeinfo) { error("no nodeinfo returned from structure"); continue; } xfree(nodeinfo->tres_alloc_fmt_str); if (IS_NODE_COMPLETING(node_ptr) || IS_NODE_ALLOCATED(node_ptr)) { nodeinfo->alloc_cpus = node_ptr->config_ptr->cpus; nodeinfo->tres_alloc_fmt_str = assoc_mgr_make_tres_str_from_array( node_ptr->tres_cnt, TRES_STR_CONVERT_UNITS, false); nodeinfo->tres_alloc_weighted = assoc_mgr_tres_weighted( node_ptr->tres_cnt, node_ptr->config_ptr->tres_weights, priority_flags, false); } else { nodeinfo->alloc_cpus = 0; nodeinfo->tres_alloc_weighted = 0.0; } if (cr_ptr && cr_ptr->nodes) { nodeinfo->alloc_memory = cr_ptr->nodes[n].alloc_memory; } else { nodeinfo->alloc_memory = 0; } } return SLURM_SUCCESS; } extern int select_p_select_nodeinfo_set(job_record_t *job_ptr) { xassert(job_ptr); slurm_mutex_lock(&cr_mutex); if (cr_ptr == NULL) _init_node_cr(); slurm_mutex_unlock(&cr_mutex); return SLURM_SUCCESS; } extern int select_p_select_nodeinfo_get(select_nodeinfo_t *nodeinfo, enum select_nodedata_type dinfo, enum node_states state, void *data) { int rc = SLURM_SUCCESS; uint16_t *uint16 = (uint16_t *) data; uint64_t *uint64 = (uint64_t *) data; char **tmp_char = (char **) data; double *tmp_double = (double *) data; select_nodeinfo_t **select_nodeinfo = (select_nodeinfo_t **) data; if (nodeinfo == NULL) { error("get_nodeinfo: nodeinfo not set"); return SLURM_ERROR; } if (nodeinfo->magic != NODEINFO_MAGIC) { error("get_nodeinfo: nodeinfo magic bad"); return SLURM_ERROR; } switch (dinfo) { case SELECT_NODEDATA_SUBCNT: if (state == NODE_STATE_ALLOCATED) *uint16 = nodeinfo->alloc_cpus; else *uint16 = 0; break; case SELECT_NODEDATA_PTR: *select_nodeinfo = nodeinfo; break; case SELECT_NODEDATA_MEM_ALLOC: *uint64 = nodeinfo->alloc_memory; break; case SELECT_NODEDATA_TRES_ALLOC_FMT_STR: *tmp_char = xstrdup(nodeinfo->tres_alloc_fmt_str); break; case SELECT_NODEDATA_TRES_ALLOC_WEIGHTED: *tmp_double = nodeinfo->tres_alloc_weighted; break; default: error("Unsupported option %d for get_nodeinfo.", dinfo); rc = SLURM_ERROR; break; } return rc; } /* * allocate storage for a select job credential * RET - storage for a select job credential * NOTE: storage must be freed using select_p_select_jobinfo_free */ extern select_jobinfo_t *select_p_select_jobinfo_alloc(void) { return NULL; } /* * fill in a previously allocated select job credential * IN/OUT jobinfo - updated select job credential * IN data_type - type of data to enter into job credential * IN data - the data to enter into job credential */ extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo, enum select_jobdata_type data_type, void *data) { return SLURM_SUCCESS; } /* * get data from a select job credential * IN jobinfo - updated select job credential * IN data_type - type of data to enter into job credential * OUT data - the data to get from job credential, caller must xfree * data for data_type == SELECT_JOBDATA_PART_ID */ extern int select_p_select_jobinfo_get (select_jobinfo_t *jobinfo, enum select_jobdata_type data_type, void *data) { return SLURM_ERROR; } /* * copy a select job credential * IN jobinfo - the select job credential to be copied * RET - the copy or NULL on failure * NOTE: returned value must be freed using select_p_select_jobinfo_free */ extern select_jobinfo_t *select_p_select_jobinfo_copy( select_jobinfo_t *jobinfo) { return NULL; } /* * free storage previously allocated for a select job credential * IN jobinfo - the 
select job credential to be freed
 * RET - slurm error code
 */
extern int select_p_select_jobinfo_free(select_jobinfo_t *jobinfo)
{
	return SLURM_SUCCESS;
}

/*
 * pack a select job credential into a buffer in machine independent form
 * IN jobinfo - the select job credential to be saved
 * OUT buffer - buffer with select credential appended
 * IN protocol_version - slurm protocol version of client
 * RET - slurm error code
 */
extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer,
					uint16_t protocol_version)
{
	return SLURM_SUCCESS;
}

/*
 * unpack a select job credential from a buffer
 * OUT jobinfo - the select job credential read
 * IN buffer - buffer with select credential read from current pointer loc
 * IN protocol_version - slurm protocol version of client
 * RET - slurm error code
 * NOTE: returned value must be freed using select_p_select_jobinfo_free
 */
extern int select_p_select_jobinfo_unpack(select_jobinfo_t **jobinfo,
					  Buf buffer,
					  uint16_t protocol_version)
{
	return SLURM_SUCCESS;
}

extern char *select_p_select_jobinfo_sprint(select_jobinfo_t *jobinfo,
					    char *buf, size_t size, int mode)
{
	if (buf && size) {
		buf[0] = '\0';
		return buf;
	} else
		return NULL;
}

extern char *select_p_select_jobinfo_xstrdup(select_jobinfo_t *jobinfo,
					     int mode)
{
	return NULL;
}

extern int select_p_get_info_from_plugin(enum select_plugindata_info dinfo,
					 job_record_t *job_ptr, void *data)
{
	return SLURM_SUCCESS;
}

extern int select_p_update_node_config(int index)
{
	return SLURM_SUCCESS;
}

extern int select_p_reconfigure(void)
{
	slurm_mutex_lock(&cr_mutex);
	_free_cr(cr_ptr);
	cr_ptr = NULL;
	_init_node_cr();
	slurm_mutex_unlock(&cr_mutex);

	return SLURM_SUCCESS;
}

extern bitstr_t *select_p_resv_test(resv_desc_msg_t *resv_desc_ptr,
				    uint32_t node_cnt,
				    bitstr_t *avail_bitmap,
				    bitstr_t **core_bitmap)
{
	bitstr_t **switches_bitmap;		/* nodes on this switch */
	int       *switches_cpu_cnt;		/* total CPUs on switch */
	int       *switches_node_cnt;		/* total nodes on switch */
	int       *switches_required;		/* set if has required node */

	bitstr_t *avail_nodes_bitmap = NULL;	/* nodes on any switch */
	int rem_nodes;			/* remaining resources desired */
	int i, j;
	int best_fit_inx, first, last;
	int best_fit_nodes;
	int best_fit_location = 0;
	bool sufficient, best_fit_sufficient;

	xassert(avail_bitmap);
	xassert(resv_desc_ptr);

	if (!switch_record_cnt || !switch_record_table)
		return bit_pick_cnt(avail_bitmap, node_cnt);

	/* Use topology state information */
	if (bit_set_count(avail_bitmap) < node_cnt)
		return avail_nodes_bitmap;

	rem_nodes = node_cnt;

	/* Construct a set of switch array entries,
	 * use the same indexes as switch_record_table in slurmctld */
	switches_bitmap   = xcalloc(switch_record_cnt, sizeof(bitstr_t *));
	switches_cpu_cnt  = xcalloc(switch_record_cnt, sizeof(int));
	switches_node_cnt = xcalloc(switch_record_cnt, sizeof(int));
	switches_required = xcalloc(switch_record_cnt, sizeof(int));

	for (i = 0; i < switch_record_cnt; i++) {
		switches_bitmap[i] = bit_copy(switch_record_table[i].
					      node_bitmap);
		bit_and(switches_bitmap[i], avail_bitmap);
		switches_node_cnt[i] = bit_set_count(switches_bitmap[i]);
	}

	/* Determine lowest level switch satisfying request with best fit */
	best_fit_inx = -1;
	for (j = 0; j < switch_record_cnt; j++) {
		if (switches_node_cnt[j] < node_cnt)
			continue;
		if ((best_fit_inx == -1) ||
		    (switch_record_table[j].level <
		     switch_record_table[best_fit_inx].level) ||
		    ((switch_record_table[j].level ==
		      switch_record_table[best_fit_inx].level) &&
		     (switches_node_cnt[j] < switches_node_cnt[best_fit_inx])))
			best_fit_inx = j;
	}
	if (best_fit_inx == -1)	/* no switch can hold this request */
		goto fini;

	/* Identify usable leafs (within higher switch having best fit) */
	for (j = 0; j < switch_record_cnt; j++) {
		if ((switch_record_table[j].level != 0) ||
		    (!bit_super_set(switches_bitmap[j],
				    switches_bitmap[best_fit_inx]))) {
			switches_node_cnt[j] = 0;
		}
	}

	/* Select resources from these leafs on a best-fit basis */
	avail_nodes_bitmap = bit_alloc(node_record_count);
	while (rem_nodes > 0) {
		best_fit_nodes = best_fit_sufficient = 0;
		for (j = 0; j < switch_record_cnt; j++) {
			if (switches_node_cnt[j] == 0)
				continue;
			sufficient = (switches_node_cnt[j] >= rem_nodes);
			/* If first possibility OR */
			/* first set large enough for request OR */
			/* tightest fit (less resource waste) OR */
			/* nothing yet large enough, but this is biggest */
			if ((best_fit_nodes == 0) ||
			    (sufficient && (best_fit_sufficient == 0)) ||
			    (sufficient &&
			     (switches_node_cnt[j] < best_fit_nodes)) ||
			    ((sufficient == 0) &&
			     (switches_node_cnt[j] > best_fit_nodes))) {
				best_fit_nodes = switches_node_cnt[j];
				best_fit_location = j;
				best_fit_sufficient = sufficient;
			}
		}
		if (best_fit_nodes == 0)
			break;
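		/*
		 * Illustrative sketch of the selection rule above
		 * (hypothetical counts, not from the original source),
		 * for rem_nodes = 3:
		 *
		 *	leaf	nodes	sufficient	outcome
		 *	s0	2	no		kept as biggest so far
		 *	s1	5	yes		replaces s0 (sufficient)
		 *	s2	4	yes		replaces s1 (tighter fit)
		 *
		 * A sufficient leaf always beats an insufficient one, the
		 * smallest sufficient leaf wins ties, and if no leaf is
		 * sufficient the largest one is drained first.
		 */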
		/* Use select nodes from this leaf */
		first = bit_ffs(switches_bitmap[best_fit_location]);
		last  = bit_fls(switches_bitmap[best_fit_location]);
		for (i = first; ((i <= last) && (first >= 0)); i++) {
			if (!bit_test(switches_bitmap[best_fit_location], i))
				continue;
			bit_clear(switches_bitmap[best_fit_location], i);
			switches_node_cnt[best_fit_location]--;
			if (bit_test(avail_nodes_bitmap, i)) {
				/* node on multiple leaf switches
				 * and already selected */
				continue;
			}
			bit_set(avail_nodes_bitmap, i);
			if (--rem_nodes <= 0)
				break;
		}
		switches_node_cnt[best_fit_location] = 0;
	}
	if (rem_nodes > 0)	/* insufficient resources */
		FREE_NULL_BITMAP(avail_nodes_bitmap);

fini:	for (i = 0; i < switch_record_cnt; i++)
		FREE_NULL_BITMAP(switches_bitmap[i]);
	xfree(switches_bitmap);
	xfree(switches_cpu_cnt);
	xfree(switches_node_cnt);
	xfree(switches_required);

	return avail_nodes_bitmap;
}
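/*
 * End-of-file note (illustrative, not from the original source): when no
 * topology information is configured, select_p_resv_test() above falls back
 * to bit_pick_cnt(avail_bitmap, node_cnt).  Assuming bit_pick_cnt() returns
 * a new bitmap containing the first node_cnt set bits of its input, a
 * reservation for 2 nodes with avail_bitmap = {0,2,3,5} would simply get
 * nodes {0,2}, with no attempt to minimize leaf-switch usage.
 */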