/*****************************************************************************\
 *  dist_tasks.c - Assign task count for each resource.
 *****************************************************************************
 *  Copyright (C) 2018 SchedMD LLC
 *  Derived in large part from select/cons_res plugin
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#include "select_cons_tres.h"
#include "../cons_common/dist_tasks.h"

/* At tasks_per_node limit for given node */
static bool _at_tpn_limit(const uint32_t n, const job_record_t *job_ptr,
			  const char *tag, bool log_error)
{
	const job_resources_t *job_res = job_ptr->job_resrcs;
	const log_level_t log_lvl = log_error ? LOG_LEVEL_ERROR :
						LOG_LEVEL_INFO;

	if (job_ptr->details->ntasks_per_node == 0)
		return false;

	if (job_res->tasks_per_node[n] < job_ptr->details->ntasks_per_node)
		return false;

	if (log_error || (select_debug_flags & DEBUG_FLAG_SELECT_TYPE))
		log_var(log_lvl,
			"%s: %s over tasks_per_node for %pJ node:%u task_per_node:%d max:%" PRIu16,
			__func__, tag, job_ptr, n, job_res->tasks_per_node[n],
			job_ptr->details->ntasks_per_node);

	return true;
}

/*
 * dist_tasks_compute_c_b - compute the number of tasks on each
 * of the node for the cyclic and block distribution. We need to do
 * this in the case of consumable resources so that we have an exact
 * count for the needed hardware resources which will be used later to
 * update the different used resources per node structures.
 *
 * The most common case is when we have more resources than needed. In
 * that case we just "take" what we need and "release" the remaining
 * resources for other jobs. In the case where we oversubscribe the
 * processing units (PUs) we keep the initial set of resources.
 *
 * IN/OUT job_ptr - pointer to job being scheduled. The per-node
 *                  job_res->cpus array is recomputed here.
 * IN gres_task_limit - array of task limits based upon job's GRES specification
 *			offset based upon bits set in
 *			job_ptr->job_resrcs->node_bitmap
 */
extern int dist_tasks_compute_c_b(job_record_t *job_ptr,
				  uint32_t *gres_task_limit)
{
	bool over_subscribe = false;
	uint32_t n, tid, t, maxtasks, l;
	uint16_t *avail_cpus;
	job_resources_t *job_res = job_ptr->job_resrcs;
	bool log_over_subscribe = true;
	char *err_msg = NULL;
	uint16_t *vpus;
	bool space_remaining = false;
	bool test_tres_tasks = true;
	int i, i_first, i_last, rem_cpus, rem_tasks;

	if (!job_res)
		err_msg = "job_res is NULL";
	else if (!job_res->cpus)
		err_msg = "job_res->cpus is NULL";
	else if (!job_res->nhosts)
		err_msg = "job_res->nhosts is zero";
	if (err_msg) {
		error("%s: %s: Invalid allocation for %pJ: %s",
		      plugin_type, __func__, job_ptr, err_msg);
		return SLURM_ERROR;
	}

	vpus = xmalloc(job_res->nhosts * sizeof(uint16_t));
	i_first = bit_ffs(job_res->node_bitmap);
	if (i_first >= 0)
		i_last  = bit_fls(job_res->node_bitmap);
	else
		i_last = -2;
	for (i = i_first, n = 0; i <= i_last; i++) {
		if (!bit_test(job_res->node_bitmap, i))
			continue;
		vpus[n++] = select_node_record[i].vpus;
	}

	maxtasks = job_res->ncpus;
	avail_cpus = job_res->cpus;
	job_res->cpus = xmalloc(job_res->nhosts * sizeof(uint16_t));
	job_res->tasks_per_node = xmalloc(job_res->nhosts * sizeof(uint16_t));

	/* ncpus is already set the number of tasks if overcommit is used */
	if (!job_ptr->details->overcommit &&
	    (job_ptr->details->cpus_per_task > 1)) {
		if (job_ptr->details->ntasks_per_node == 0) {
			maxtasks = maxtasks / job_ptr->details->cpus_per_task;
		} else {
			maxtasks = job_ptr->details->ntasks_per_node *
				   job_res->nhosts;
		}
	}

	/*
	 * Safe guard if the user didn't specified a lower number of
	 * CPUs than cpus_per_task or didn't specify the number.
	 */
	if (!maxtasks) {
		error("%s: %s: changing task count from 0 to 1 for %pJ",
		      plugin_type, __func__, job_ptr);
		maxtasks = 1;
	}
	if (job_ptr->details->cpus_per_task == 0)
		job_ptr->details->cpus_per_task = 1;
	if (job_ptr->details->overcommit)
		log_over_subscribe = false;
	/* Start by allocating one task per node */
	space_remaining = false;
	tid = 0;
	for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
		if (avail_cpus[n] || over_subscribe) {
			/* Ignore gres_task_limit for first task per node */
			tid++;
			job_res->tasks_per_node[n]++;
			for (l = 0; l < job_ptr->details->cpus_per_task; l++) {
				if (job_res->cpus[n] < avail_cpus[n])
					job_res->cpus[n]++;
			}
			if (job_res->cpus[n] < avail_cpus[n])
				space_remaining = true;
		}
	}
	if (!space_remaining)
		over_subscribe = true;

	/* Next fill out the CPUs on the cores already allocated to this job */
	for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
		rem_cpus = job_res->cpus[n] % vpus[n];
		rem_tasks = rem_cpus / job_ptr->details->cpus_per_task;
		if (rem_tasks == 0)
			continue;
		for (t = 0; ((t < rem_tasks) && (tid < maxtasks)); t++) {
			if (!over_subscribe &&
			    ((avail_cpus[n] - job_res->cpus[n]) <
			     job_ptr->details->cpus_per_task))
				break;
			if (!over_subscribe &&
			    !dist_tasks_tres_tasks_avail(
				    gres_task_limit, job_res, n))
				break;
			if (!over_subscribe &&
			    _at_tpn_limit(n, job_ptr, "fill allocated", false))
				break;
			tid++;
			job_res->tasks_per_node[n]++;
			for (l = 0; l < job_ptr->details->cpus_per_task; l++) {
				if (job_res->cpus[n] < avail_cpus[n])
					job_res->cpus[n]++;
			}
		}
	}

	/*
	 * Next distribute additional tasks, packing the cores or sockets as
	 * appropriate to avoid allocating more CPUs than needed. For example,
	 * with core allocations and 2 processors per core, we don't want to
	 * partially populate some cores on some nodes and allocate extra
	 * cores on other nodes. So "srun -n20 hostname" should not launch 7
	 * tasks on node 0, 7 tasks on node 1, and 6 tasks on node 2.  It should
	 * launch 8 tasks on node, 8 tasks on node 1, and 4 tasks on node 2.
	 */
	if (job_ptr->details->overcommit && (job_ptr->tres_per_task == 0))
		maxtasks = 0;	/* Allocate have one_task_per_node */
	for (i = 0; tid < maxtasks; i++) {
		bool space_remaining = false;
		if (over_subscribe && log_over_subscribe) {
			/*
			 * 'over_subscribe' is a relief valve that guards
			 * against an infinite loop, and it *should* never
			 * come into play because maxtasks should never be
			 * greater than the total number of available CPUs
			 */
			error("%s: %s: oversubscribe for %pJ",
			      plugin_type, __func__, job_ptr);
			log_over_subscribe = false;	/* Log once per job */;
		}
		for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
			rem_tasks = vpus[n] / job_ptr->details->cpus_per_task;
			rem_tasks = MAX(rem_tasks, 1);
			for (t = 0; ((t < rem_tasks) && (tid < maxtasks)); t++){
				if (!over_subscribe) {
					if ((avail_cpus[n] - job_res->cpus[n]) <
					    job_ptr->details->cpus_per_task)
						break;
					if (!dist_tasks_tres_tasks_avail(
						    gres_task_limit,
						    job_res, n))
						break;
					if (_at_tpn_limit(n, job_ptr,
							  "fill additional",
							  false))
						break;
				}

				tid++;
				job_res->tasks_per_node[n]++;
				for (l = 0; l < job_ptr->details->cpus_per_task;
				     l++) {
					if (job_res->cpus[n] < avail_cpus[n])
						job_res->cpus[n]++;
				}
				if ((avail_cpus[n] - job_res->cpus[n]) >=
				    job_ptr->details->cpus_per_task)
					space_remaining = true;
			}
		}
		if (!space_remaining)
			over_subscribe = true;
	}
	xfree(avail_cpus);
	xfree(vpus);

	if (job_ptr->details->overcommit && job_ptr->tres_per_task)
		maxtasks = job_ptr->details->num_tasks;
	/*
	 * Distribute any remaining tasks (without dedicated CPUs) evenly
	 * across nodes
	 */
	while (tid < maxtasks) {
		bool more_tres_tasks = false;
		for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
			if (test_tres_tasks) {
				if (!dist_tasks_tres_tasks_avail(
					    gres_task_limit, job_res, n))
					continue;
				if (_at_tpn_limit(n, job_ptr,
						  "fill non-dedicated CPUs",
						  true))
					continue;
			}

			more_tres_tasks = true;
			tid++;
			job_res->tasks_per_node[n]++;
		}
		if (!more_tres_tasks) {
			if (!test_tres_tasks) {
				error("%s: failed to find additional placement for task %u for %pJ",
				      __func__, tid, job_ptr);
				return SLURM_ERROR;
			} else
				test_tres_tasks = false;
		}
	}

	return SLURM_SUCCESS;
}