/*****************************************************************************\
* cookies.c - Library for managing a switch on a Cray system.
*****************************************************************************
* Copyright (C) 2014 SchedMD LLC
* Copyright 2014 Cray Inc. All Rights Reserved.
* Written by David Gloe
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "switch_cray_aries.h"
#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
#include <stdlib.h>
#include "src/common/xstring.h"
#include "src/common/read_config.h"
/*
 * Cookie lease duration, in seconds (default: one week).
 * Parenthesized so the macro expands safely inside larger expressions
 * (e.g. "x / COOKIE_LEASE_TIME").
 */
#define COOKIE_LEASE_TIME (60 * 60 * 24 * 7)
/*
 * Time between lease renewals, in seconds (two hours).
 * Parenthesized for the same reason as COOKIE_LEASE_TIME.
 */
#define COOKIE_LEASE_INTERVAL (60 * 60 * 2)
/* Owner string passed when leasing cookies; may be overridden at build time */
#ifndef COOKIE_OWNER
#define COOKIE_OWNER "SLURM"
#endif
/* Number of cookies requested for each lease */
#define NUM_COOKIES 2
/* Guards cookie_id_list, cookie_id_list_size and cookie_id_list_capacity */
static pthread_mutex_t cookie_id_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Array of the cookie ids being tracked for lease renewal */
static int32_t *cookie_id_list = NULL;
/* Number of ids currently stored in cookie_id_list */
static int32_t cookie_id_list_size = 0;
/* Number of ids cookie_id_list has room for */
static size_t cookie_id_list_capacity = 0;
/* Loop condition of the lease extender thread; cleared to make it stop */
static bool lease_extender_running = false;
/* Forward declarations of file-local helpers */
static void *_lease_extender(void *args);
static void _add_cookie(int32_t cookie_id);
static void _remove_cookie(int32_t cookie_id);
/*
 * Launch the thread that periodically renews cookie leases.
 *
 * The thread is only created when running inside slurmctld; elsewhere
 * this call does nothing.
 *
 * RET SLURM_SUCCESS in all cases.
 */
extern int start_lease_extender(void)
{
	if (running_in_slurmctld()) {
		/* Thread entry point is _lease_extender(), called with NULL */
		slurm_thread_create_detached(NULL, _lease_extender, NULL);
	}

	return SLURM_SUCCESS;
}
/*
 * Stop the lease extender and discard the tracked cookie list.
 *
 * Outside slurmctld this call does nothing.
 *
 * RET SLURM_SUCCESS in all cases.
 */
extern int cleanup_lease_extender(void)
{
	if (running_in_slurmctld()) {
		/* The extender loop exits the next time it checks this flag */
		lease_extender_running = false;

		/* Free the id array and reset its bookkeeping under the lock */
		slurm_mutex_lock(&cookie_id_mutex);
		xfree(cookie_id_list);
		cookie_id_list_capacity = 0;
		cookie_id_list_size = 0;
		slurm_mutex_unlock(&cookie_id_mutex);
	}

	return SLURM_SUCCESS;
}
/*
 * Lease cookies for this job, filling in the information in *job.
 * Leased cookies will periodically have their lease extended.
 *
 * IN/OUT job - on success num_cookies, cookie_ids and cookies are set;
 *              cookie_ids, cookies and each cookies[i] are allocated
 *              with xmalloc()/xstrdup() and must be released with xfree()
 * IN nodes - node list handed to alpsc_lease_cookies()
 * IN num_nodes - number of entries in nodes
 * RET SLURM_SUCCESS on success or when not running in slurmctld (in which
 *     case *job is left untouched), SLURM_ERROR if alpsc_lease_cookies()
 *     returns non-zero.
 */
extern int lease_cookies(slurm_cray_jobinfo_t *job, int32_t *nodes,
			 int32_t num_nodes)
{
	/*
	 * NOTE(review): rc and err_msg are presumably consumed by name by the
	 * ALPSC_SN_DEBUG() macro (defined elsewhere) - do not rename them.
	 */
	int rc;
	uint32_t i;
	char *err_msg = NULL;
	int32_t *cookie_ids = NULL;
	char **cookies = NULL;

	if (!running_in_slurmctld())
		return SLURM_SUCCESS;

	/*
	 * Lease some cookies
	 *
	 * TODO: I could ensure that the nodes list was sorted either by doing
	 * some research to see if it comes in sorted or calling a sort
	 * routine.
	 */
	rc = alpsc_lease_cookies(&err_msg, COOKIE_OWNER, job->apid,
				 COOKIE_LEASE_TIME, nodes,
				 num_nodes, NUM_COOKIES,
				 &cookies, &cookie_ids);
	ALPSC_SN_DEBUG("alpsc_lease_cookies");
	if (rc != 0) {
		return SLURM_ERROR;
	}

	/*
	 * xmalloc the space for the cookies and cookie_ids, so it can be freed
	 * with xfree later, which is consistent with Slurm practices and how
	 * the rest of the structure will be freed.
	 * We must free() the ALPS Common library allocated memory using free(),
	 * not xfree().
	 *
	 * Allocation sizes are derived from the destination element type
	 * (sizeof(*ptr)) so they cannot drift from the field declarations.
	 */
	job->num_cookies = NUM_COOKIES;
	job->cookie_ids = xmalloc(sizeof(*job->cookie_ids) * NUM_COOKIES);
	memcpy(job->cookie_ids, cookie_ids,
	       sizeof(*job->cookie_ids) * NUM_COOKIES);
	free(cookie_ids);

	/* Each element is a char *, so size by the element, not by char ** */
	job->cookies = xmalloc(sizeof(*job->cookies) * NUM_COOKIES);
	for (i = 0; i < NUM_COOKIES; i++) {
		job->cookies[i] = xstrdup(cookies[i]);
		free(cookies[i]);
	}
	free(cookies);

	// Add them to the list so their leases get extended
	for (i = 0; i < job->num_cookies; i++) {
		_add_cookie(job->cookie_ids[i]);
	}

	return SLURM_SUCCESS;
}
/*
 * Start tracking cookies that were leased earlier, so their leases are
 * extended periodically as well. Useful after a slurmctld restart, to pick
 * up cookies leased before it was shut down.
 *
 * IN job - job info whose num_cookies/cookie_ids are added to the list
 * RET SLURM_SUCCESS in all cases (no-op outside slurmctld).
 */
extern int track_cookies(slurm_cray_jobinfo_t *job)
{
	uint32_t idx = 0;

	if (!running_in_slurmctld())
		return SLURM_SUCCESS;

	/* Register every cookie id of this job */
	while (idx < job->num_cookies) {
		_add_cookie(job->cookie_ids[idx]);
		idx++;
	}

	return SLURM_SUCCESS;
}
/*
 * Stop tracking this job's cookies and release their leases.
 *
 * IN job - job info whose cookie_ids/num_cookies are released
 * RET SLURM_SUCCESS on success or outside slurmctld, SLURM_ERROR if
 *     alpsc_release_cookies() returns non-zero.
 */
extern int release_cookies(slurm_cray_jobinfo_t *job)
{
	/*
	 * NOTE(review): rc and err_msg are presumably consumed by name by the
	 * ALPSC_SN_DEBUG() macro (defined elsewhere) - do not rename them.
	 */
	int rc;
	char *err_msg = NULL;
	uint32_t idx = 0;

	if (!running_in_slurmctld())
		return SLURM_SUCCESS;

	/* Drop the ids from the tracked list first */
	while (idx < job->num_cookies) {
		_remove_cookie(job->cookie_ids[idx]);
		idx++;
	}

	/* Then hand the cookies back */
	rc = alpsc_release_cookies(&err_msg, (int32_t *) job->cookie_ids,
				   (int32_t) job->num_cookies);
	ALPSC_SN_DEBUG("alpsc_release_cookies");

	return (rc != 0) ? SLURM_ERROR : SLURM_SUCCESS;
}
/*
 * Append a cookie id to the tracked cookie list.
 *
 * An id that is already present is not added again; an informational
 * message is logged instead. The backing array starts at 2048 entries and
 * doubles whenever it is full.
 *
 * IN cookie_id - id to track
 */
static void _add_cookie(int32_t cookie_id)
{
	int32_t idx;
	bool duplicate = false;

	slurm_mutex_lock(&cookie_id_mutex);

	/* Scan the current entries for this id */
	for (idx = 0; idx < cookie_id_list_size; idx++) {
		if (cookie_id_list[idx] == cookie_id) {
			duplicate = true;
			break;
		}
	}

	if (duplicate) {
		/* Release the lock before logging */
		slurm_mutex_unlock(&cookie_id_mutex);
		CRAY_INFO("Duplicate cookie %"PRId32" found in tracked"
			  " cookie list", cookie_id);
		return;
	}

	/* Grow the array when one more entry would not fit */
	if (cookie_id_list_size + 1 > cookie_id_list_capacity) {
		cookie_id_list_capacity = (cookie_id_list_capacity == 0) ?
			2048 : cookie_id_list_capacity * 2;
		cookie_id_list = xrealloc(cookie_id_list,
					  (cookie_id_list_capacity
					   * sizeof(int32_t)));
	}

	/* Store the id in the first free slot */
	cookie_id_list[cookie_id_list_size++] = cookie_id;

	slurm_mutex_unlock(&cookie_id_mutex);
}
/*
 * Delete a cookie id from the tracked cookie list.
 *
 * The first matching entry is overwritten with the last entry and the list
 * shrinks by one, so the order of the remaining ids is not preserved.
 * A missing id is only reported at debug level.
 *
 * IN cookie_id - id to stop tracking
 */
static void _remove_cookie(int32_t cookie_id)
{
	int32_t pos;
	int32_t last;

	slurm_mutex_lock(&cookie_id_mutex);

	/* Locate the first entry holding this id */
	for (pos = 0; pos < cookie_id_list_size; pos++) {
		if (cookie_id_list[pos] == cookie_id)
			break;
	}

	if (pos < cookie_id_list_size) {
		/* Fill the hole with the final entry, then drop that entry */
		last = cookie_id_list_size - 1;
		if (pos < last)
			cookie_id_list[pos] = cookie_id_list[last];
		cookie_id_list_size = last;
	} else {
		/*
		 * For a hetstep we release the same cookies multiple times, so
		 * they will not exist after the first time they are released.
		 */
		CRAY_DEBUG("Cookie %"PRId32" not found in tracked cookie list",
			   cookie_id);
	}

	slurm_mutex_unlock(&cookie_id_mutex);
}
/*
 * Thread body: every COOKIE_LEASE_INTERVAL seconds, reset the lease of all
 * tracked cookies to COOKIE_LEASE_TIME. Runs until lease_extender_running
 * is cleared.
 *
 * IN args - unused
 * RET NULL
 */
static void *_lease_extender(void *args)
{
	/*
	 * NOTE(review): rc and err_msg are presumably consumed by name by the
	 * ALPSC_SN_DEBUG() macro (defined elsewhere) - do not rename them.
	 */
	char *err_msg = NULL;
	int rc;

	CRAY_INFO("Leasing cookies for %ds, renewing every %ds",
		  COOKIE_LEASE_TIME, COOKIE_LEASE_INTERVAL);

	for (lease_extender_running = true; lease_extender_running; ) {
		/* The list must not change while it is handed to ALPS */
		slurm_mutex_lock(&cookie_id_mutex);

		if (cookie_id_list_size > 0) {
			CRAY_INFO("Extending leases for %"PRId32" cookies",
				  cookie_id_list_size);
			rc = alpsc_set_cookie_lease(&err_msg, cookie_id_list,
						    cookie_id_list_size,
						    COOKIE_LEASE_TIME);
			/* Failures are only reported, not acted upon */
			ALPSC_SN_DEBUG("alpsc_set_cookie_lease");
		}

		slurm_mutex_unlock(&cookie_id_mutex);

		/* Idle until the next renewal is due */
		sleep(COOKIE_LEASE_INTERVAL);
	}

	return NULL;
}
#endif