/*****************************************************************************\
 *  cookies.c - Library for managing a switch on a Cray system.
 *****************************************************************************
 *  Copyright (C) 2014 SchedMD LLC
 *  Copyright 2014 Cray Inc. All Rights Reserved.
 *  Written by David Gloe
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/ #include "switch_cray_aries.h" #if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK) #include #include "src/common/xstring.h" #include "src/common/read_config.h" // Default lease time 1 week #define COOKIE_LEASE_TIME 60*60*24*7 // Extend lease every 2 hours #define COOKIE_LEASE_INTERVAL 60*60*2 // Cookie owner #ifndef COOKIE_OWNER #define COOKIE_OWNER "SLURM" #endif // Number of cookies to request #define NUM_COOKIES 2 // List of cookie ids currently in use static int32_t *cookie_id_list = NULL; // Size of the cookie id list static int32_t cookie_id_list_size = 0; // Capacity of the cookie id list static size_t cookie_id_list_capacity = 0; // Mutex for the cookie id list static pthread_mutex_t cookie_id_mutex = PTHREAD_MUTEX_INITIALIZER; // If we are running the lease_extender static bool lease_extender_running = false; // Static function declarations static void _add_cookie(int32_t cookie_id); static void _remove_cookie(int32_t cookie_id); static void *_lease_extender(void *args); /* * Start the thread to extend cookie leases. */ extern int start_lease_extender(void) { // Start lease extender in the slurmctld if (!running_in_slurmctld()) return SLURM_SUCCESS; slurm_thread_create_detached(NULL, _lease_extender, NULL); return SLURM_SUCCESS; } /* * cleanup the lease_extender */ extern int cleanup_lease_extender(void) { // Cleanup lease extender in the slurmctld if (!running_in_slurmctld()) return SLURM_SUCCESS; lease_extender_running = false; slurm_mutex_lock(&cookie_id_mutex); xfree(cookie_id_list); cookie_id_list_size = 0; cookie_id_list_capacity = 0; slurm_mutex_unlock(&cookie_id_mutex); return SLURM_SUCCESS; } /* * Lease cookies for this job, filling in the information in *job. * Leased cookies will periodically have their lease extended. 
*/ extern int lease_cookies(slurm_cray_jobinfo_t *job, int32_t *nodes, int32_t num_nodes) { int rc; uint32_t i; char *err_msg = NULL; int32_t *cookie_ids = NULL; char **cookies = NULL; if (!running_in_slurmctld()) return SLURM_SUCCESS; /* * Lease some cookies * * TODO: I could ensure that the nodes list was sorted either by doing * some research to see if it comes in sorted or calling a sort * routine. */ rc = alpsc_lease_cookies(&err_msg, COOKIE_OWNER, job->apid, COOKIE_LEASE_TIME, nodes, num_nodes, NUM_COOKIES, &cookies, &cookie_ids); ALPSC_SN_DEBUG("alpsc_lease_cookies"); if (rc != 0) { return SLURM_ERROR; } /* * xmalloc the space for the cookies and cookie_ids, so it can be freed * with xfree later, which is consistent with Slurm practices and how * the rest of the structure will be freed. * We must free() the ALPS Common library allocated memory using free(), * not xfree(). */ job->num_cookies = NUM_COOKIES; job->cookie_ids = (uint32_t *) xmalloc(sizeof(uint32_t) * NUM_COOKIES); memcpy(job->cookie_ids, cookie_ids, sizeof(uint32_t) * NUM_COOKIES); free(cookie_ids); job->cookies = (char **) xmalloc(sizeof(char **) * NUM_COOKIES); for (i = 0; i < NUM_COOKIES; i++) { job->cookies[i] = xstrdup(cookies[i]); free(cookies[i]); } free(cookies); // Add them to the list for (i = 0; i < job->num_cookies; i++) { _add_cookie(job->cookie_ids[i]); } return SLURM_SUCCESS; } /* * Track cookies which have already been leased. These cookies will also * have their lease extended periodically. Useful for when slurmctld is * restarted, to track cookies leased before it was shut down. */ extern int track_cookies(slurm_cray_jobinfo_t *job) { uint32_t i; if (!running_in_slurmctld()) return SLURM_SUCCESS; // Add cookies to the list for (i = 0; i < job->num_cookies; i++) { _add_cookie(job->cookie_ids[i]); } return SLURM_SUCCESS; } /* * Release cookies which have been leased. 
*/ extern int release_cookies(slurm_cray_jobinfo_t *job) { uint32_t i; int rc; char *err_msg = NULL; if (!running_in_slurmctld()) return SLURM_SUCCESS; // Remove cookies from the list for (i = 0; i < job->num_cookies; i++) { _remove_cookie(job->cookie_ids[i]); } // Release them rc = alpsc_release_cookies(&err_msg, (int32_t *) job->cookie_ids, (int32_t) job->num_cookies); ALPSC_SN_DEBUG("alpsc_release_cookies"); if (rc != 0) { return SLURM_ERROR; } return SLURM_SUCCESS; } /* * Add a cookie to the tracked cookie list */ static void _add_cookie(int32_t cookie_id) { int32_t i; // Lock the mutex slurm_mutex_lock(&cookie_id_mutex); // If the cookie is already in the list, skip for (i = 0; i < cookie_id_list_size; i++) { if (cookie_id_list[i] == cookie_id) { slurm_mutex_unlock(&cookie_id_mutex); CRAY_INFO("Duplicate cookie %"PRId32" found in tracked" " cookie list", cookie_id); return; } } // Extend id list if necessary if (cookie_id_list_size + 1 > cookie_id_list_capacity) { if (cookie_id_list_capacity == 0) { cookie_id_list_capacity = 2048; } else { cookie_id_list_capacity *= 2; } cookie_id_list = xrealloc(cookie_id_list, (cookie_id_list_capacity * sizeof(int32_t))); } // Set value cookie_id_list[cookie_id_list_size] = cookie_id; cookie_id_list_size++; // Unlock the mutex slurm_mutex_unlock(&cookie_id_mutex); } /* * Remove a cookie from the tracked cookie list */ static void _remove_cookie(int32_t cookie_id) { int32_t i; int found = 0; // Lock the mutex slurm_mutex_lock(&cookie_id_mutex); // Find a match in the list for (i = 0; i < cookie_id_list_size; i++) { if (cookie_id_list[i] == cookie_id) { // Copy the last id to this spot if (i < cookie_id_list_size - 1) { cookie_id_list[i] = cookie_id_list[cookie_id_list_size - 1]; } found = 1; cookie_id_list_size--; break; } } if (!found) { /* * For a hetstep we release the same cookies multiple times, so * they will not exist after the first time they are released. 
*/ CRAY_DEBUG("Cookie %"PRId32" not found in tracked cookie list", cookie_id); } // Unlock the mutex slurm_mutex_unlock(&cookie_id_mutex); } static void *_lease_extender(void *args) { int rc; char *err_msg = NULL; CRAY_INFO("Leasing cookies for %ds, renewing every %ds", COOKIE_LEASE_TIME, COOKIE_LEASE_INTERVAL); lease_extender_running = true; while (lease_extender_running) { // Lock the mutex slurm_mutex_lock(&cookie_id_mutex); // If there are cookies, extend their leases if (cookie_id_list_size > 0) { // Extend the cookie leases CRAY_INFO("Extending leases for %"PRId32" cookies", cookie_id_list_size); rc = alpsc_set_cookie_lease(&err_msg, cookie_id_list, cookie_id_list_size, COOKIE_LEASE_TIME); ALPSC_SN_DEBUG("alpsc_set_cookie_lease"); // Just ignore errors, not much we can do about them } // Unlock the mutex slurm_mutex_unlock(&cookie_id_mutex); // Wait until we want to extend leases again sleep(COOKIE_LEASE_INTERVAL); } return NULL; } #endif