/* * To compile: * cc gpu_sensor.c -lnvidia-ml */ /*- * Copyright (c) 2011 Open Grid Scheduler. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the Open Grid Scheduler * project and its contributors. * 4. Neither the name of Open Grid Scheduler nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #define SUCCESS 0 #define MATCH 0 #define FAILURE 1 #define LINE_LEN 256 #define MAXDEVNAME 50 typedef unsigned long long ecc_err_t; static void usage(void) { fprintf(stderr, "usage\n"); exit(0); } static int init_gpuapi(void) { nvmlReturn_t ret = nvmlInit(); if (ret == NVML_SUCCESS) return SUCCESS; else return FAILURE; } static int shutdown_gpuapi(void) { nvmlReturn_t ret = nvmlShutdown(); if (ret == NVML_SUCCESS) return SUCCESS; else return FAILURE; } static int gpu_maintenance(nvmlDevice_t device, char hostname[], int i) { /* future expansion */ return 0; } static int lasthourECC(nvmlDevice_t device, char hostname[], int i) { time_t time_now; struct tm *now; static int hour = -1; static ecc_err_t sb_ecc_err_prev_hr, db_ecc_err_prev_hr; time_now = time(NULL); now = localtime(&time_now); if (now->tm_hour == hour) { printf("%s:gpu.%d.prevhrsbiteccerror:%llu\n", hostname, i, sb_ecc_err_prev_hr); printf("%s:gpu.%d.prevhrdbiteccerror:%llu\n", hostname, i, db_ecc_err_prev_hr); return SUCCESS; } else { nvmlEnableState_t current, pending; if (hour != -1) { if (nvmlDeviceGetEccMode(device, ¤t, &pending) == NVML_SUCCESS) { if (current == NVML_FEATURE_ENABLED) { ecc_err_t eccCounts; if (nvmlDeviceGetTotalEccErrors(device, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccCounts) == NVML_SUCCESS) { sb_ecc_err_prev_hr = eccCounts; } if (nvmlDeviceGetTotalEccErrors(device, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccCounts) == NVML_SUCCESS) { db_ecc_err_prev_hr = eccCounts; } } } printf("%s:gpu.%d.prevhrsbiteccerror:%llu\n", hostname, i, sb_ecc_err_prev_hr); printf("%s:gpu.%d.prevhrdbiteccerror:%llu\n", hostname, i, db_ecc_err_prev_hr); if (nvmlDeviceClearEccErrorCounts(device, NVML_VOLATILE_ECC) == NVML_SUCCESS) { hour = now->tm_hour; return SUCCESS; } else { return FAILURE; } } else { if (nvmlDeviceGetEccMode(device, ¤t, &pending) == NVML_SUCCESS) { if (current == NVML_FEATURE_ENABLED) { if (nvmlDeviceClearEccErrorCounts(device, NVML_VOLATILE_ECC) == NVML_SUCCESS) { hour = now->tm_hour; return SUCCESS; } } } return FAILURE; } } } static int listdevices(char hostname[]) { nvmlReturn_t ret; unsigned int count = 0, i; ret = nvmlDeviceGetCount(&count); if (ret != NVML_SUCCESS) { fprintf(stderr, "nvmlDeviceGetCount err: %d\n", ret); return FAILURE; } for (i=0; i < count; i++) { nvmlDevice_t device; ret = nvmlDeviceGetHandleByIndex(i, &device); if (ret != NVML_SUCCESS) { fprintf(stderr, "nvmlDeviceGetHandleByIndex err: %d\n", ret); continue; } { /* house keeping stuff */ gpu_maintenance(device, hostname, i); } { char name[MAXDEVNAME]; ret = nvmlDeviceGetName(device, name, sizeof(name)); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.name:%s\n", hostname, i, name); } } { nvmlPciInfo_t pci; ret = nvmlDeviceGetPciInfo(device, &pci); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.busId:%s\n", hostname, i, pci.busId); } } { unsigned int speed; ret = nvmlDeviceGetFanSpeed(device, &speed); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.fanspeed:%u\n", hostname, i, speed); } else if (ret == NVML_ERROR_NOT_SUPPORTED) { printf("%s:gpu.%d.fanspeed:0\n", hostname, i); } } { unsigned int clock; ret = nvmlDeviceGetClockInfo(device, NVML_CLOCK_GRAPHICS, &clock); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.clockspeed:%u\n", hostname, i, clock); } } { nvmlMemory_t memory; ret = nvmlDeviceGetMemoryInfo(device, &memory); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.memfree:%llu\n", hostname, i, memory.free); printf("%s:gpu.%d.memused:%llu\n", hostname, i, memory.used); printf("%s:gpu.%d.memtotal:%llu\n", hostname, i, memory.total); } } { nvmlUtilization_t utilization; ret = nvmlDeviceGetUtilizationRates(device, &utilization); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.utilgpu:%u\n", hostname, i, utilization.gpu); printf("%s:gpu.%d.utilmem:%u\n", hostname, i, utilization.memory); } } { /* house keeping stuff */ lasthourECC(device, hostname, i); } { nvmlEnableState_t current, pending; ret = nvmlDeviceGetEccMode(device, ¤t, &pending); if (ret == NVML_SUCCESS) { if (current == NVML_FEATURE_ENABLED) { ecc_err_t eccCounts; ret = nvmlDeviceGetTotalEccErrors(device, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccCounts); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.sbiteccerror:%llu\n", hostname, i, eccCounts); } ret = nvmlDeviceGetTotalEccErrors(device, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccCounts); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.dbiteccerror:%llu\n", hostname, i, eccCounts); } } } } { unsigned int temp = 0; ret = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temp); if (ret == NVML_SUCCESS) { printf("%s:gpu.%d.temperature:%u\n", hostname, i, temp); } } } return SUCCESS; } static int init_hostname(char hostname[], size_t len) { if (gethostname(hostname, len) != 0) return FAILURE; else return SUCCESS; } int main(int argc, char *argv[]) { char buffer[LINE_LEN]; char hostname[HOST_NAME_MAX]; if (argc != 1) { usage(); } if (init_hostname(hostname, HOST_NAME_MAX) != SUCCESS) { fprintf(stderr, "init_hostname error\n"); exit(1); } if (init_gpuapi() != SUCCESS) { fprintf(stderr, "init_gpuapi error\n"); // exit(1); } #if !defined(STANDALONE) while (fgets(buffer, sizeof(buffer), stdin) != NULL) #endif { if (memcmp(buffer, "quit", sizeof("quit")-1) == MATCH) { shutdown_gpuapi(); return SUCCESS; /* done */ } { printf("begin\n"); listdevices(hostname); printf("end\n"); fflush(stdout); } } return SUCCESS; }