Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/nvmlwrap.h')
-rw-r--r--src/include/nvmlwrap.h149
1 files changed, 149 insertions, 0 deletions
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
new file mode 100644
index 0000000..ddfd233
--- /dev/null
+++ b/src/include/nvmlwrap.h
@@ -0,0 +1,149 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVMLWRAP_H_
+#define NCCL_NVMLWRAP_H_
+
+#include "core.h"
+
+//#define NVML_DIRECT 1
+#ifdef NVML_DIRECT
+#include "nvml.h"
+
+#define NVMLCHECK(cmd) do { \
+ nvmlReturn_t e = cmd; \
+ if( e != NVML_SUCCESS ) { \
+ WARN("NVML failure '%s'", nvmlErrorString(e)); \
+ return ncclSystemError; \
+ } \
+} while(false)
+
+static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
+static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
+static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
+static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+ NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+ NVMLCHECK(nvmlDeviceGetIndex(device, index));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
+ NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
+ NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
+ NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+ NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+ NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+ NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkCapability_t capability, unsigned int *capResult) {
+ NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
+ return ncclSuccess;
+}
+#else
+// Dynamically handle dependencies on NVML
+
+/* Extracted from nvml.h */
+typedef struct nvmlDevice_st* nvmlDevice_t;
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
+
+typedef enum nvmlEnableState_enum
+{
+ NVML_FEATURE_DISABLED = 0, //!< Feature disabled
+ NVML_FEATURE_ENABLED = 1 //!< Feature enabled
+} nvmlEnableState_t;
+
+typedef enum nvmlNvLinkCapability_enum
+{
+ NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
+ NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
+ NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
+ NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported
+ NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
+ NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
+ // should be last
+ NVML_NVLINK_CAP_COUNT
+} nvmlNvLinkCapability_t;
+
+typedef enum nvmlReturn_enum
+{
+ NVML_SUCCESS = 0, //!< The operation was successful
+ NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
+ NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
+ NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
+ NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
+ NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
+ NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
+ NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
+ NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
+ NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
+ NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
+ NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
+ NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
+ NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
+ NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
+ NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
+ NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
+ NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
+ NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
+ NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
+ NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
+} nvmlReturn_t;
+
+typedef struct nvmlPciInfo_st
+{
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
+ unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff
+ unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
+ unsigned int device; //!< The device's id on the bus, 0 to 31
+ unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
+
+ // Added in NVML 2.285 API
+ unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
+
+ // NVIDIA reserved for internal use only
+ unsigned int reserved0;
+ unsigned int reserved1;
+ unsigned int reserved2;
+ unsigned int reserved3;
+} nvmlPciInfo_t;
+/* End of nvml.h */
+
+ncclResult_t wrapNvmlSymbols(void);
+
+ncclResult_t wrapNvmlInit(void);
+ncclResult_t wrapNvmlShutdown(void);
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
+ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
+ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
+ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkCapability_t capability, unsigned int *capResult);
+#endif // NVML_DIRECT
+
+#endif // End include guard