/************************************************************************* * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ************************************************************************/ #ifndef CORE_H_ #define CORE_H_ #include "nccl.h" #include #include #define MAXFLAGS 8 #define MAXQUEUE 4 // Maximum number of queued collectives per communicator. #define DEFAULT_BUFFER_SIZE_BYTES (1UL << 21) // DIE on error #define CUDACHECK(cmd) do { \ cudaError_t e = cmd; \ if( e != cudaSuccess ) { \ printf("Cuda failure %s:%d '%s'\n", \ __FILE__,__LINE__,cudaGetErrorString(e)); \ exit(EXIT_FAILURE); \ } \ } while(false) #define NCCL_MEM_PAD_ALIGN 4096 typedef struct { cudaEvent_t isDone[MAXQUEUE]; int back; // Last event used } EventQueue; struct ncclMem { union { // Pad this block so that devBuff is correctly aligned struct { int flags[MAXFLAGS]; void* recvPtrs[MAXFLAGS]; }; char pad[NCCL_MEM_PAD_ALIGN]; }; // devBuff will likely be bigger ; we only use its offset/address. char buff[NCCL_MEM_PAD_ALIGN]; }; struct ncclNodeRef { ncclMem* remote; ncclMem* local; int remoteCleanup; void* cleanupHandle; }; struct ncclComm { int nDev; // number of devices in communicator int cudaDev; // cuda device index int ncclId; // nccl logical index // Device and Host allocated chunks. Stored here to correctly free() memory. ncclMem* devMem; ncclMem* hostMem; int hostMemState; // Placed between calling and internal device streams. EventQueue events; // Maps an internal nccl index to user-specified rank order. This is necessary // since we need to know how the user expects data to be ordered across // devices. int* userFromRing; // copy of the above stored on each device int* devUserFromRing; // Inverse of userFromRing. Maps user specified index to internal nccl index. int* ringFromUser; // Size of temp buffer in bytes. size_t buffSize; // Whether we have remote access to the recvbuff pointers passed from remote // GPUs. In single process mode this can be used as long as QPI links are // not present. In multi-process, we never push to a remote recvbuff. int useRemoteRecv; // Device-to-device communication structures to access remote or local device // memory. Actual allocation larger than 1. ncclNodeRef ptrs[1]; }; typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel; extern DebugLevel ncclDebugLevel; #define WARN(...) do { \ if (ncclDebugLevel >= WARN) { \ printf("WARN %s:%d ", __FILE__, __LINE__); \ printf(__VA_ARGS__); \ printf("\n"); \ if (ncclDebugLevel >= ABORT) abort(); \ } \ } while(0) #define INFO(...) do { \ if (ncclDebugLevel >= INFO) { \ printf("INFO "); printf(__VA_ARGS__); printf("\n"); \ } \ } while(0) #define DSOGLOBAL __attribute__((visibility("default"))) #endif // end include guard