From c8c68fb5f79d28555bcc65c423e4b250fca85bbf Mon Sep 17 00:00:00 2001
From: Ke Wen
Date: Fri, 12 Jul 2019 08:30:05 -0700
Subject: Size up IPC buffers to multiples of 2MB

Avoid potential CUDA error in concurrent communicator initialization
---
 src/include/comm.h | 2 +-
 src/transport/p2p.cc | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/include/comm.h b/src/include/comm.h
index 132eb39..3b2a85d 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -23,7 +23,7 @@ struct cudaLaunchParams {
 
 #define CACHE_LINE_SIZE 128
 #define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
+#define CUDA_IPC_MIN 2097152UL
 
 struct ncclSendMem {
   union {
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 42b549e..62bd725 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -468,7 +468,8 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
   struct p2pSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
-  const int sendSize = sizeof(struct ncclSendMem);
+  int sendSize = sizeof(struct ncclSendMem);
+  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
   NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
 
   struct p2pConnectInfo info;
@@ -517,7 +518,8 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
   struct p2pRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
-  const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
   NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
 
   struct p2pConnectInfo info;
-- 
cgit v1.2.3