diff options
author | Ke Wen <kwen@nvidia.com> | 2019-07-12 18:30:05 +0300 |
---|---|---|
committer | Ke Wen <kwen@nvidia.com> | 2019-07-12 19:50:17 +0300 |
commit | c8c68fb5f79d28555bcc65c423e4b250fca85bbf (patch) | |
tree | 16d7dc64a7e6debc9bfcae29e4ba450462db1bef | |
parent | 0b192d2299146e64a096aee16f8b8f7638d2d9d4 (diff) |
Size up IPC buffers to multiples of 2 MiB.
Avoids a potential CUDA error during concurrent communicator initialization.
-rw-r--r-- | src/include/comm.h | 2 | ||||
-rw-r--r-- | src/transport/p2p.cc | 6 |
2 files changed, 5 insertions, 3 deletions
diff --git a/src/include/comm.h b/src/include/comm.h
index 132eb39..3b2a85d 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -23,7 +23,7 @@ struct cudaLaunchParams {
 
 #define CACHE_LINE_SIZE 128
 #define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
+#define CUDA_IPC_MIN 2097152UL
 
 struct ncclSendMem {
   union {
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 42b549e..62bd725 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -468,7 +468,8 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
   struct p2pSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
-  const int sendSize = sizeof(struct ncclSendMem);
+  int sendSize = sizeof(struct ncclSendMem);
+  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
   NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
 
   struct p2pConnectInfo info;
@@ -517,7 +518,8 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
   struct p2pRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
-  const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
   NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
 
   struct p2pConnectInfo info;