Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSylvain Jeaugey <sjeaugey@nvidia.com>2020-02-12 22:04:35 +0300
committerSylvain Jeaugey <sjeaugey@nvidia.com>2020-02-12 22:11:55 +0300
commitc38f174bd436031dbc79dce19ff969f377976a8a (patch)
treed329ae936567710d56cffed83810e01815bd2765
parent3701130b3c1bcdb01c14b3cb70fe52498c1e82b7 (diff)
Fix Allgather operations above 4G with multiple GPUs per process.
Fixes nccl-tests#37. Direct offsets were still on 32 bits in the low-level primitives.
-rw-r--r--src/collectives/device/primitives.h18
1 files changed, 9 insertions, 9 deletions
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index aa3d20d..b624359 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -143,12 +143,12 @@ class ncclPrimitives {
}
template <int DIRECTRECV>
- inline __device__ const T* directRecvPtr(int i, int directOffset) {
+ inline __device__ const T* directRecvPtr(int i, ssize_t directOffset) {
return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
}
template <int DIRECTSEND>
- inline __device__ T* directSendPtr(int i, int directOffset) {
+ inline __device__ T* directSendPtr(int i, ssize_t directOffset) {
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
}
@@ -164,7 +164,7 @@ class ncclPrimitives {
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
inline __device__ void
- GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
+ GenericOp(const T* srcPtr, T* dstPtr, int nelem, ssize_t directOffset) {
int offset = 0;
int sliceSize = stepSize*SLICESTEPS;
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
@@ -310,7 +310,7 @@ class ncclPrimitives {
GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
}
__device__ __forceinline__ void
- directSend(const T* src, int directOffset, int nelem) {
+ directSend(const T* src, ssize_t directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
}
@@ -319,7 +319,7 @@ class ncclPrimitives {
GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
}
__device__ __forceinline__ void
- directRecv(T* dst, int directOffset, int nelem) {
+ directRecv(T* dst, ssize_t directOffset, int nelem) {
GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
}
@@ -328,7 +328,7 @@ class ncclPrimitives {
GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
}
__device__ __forceinline__ void
- directCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ directCopySend(const T* src, T* dst, ssize_t directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
}
@@ -337,7 +337,7 @@ class ncclPrimitives {
GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
}
__device__ __forceinline__ void
- directRecvCopySend(T* dst, int directOffset, int nelem) {
+ directRecvCopySend(T* dst, ssize_t directOffset, int nelem) {
GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
}
@@ -356,7 +356,7 @@ class ncclPrimitives {
GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
}
__device__ __forceinline__ void
- directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ directRecvReduceCopySend(const T* src, T* dst, ssize_t directOffset, int nelem) {
// Direct is only for the send part
GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
}