Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoraokomoriuta <internal@enmps.net>2020-01-29 16:05:12 +0300
committerSylvain Jeaugey <sjeaugey@nvidia.com>2020-04-15 05:00:51 +0300
commita783484ab5003e96774c12d14f555ef969a410fa (patch)
tree78deb517c4af865417dad502184a1c336cf8ff9d
parentb5b6c6acdd40b816e79fcffb251346ca73dd7bcd (diff)
Fix wrong variable name "slice" to "chunk"
https://github.com/NVIDIA/nccl/issues/287
-rw-r--r--  src/collectives/device/all_reduce.h  66
1 file changed, 33 insertions(+), 33 deletions(-)
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 173b5fa..4e04f88 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -37,19 +37,19 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
- int slice;
+ int chunk;
// step 0: push data to next GPU
- slice = ring->devUserRanks[nranks-1];
- offset = chunkOffset + slice * realChunkSize;
+ chunk = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * realChunkSize;
+ chunk = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
prims.recvReduceSend(thisInput+offset, nelem);
@@ -57,24 +57,24 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
- slice = ring->devUserRanks[0];
- offset = chunkOffset + slice * realChunkSize;
+ chunk = ring->devUserRanks[0];
+ offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * realChunkSize;
+ chunk = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
// Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = chunkOffset + slice * realChunkSize;
+ chunk = ring->devUserRanks[1];
+ offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
// Final wait/copy.
@@ -225,19 +225,19 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
- int slice;
+ int chunk;
// step 0: push data to next GPU
- slice = ring->devUserRanks[nranks-1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[nranks-1];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -245,24 +245,24 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
- slice = ring->devUserRanks[0];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[0];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[1];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
@@ -413,19 +413,19 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
int nelem;
- int slice;
+ int chunk;
// step 0: push data to next GPU
- slice = ring->devUserRanks[nranks-1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[nranks-1];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -433,24 +433,24 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
- slice = ring->devUserRanks[0];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[0];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ chunk = ring->devUserRanks[1];
+ offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.