diff options
author | aokomoriuta <internal@enmps.net> | 2020-01-29 16:05:12 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2020-04-15 05:00:51 +0300 |
commit | a783484ab5003e96774c12d14f555ef969a410fa (patch) | |
tree | 78deb517c4af865417dad502184a1c336cf8ff9d | |
parent | b5b6c6acdd40b816e79fcffb251346ca73dd7bcd (diff) |
Fix wrong variable name "slice" to "chunk"
https://github.com/NVIDIA/nccl/issues/287
-rw-r--r-- | src/collectives/device/all_reduce.h | 66 |
1 file changed, 33 insertions(+), 33 deletions(-)
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 173b5fa..4e04f88 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -37,19 +37,19 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { /////////////// begin AllReduce steps /////////////// ssize_t offset; int nelem; - int slice; + int chunk; // step 0: push data to next GPU - slice = ring->devUserRanks[nranks-1]; - offset = chunkOffset + slice * realChunkSize; + chunk = ring->devUserRanks[nranks-1]; + offset = chunkOffset + chunk * realChunkSize; nelem = min(realChunkSize, size-offset); prims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { - slice = ring->devUserRanks[nranks-j]; - offset = chunkOffset + slice * realChunkSize; + chunk = ring->devUserRanks[nranks-j]; + offset = chunkOffset + chunk * realChunkSize; nelem = min(realChunkSize, size-offset); prims.recvReduceSend(thisInput+offset, nelem); @@ -57,24 +57,24 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU - slice = ring->devUserRanks[0]; - offset = chunkOffset + slice * realChunkSize; + chunk = ring->devUserRanks[0]; + offset = chunkOffset + chunk * realChunkSize; nelem = min(realChunkSize, size-offset); prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem); // k-2 steps: copy to next GPU for (int j=1; j<nranks-1; ++j) { - slice = ring->devUserRanks[nranks-j]; - offset = chunkOffset + slice * realChunkSize; + chunk = ring->devUserRanks[nranks-j]; + offset = chunkOffset + chunk * realChunkSize; nelem = min(realChunkSize, size-offset); prims.directRecvCopySend(thisOutput+offset, offset, nelem); } // Make final copy from buffer to dest. 
- slice = ring->devUserRanks[1]; - offset = chunkOffset + slice * realChunkSize; + chunk = ring->devUserRanks[1]; + offset = chunkOffset + chunk * realChunkSize; nelem = min(realChunkSize, size-offset); // Final wait/copy. @@ -225,19 +225,19 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { /////////////// begin AllReduce steps /////////////// ssize_t offset; int nelem; - int slice; + int chunk; // step 0: push data to next GPU - slice = ring->devUserRanks[nranks-1]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[nranks-1]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { - slice = ring->devUserRanks[nranks-j]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[nranks-j]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceSend(thisInput+offset, nelem); @@ -245,24 +245,24 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU - slice = ring->devUserRanks[0]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[0]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); // k-2 steps: copy to next GPU for (int j=1; j<nranks-1; ++j) { - slice = ring->devUserRanks[nranks-j]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[nranks-j]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvCopySend(thisOutput+offset, 
nelem); } // Make final copy from buffer to dest. - slice = ring->devUserRanks[1]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[1]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. @@ -413,19 +413,19 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) { /////////////// begin AllReduce steps /////////////// ssize_t offset; int nelem; - int slice; + int chunk; // step 0: push data to next GPU - slice = ring->devUserRanks[nranks-1]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[nranks-1]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { - slice = ring->devUserRanks[nranks-j]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[nranks-j]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceSend(thisInput+offset, nelem); @@ -433,24 +433,24 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) { // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU - slice = ring->devUserRanks[0]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[0]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); // k-2 steps: copy to next GPU for (int j=1; j<nranks-1; ++j) { - slice = ring->devUserRanks[nranks-j]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[nranks-j]; + offset = gridOffset + 
(chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvCopySend(thisOutput+offset, nelem); } // Make final copy from buffer to dest. - slice = ring->devUserRanks[1]; - offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + chunk = ring->devUserRanks[1]; + offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. |