Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/collectives/device')
-rw-r--r--src/collectives/device/Makefile2
-rw-r--r--src/collectives/device/all_gather.cu2
-rw-r--r--src/collectives/device/all_gather.h8
-rw-r--r--src/collectives/device/all_reduce.cu2
-rw-r--r--src/collectives/device/all_reduce.h12
-rw-r--r--src/collectives/device/broadcast.cu2
-rw-r--r--src/collectives/device/broadcast.h8
-rw-r--r--src/collectives/device/common.h6
-rw-r--r--src/collectives/device/common_kernel.h4
-rw-r--r--src/collectives/device/functions.cu4
-rwxr-xr-xsrc/collectives/device/gen_rules.sh2
-rw-r--r--src/collectives/device/primitives.h54
-rw-r--r--src/collectives/device/reduce.cu2
-rw-r--r--src/collectives/device/reduce.h8
-rw-r--r--src/collectives/device/reduce_scatter.cu2
-rw-r--r--src/collectives/device/reduce_scatter.h8
16 files changed, 55 insertions, 71 deletions
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 8e92596..0ee587b 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 530bf14..109c341 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 36809c9..8e78730 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index aaa96b4..85d007e 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index ea89a71..9b058cc 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
@@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index b83ee70..8c8dbb6 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index fb18312..ae8667f 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index e4aecbd..8c336bf 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,7 +8,7 @@
#define NCCL_DEVICE_COMMON_H_
#include "../collectives.h"
-#include "core.h"
+#include "devcomm.h"
#include "nccl.h"
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
@@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
\
- struct ncclComm* comm = firstColl.args.comm; \
+ struct ncclDevComm* comm = firstColl.args.comm; \
struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
if (bid == 0) { \
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index e1fb096..435a598 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +7,7 @@
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
-#include "core.h"
+#include "devcomm.h"
#include <cstdio>
#include <cstdint>
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index ea06b68..010c454 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "collectives.h"
#include "common.h"
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 3942c8c..4413213 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index c5aaf54..7beeaf4 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -50,7 +50,7 @@ class ncclPrimitives {
T* sendDirectBuff[NSEND];
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
- struct ncclComm* comm;
+ struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@@ -239,7 +239,7 @@ class ncclPrimitives {
public:
__device__ __forceinline__
- ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it
__syncthreads();
@@ -329,14 +329,14 @@ class ncclLLPrimitives {
uint64_t sendConnHead;
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
- struct ncclComm* comm;
+ struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
- inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
- inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
+ inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+ inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
@@ -393,7 +393,10 @@ class ncclLLPrimitives {
sendConnHead = *waitPtr;
if (checkAbort(sendConn[i]->opCountRem)) break;
}
- if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
+ if (fifoPtr) {
+ int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+ fifoPtr[sendStep[i]%NCCL_STEPS] = size;
+ }
}
}
@@ -402,7 +405,12 @@ class ncclLLPrimitives {
if (tid == i) *postPtr = recvStep[i];
}
- inline __device__ void postSend(int i) {
+ inline __device__ void postSend(int i, int offset) {
+ // LL Cleanup : write all flags in the slice to make sure we don't have
+ // data corruption when flag loops over.
+ if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+ for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+ }
sendStep[i]++;
}
@@ -443,9 +451,10 @@ class ncclLLPrimitives {
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
uint64_t* srcPack = (uint64_t*)srcPtr;
uint64_t* dstPack = (uint64_t*)dstPtr;
+ int offset = tid;
// Do multiples of 64 bits
#pragma unroll 2
- for (int offset=tid; offset<npack; offset+=nthreads) {
+ for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
if (RECV) {
@@ -471,7 +480,7 @@ class ncclLLPrimitives {
}
exitIfAbortLocalBarrier();
FOR_RECV(postRecv);
- FOR_SEND(postSend);
+ FOR_SEND(postSend, offset);
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
@@ -514,32 +523,9 @@ class ncclLLPrimitives {
}
}
- __device__ __forceinline__ void llSendCleaning(int i) {
- if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- /* Reset all flags */
- static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
- static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
- for (int s=0; s<NCCL_STEPS; s++) {
- waitSend(i, 0);
- for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
- const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
- sendPtr(i)[o].i4 = resetLine.i4;
- }
- }
- if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
- }
- }
-
- __device__ __forceinline__ void llRecvCleaning(int i) {
- if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- recvStep[i] += NCCL_STEPS;
- if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
- }
- }
-
public:
__device__ __forceinline__
- ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
@@ -577,8 +563,6 @@ class ncclLLPrimitives {
}
__device__ __forceinline__ ~ncclLLPrimitives() {
- for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
- for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
// Save steps for the next operation
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index 1ef66d4..a2caac5 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 302d053..d2d5d3b 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index 10857ed..8b45299 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index c70c845..09ba56e 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;