Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSylvain Jeaugey <sjeaugey@nvidia.com>2016-09-20 21:52:15 +0300
committerSylvain Jeaugey <sjeaugey@nvidia.com>2016-09-22 21:58:33 +0300
commitca330b110ae76ace344182ab83a028911111cc36 (patch)
tree2225d406a3eca4ec735213e1b86e4ecfa2eafd22
parent6c77476cc1aeb38a7666550813baaedde3dd32d1 (diff)
Add scan testsv1.3.0-1
-rw-r--r--Makefile6
-rw-r--r--test/single/all_gather_scan.cu239
-rw-r--r--test/single/all_reduce_scan.cu247
-rw-r--r--test/single/broadcast_scan.cu232
-rw-r--r--test/single/reduce_scan.cu238
-rw-r--r--test/single/reduce_scatter_scan.cu249
6 files changed, 1210 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 35e5eef..177230b 100644
--- a/Makefile
+++ b/Makefile
@@ -130,7 +130,11 @@ MPI_INC ?= $(MPI_HOME)/include
MPI_LIB ?= $(MPI_HOME)/lib
MPIFLAGS := -I$(MPI_INC) -L$(MPI_LIB) -lmpi
-TESTS := all_gather_test all_reduce_test broadcast_test reduce_test reduce_scatter_test
+TESTS := all_gather_test all_gather_scan \
+ all_reduce_test all_reduce_scan \
+ broadcast_test broadcast_scan \
+ reduce_test reduce_scan \
+ reduce_scatter_test reduce_scatter_scan
MPITESTS := mpi_test
TSTINC := -I$(NCCL_INC) -Itest/include
diff --git a/test/single/all_gather_scan.cu b/test/single/all_gather_scan.cu
new file mode 100644
index 0000000..becf315
--- /dev/null
+++ b/test/single/all_gather_scan.cu
@@ -0,0 +1,239 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+ printf("\n"
+ "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+ "Where:\n"
+#ifdef CUDA_HAS_HALF
+ " type = [char|int|half|float|double|int64|uint64]\n"
+#else
+ " type = [char|int|float|double|int64|uint64]\n"
+#endif
+ " n_min > 0\n"
+ " n_max >= n_min\n"
+ " delta > 0\n\n", bin);
+ return;
+}
+
+int main(int argc, char* argv[]) {
+ int nvis = 0;
+ CUDACHECK(cudaGetDeviceCount(&nvis));
+ if (nvis == 0) {
+ printf("No GPUs found\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ ncclDataType_t type;
+ int n_min;
+ int n_max;
+ int delta;
+ int gpus;
+ int* list = NULL;
+
+ if (argc < 4) {
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ type = strToType(argv[1]);
+ if (type == nccl_NUM_TYPES) {
+ printf("Invalid <type> '%s'\n", argv[1]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_min = strToPosInt(argv[2]);
+ if (n_min < 1) {
+ printf("Invalid <n_min> '%s'\n", argv[2]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_max = strToPosInt(argv[3]);
+ if (n_max < n_min) {
+ printf("Invalid <n_max> '%s'\n", argv[3]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ if (argc > 4) {
+ delta = strToPosInt(argv[4]);
+ if (delta < 1) {
+ printf("Invalid <delta> '%s'\n", argv[4]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+ }
+
+ if (argc > 5) {
+ gpus = strToPosInt(argv[5]);
+ if (gpus < 1) {
+ printf("Invalid <gpus> '%s'\n", argv[5]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ gpus = nvis;
+ }
+
+ list = (int*)malloc(gpus*sizeof(int));
+
+ if (argc > 6 && argc != 6+gpus) {
+ printf("If given, GPU list must be fully specified.\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ if(argc > 6) {
+ list[g] = strToNonNeg(argv[6+g]);
+ if (list[g] < 0) {
+ printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ } else if (list[g] >= nvis) {
+ printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ list[g] = g % nvis;
+ }
+ }
+
+ size_t word = wordSize(type);
+ size_t max_input = n_max * word;
+ size_t max_output = max_input * gpus;
+ void* refout;
+ CUDACHECK(cudaMallocHost(&refout, max_output));
+
+ void **input, **output;
+ double** localError;
+ ncclComm_t* comm;
+ cudaStream_t* stream;
+
+ input = (void**)malloc(gpus*sizeof(void*));
+ output = (void**)malloc(gpus*sizeof(void*));
+ localError = (double**)malloc(gpus*sizeof(double*));
+ comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+ stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+ for(int g=0; g<gpus; ++g) {
+ char busid[32] = {0};
+ CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+ printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaMalloc(&input[g], max_input));
+ CUDACHECK(cudaMalloc(&output[g], max_output));
+ CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
+ CUDACHECK(cudaStreamCreate(&stream[g]));
+ makeRandom(input[g], n_max, type, 42+g);
+
+ CUDACHECK(cudaMemcpy((char*)refout+max_input*g, input[g], max_input, cudaMemcpyDeviceToHost));
+ }
+
+ NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+ printf(" BYTES ERROR MSEC BW\n");
+
+ for(int n=n_min; n<=n_max; n+=delta) {
+ size_t out_bytes = word * n * gpus;
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaMemsetAsync(output[g], 0, out_bytes, stream[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+
+ auto start = std::chrono::high_resolution_clock::now();
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ NCCLCHECK(ncclAllGather(input[g], n, type, output[g], comm[g], stream[g]));
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+ auto stop = std::chrono::high_resolution_clock::now();
+ double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+ (stop - start).count() * 1000.0;
+
+ double max_error = 0.0;
+ for(int slice=0; slice<gpus; ++slice) {
+ void* refSlice = (void*)((char*)refout + slice*max_input);
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ void* mySlice = (void*)((char*)output[g] + slice*n*word);
+ maxDiff(localError[g], mySlice, refSlice, n, type, stream[g]);
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ max_error = max(max_error, *localError[g]);
+ }
+ }
+
+ double mb = (double)(n*word * (gpus-1)) * 1.e-6;
+ double algbw = mb / ms;
+ printf("%12lu %5.0le %10.3lf %6.2lf\n",
+ n*word, max_error, ms, algbw);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamDestroy(stream[g]));
+ ncclCommDestroy(comm[g]);
+ CUDACHECK(cudaFree(input[g]));
+ CUDACHECK(cudaFree(output[g]));
+ CUDACHECK(cudaFreeHost(localError[g]));
+ }
+
+ free(localError);
+ free(output);
+ free(input);
+ free(comm);
+ free(stream);
+ CUDACHECK(cudaFreeHost(refout));
+ exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/all_reduce_scan.cu b/test/single/all_reduce_scan.cu
new file mode 100644
index 0000000..f93a099
--- /dev/null
+++ b/test/single/all_reduce_scan.cu
@@ -0,0 +1,247 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+ printf("\n"
+ "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+ "Where:\n"
+#ifdef CUDA_HAS_HALF
+ " type = [char|int|half|float|double|int64|uint64]\n"
+#else
+ " type = [char|int|float|double|int64|uint64]\n"
+#endif
+ " op = [sum|prod|max|min]\n"
+ " n_min > 0\n"
+ " n_max >= n_min\n"
+ " delta > 0\n\n", bin);
+ return;
+}
+
+int main(int argc, char* argv[]) {
+ int nvis = 0;
+ CUDACHECK(cudaGetDeviceCount(&nvis));
+ if (nvis == 0) {
+ printf("No GPUs found\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ ncclDataType_t type;
+ ncclRedOp_t op;
+ int n_min;
+ int n_max;
+ int delta;
+ int gpus;
+ int* list = NULL;
+
+ if (argc < 5) {
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ type = strToType(argv[1]);
+ if (type == nccl_NUM_TYPES) {
+ printf("Invalid <type> '%s'\n", argv[1]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ op = strToOp(argv[2]);
+ if (op == nccl_NUM_OPS) {
+ printf("Invalid <op> '%s'\n", argv[2]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_min = strToPosInt(argv[3]);
+ if (n_min < 1) {
+ printf("Invalid <n_min> '%s'\n", argv[3]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_max = strToPosInt(argv[4]);
+ if (n_max < n_min) {
+ printf("Invalid <n_max> '%s'\n", argv[4]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ if (argc > 5) {
+ delta = strToPosInt(argv[5]);
+ if (delta < 1) {
+ printf("Invalid <delta> '%s'\n", argv[5]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+ }
+
+ if (argc > 6) {
+ gpus = strToPosInt(argv[6]);
+ if (gpus < 1) {
+ printf("Invalid <gpus> '%s'\n", argv[6]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ gpus = nvis;
+ }
+
+ list = (int*)malloc(gpus*sizeof(int));
+
+ if (argc > 7 && argc != 7+gpus) {
+ printf("If given, GPU list must be fully specified.\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ if(argc > 7) {
+ list[g] = strToNonNeg(argv[7+g]);
+ if (list[g] < 0) {
+ printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ } else if (list[g] >= nvis) {
+ printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ list[g] = g % nvis;
+ }
+ }
+
+ size_t word = wordSize(type);
+ size_t max_size = n_max * word;
+ void* refout;
+ CUDACHECK(cudaMallocHost(&refout, max_size));
+
+ void **input, **output;
+ double** localError;
+ ncclComm_t* comm;
+ cudaStream_t* stream;
+
+ input = (void**)malloc(gpus*sizeof(void*));
+ output = (void**)malloc(gpus*sizeof(void*));
+ localError = (double**)malloc(gpus*sizeof(double*));
+ comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+ stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+ for(int g=0; g<gpus; ++g) {
+ char busid[32] = {0};
+ CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+ printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaMalloc(&input[g], max_size));
+ CUDACHECK(cudaMalloc(&output[g], max_size));
+ CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
+ CUDACHECK(cudaStreamCreate(&stream[g]));
+ makeRandom(input[g], n_max, type, 42+g);
+
+ if (g == 0)
+ CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
+ else
+ accVec(refout, input[g], n_max, type, op);
+ }
+
+ NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+ printf(" BYTES ERROR MSEC ALGBW BUSBW\n");
+
+ for(int n=n_min; n<=n_max; n+=delta) {
+ size_t bytes = word * n;
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+
+ auto start = std::chrono::high_resolution_clock::now();
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ NCCLCHECK(ncclAllReduce(input[g], output[g], n, type, op, comm[g], stream[g]));
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+ auto stop = std::chrono::high_resolution_clock::now();
+ double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+ (stop - start).count() * 1000.0;
+
+ double max_error = 0.0;
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ maxDiff(localError[g], output[g], refout, n, type, stream[g]);
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ max_error = max(max_error, *localError[g]);
+ }
+
+ double mb = (double)bytes * 1.e-6;
+ double algbw = mb / ms;
+ double busbw = algbw * (double)(2*gpus - 2) / (double)gpus;
+ printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
+ n*word, max_error, ms, algbw, busbw);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamDestroy(stream[g]));
+ ncclCommDestroy(comm[g]);
+ CUDACHECK(cudaFree(input[g]));
+ CUDACHECK(cudaFree(output[g]));
+ CUDACHECK(cudaFreeHost(localError[g]));
+ }
+
+ free(localError);
+ free(output);
+ free(input);
+ free(comm);
+ free(stream);
+ CUDACHECK(cudaFreeHost(refout));
+ exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/broadcast_scan.cu b/test/single/broadcast_scan.cu
new file mode 100644
index 0000000..ea11c7d
--- /dev/null
+++ b/test/single/broadcast_scan.cu
@@ -0,0 +1,232 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+ printf("\n"
+ "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+ "Where:\n"
+#ifdef CUDA_HAS_HALF
+ " type = [char|int|half|float|double|int64|uint64]\n"
+#else
+ " type = [char|int|float|double|int64|uint64]\n"
+#endif
+ " n_min > 0\n"
+ " n_max >= n_min\n"
+ " delta > 0\n\n", bin);
+ return;
+}
+
+int main(int argc, char* argv[]) {
+ int nvis = 0;
+ CUDACHECK(cudaGetDeviceCount(&nvis));
+ if (nvis == 0) {
+ printf("No GPUs found\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ ncclDataType_t type;
+ int n_min;
+ int n_max;
+ int delta;
+ int gpus;
+ int* list = NULL;
+
+ if (argc < 4) {
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ type = strToType(argv[1]);
+ if (type == nccl_NUM_TYPES) {
+ printf("Invalid <type> '%s'\n", argv[1]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_min = strToPosInt(argv[2]);
+ if (n_min < 1) {
+ printf("Invalid <n_min> '%s'\n", argv[2]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_max = strToPosInt(argv[3]);
+ if (n_max < n_min) {
+ printf("Invalid <n_max> '%s'\n", argv[3]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ if (argc > 4) {
+ delta = strToPosInt(argv[4]);
+ if (delta < 1) {
+ printf("Invalid <delta> '%s'\n", argv[4]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+ }
+
+ if (argc > 5) {
+ gpus = strToPosInt(argv[5]);
+ if (gpus < 1) {
+ printf("Invalid <gpus> '%s'\n", argv[5]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ gpus = nvis;
+ }
+
+ list = (int*)malloc(gpus*sizeof(int));
+
+ if (argc > 6 && argc != 6+gpus) {
+ printf("If given, GPU list must be fully specified.\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ if(argc > 6) {
+ list[g] = strToNonNeg(argv[6+g]);
+ if (list[g] < 0) {
+ printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ } else if (list[g] >= nvis) {
+ printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ list[g] = g % nvis;
+ }
+ }
+
+ size_t word = wordSize(type);
+ size_t max_size = n_max * word;
+ void* refout;
+ CUDACHECK(cudaMallocHost(&refout, max_size));
+
+ void** io;
+ double* localError;
+ ncclComm_t* comm;
+ cudaStream_t* stream;
+
+ io = (void**)malloc(gpus*sizeof(void*));
+ CUDACHECK(cudaMallocHost(&localError, gpus*sizeof(double)));
+ comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+ stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+ for(int g=0; g<gpus; ++g) {
+ char busid[32] = {0};
+ CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+ printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamCreate(&stream[g]));
+ CUDACHECK(cudaMalloc(&io[g], max_size));
+ if(g == 0) {
+ makeRandom(io[g], n_max, type, 42+g);
+ CUDACHECK(cudaMemcpy(refout, io[g], max_size, cudaMemcpyDeviceToHost));
+ }
+ }
+
+ NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+ printf(" BYTES ERROR MSEC BW\n");
+
+ for(int n=n_min; n<=n_max; n+=delta) {
+ size_t bytes = word * n;
+
+ for(int g=1; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaMemsetAsync(io[g], 0, bytes, stream[g]));
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[0]));
+ }
+
+ auto start = std::chrono::high_resolution_clock::now();
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ NCCLCHECK(ncclBcast(io[g], n, type, 0, comm[g], stream[g]));
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+ auto stop = std::chrono::high_resolution_clock::now();
+ double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+ (stop - start).count() * 1000.0;
+
+ for(int g=1; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ maxDiff(localError+g, io[g], refout, n, type, stream[g]);
+ }
+ double maxError = 0.0;
+ for(int g=1; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ maxError = max(maxError, localError[g]);
+ }
+
+ double mb = (double)bytes * 1.e-6;
+ double algbw = mb / ms;
+ printf("%12lu %5.0le %10.3lf %6.2lf\n",
+ n*word, maxError, ms, algbw);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamDestroy(stream[g]));
+ ncclCommDestroy(comm[g]);
+ CUDACHECK(cudaFree(io[g]));
+ }
+
+ free(io);
+ free(comm);
+ free(stream);
+ CUDACHECK(cudaFreeHost(refout));
+ CUDACHECK(cudaFreeHost(localError));
+ exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/reduce_scan.cu b/test/single/reduce_scan.cu
new file mode 100644
index 0000000..f42643e
--- /dev/null
+++ b/test/single/reduce_scan.cu
@@ -0,0 +1,238 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+ printf("\n"
+ "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+ "Where:\n"
+#ifdef CUDA_HAS_HALF
+ " type = [char|int|half|float|double|int64|uint64]\n"
+#else
+ " type = [char|int|float|double|int64|uint64]\n"
+#endif
+ " op = [sum|prod|max|min]\n"
+ " n_min > 0\n"
+ " n_max >= n_min\n"
+ " delta > 0\n\n", bin);
+ return;
+}
+
+int main(int argc, char* argv[]) {
+ int nvis = 0;
+ CUDACHECK(cudaGetDeviceCount(&nvis));
+ if (nvis == 0) {
+ printf("No GPUs found\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ ncclDataType_t type;
+ ncclRedOp_t op;
+ int n_min;
+ int n_max;
+ int delta;
+ int gpus;
+ int* list = NULL;
+
+ if (argc < 5) {
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ type = strToType(argv[1]);
+ if (type == nccl_NUM_TYPES) {
+ printf("Invalid <type> '%s'\n", argv[1]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ op = strToOp(argv[2]);
+ if (op == nccl_NUM_OPS) {
+ printf("Invalid <op> '%s'\n", argv[2]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_min = strToPosInt(argv[3]);
+ if (n_min < 1) {
+ printf("Invalid <n_min> '%s'\n", argv[3]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_max = strToPosInt(argv[4]);
+ if (n_max < n_min) {
+ printf("Invalid <n_max> '%s'\n", argv[4]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ if (argc > 5) {
+ delta = strToPosInt(argv[5]);
+ if (delta < 1) {
+ printf("Invalid <delta> '%s'\n", argv[5]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+ }
+
+ if (argc > 6) {
+ gpus = strToPosInt(argv[6]);
+ if (gpus < 1) {
+ printf("Invalid <gpus> '%s'\n", argv[6]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ gpus = nvis;
+ }
+
+ list = (int*)malloc(gpus*sizeof(int));
+
+ if (argc > 7 && argc != 7+gpus) {
+ printf("If given, GPU list must be fully specified.\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ if(argc > 7) {
+ list[g] = strToNonNeg(argv[7+g]);
+ if (list[g] < 0) {
+ printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ } else if (list[g] >= nvis) {
+ printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ list[g] = g % nvis;
+ }
+ }
+
+ size_t word = wordSize(type);
+ size_t max_size = n_max * word;
+ void* refout;
+ CUDACHECK(cudaMallocHost(&refout, max_size));
+
+ void** input;
+ void* output; // always goes on rank 0
+ double* maxError;
+ ncclComm_t* comm;
+ cudaStream_t* stream;
+
+ input = (void**)malloc(gpus*sizeof(void*));
+ comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+ stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+ for(int g=0; g<gpus; ++g) {
+ char busid[32] = {0};
+ CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+ printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamCreate(&stream[g]));
+ CUDACHECK(cudaMalloc(&input[g], max_size));
+ makeRandom(input[g], n_max, type, 42+g);
+
+ if (g == 0) {
+ CUDACHECK(cudaMalloc(&output, max_size));
+ CUDACHECK(cudaMallocHost(&maxError, sizeof(double)));
+ CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
+ } else {
+ accVec(refout, input[g], n_max, type, op);
+ }
+ }
+
+ NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+ printf(" BYTES ERROR MSEC BW\n");
+
+ for(int n=n_min; n<=n_max; n+=delta) {
+ size_t bytes = word * n;
+
+ CUDACHECK(cudaSetDevice(list[0]));
+ CUDACHECK(cudaMemsetAsync(output, 0, bytes, stream[0]));
+ for(int g=0; g<gpus; ++g)
+ CUDACHECK(cudaStreamSynchronize(stream[0]));
+
+ auto start = std::chrono::high_resolution_clock::now();
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ NCCLCHECK(ncclReduce(input[g], output, n, type, op, 0, comm[g], stream[g]));
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+ auto stop = std::chrono::high_resolution_clock::now();
+ double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+ (stop - start).count() * 1000.0;
+
+ CUDACHECK(cudaSetDevice(list[0]));
+ maxDiff(maxError, output, refout, n, type, stream[0]);
+ CUDACHECK(cudaStreamSynchronize(stream[0]));
+
+ double mb = (double)bytes * 1.e-6;
+ double algbw = mb / ms;
+ printf("%12lu %5.0le %10.3lf %6.2lf\n",
+ n*word, *maxError, ms, algbw);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamDestroy(stream[g]));
+ ncclCommDestroy(comm[g]);
+ CUDACHECK(cudaFree(input[g]));
+ if(g == 0) {
+ CUDACHECK(cudaFree(output));
+ CUDACHECK(cudaFreeHost(maxError));
+ }
+ }
+
+ free(input);
+ free(comm);
+ free(stream);
+ CUDACHECK(cudaFreeHost(refout));
+ exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/reduce_scatter_scan.cu b/test/single/reduce_scatter_scan.cu
new file mode 100644
index 0000000..8c37508
--- /dev/null
+++ b/test/single/reduce_scatter_scan.cu
@@ -0,0 +1,249 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+ printf("\n"
+ "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+ "Where:\n"
+#ifdef CUDA_HAS_HALF
+ " type = [char|int|half|float|double|int64|uint64]\n"
+#else
+ " type = [char|int|float|double|int64|uint64]\n"
+#endif
+ " op = [sum|prod|max|min]\n"
+ " n_min > 0\n"
+ " n_max >= n_min\n"
+ " delta > 0\n\n", bin);
+ return;
+}
+
+int main(int argc, char* argv[]) {
+ int nvis = 0;
+ CUDACHECK(cudaGetDeviceCount(&nvis));
+ if (nvis == 0) {
+ printf("No GPUs found\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ ncclDataType_t type;
+ ncclRedOp_t op;
+ int n_min;
+ int n_max;
+ int delta;
+ int gpus;
+ int* list = NULL;
+
+ if (argc < 5) {
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ type = strToType(argv[1]);
+ if (type == nccl_NUM_TYPES) {
+ printf("Invalid <type> '%s'\n", argv[1]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ op = strToOp(argv[2]);
+ if (op == nccl_NUM_OPS) {
+ printf("Invalid <op> '%s'\n", argv[2]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_min = strToPosInt(argv[3]);
+ if (n_min < 1) {
+ printf("Invalid <n_min> '%s'\n", argv[3]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ n_max = strToPosInt(argv[4]);
+ if (n_max < n_min) {
+ printf("Invalid <n_max> '%s'\n", argv[4]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ if (argc > 5) {
+ delta = strToPosInt(argv[5]);
+ if (delta < 1) {
+ printf("Invalid <delta> '%s'\n", argv[5]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+ }
+
+ if (argc > 6) {
+ gpus = strToPosInt(argv[6]);
+ if (gpus < 1) {
+ printf("Invalid <gpus> '%s'\n", argv[6]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ gpus = nvis;
+ }
+
+ list = (int*)malloc(gpus*sizeof(int));
+
+ if (argc > 7 && argc != 7+gpus) {
+ printf("If given, GPU list must be fully specified.\n");
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ if(argc > 7) {
+ list[g] = strToNonNeg(argv[7+g]);
+ if (list[g] < 0) {
+ printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ } else if (list[g] >= nvis) {
+ printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+ showUsage(argv[0]);
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ list[g] = g % nvis;
+ }
+ }
+
+ size_t word = wordSize(type);
+ size_t max_output = n_max * word;
+ size_t max_input = gpus * max_output;
+ void* refout;
+ CUDACHECK(cudaMallocHost(&refout, max_input)); // contains entire reduction
+
+ void **input, **output;
+ double** localError;
+ ncclComm_t* comm;
+ cudaStream_t* stream;
+
+ input = (void**)malloc(gpus*sizeof(void*));
+ output = (void**)malloc(gpus*sizeof(void*));
+ localError = (double**)malloc(gpus*sizeof(double*));
+ comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+ stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+ for(int g=0; g<gpus; ++g) {
+ char busid[32] = {0};
+ CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+ printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaMalloc(&input[g], max_input));
+ CUDACHECK(cudaMalloc(&output[g], max_output));
+ CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
+ CUDACHECK(cudaStreamCreate(&stream[g]));
+ makeRandom(input[g], n_max*gpus, type, 42+g);
+
+ if (g == 0)
+ CUDACHECK(cudaMemcpy(refout, input[g], max_input, cudaMemcpyDeviceToHost));
+ else
+ accVec(refout, input[g], n_max*gpus, type, op);
+ }
+
+ NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+ printf(" BYTES ERROR MSEC ALGBW BUSBW\n");
+
+ for(int n=n_min; n<=n_max; n+=delta) {
+ size_t bytes = word * n;
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+
+ auto start = std::chrono::high_resolution_clock::now();
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ NCCLCHECK(ncclReduceScatter(input[g], output[g], n, type, op, comm[g], stream[g]));
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ }
+ auto stop = std::chrono::high_resolution_clock::now();
+ double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+ (stop - start).count() * 1000.0;
+
+ double max_error = 0.0;
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ void* myRef = (void*)((char*)refout + g*bytes);
+ maxDiff(localError[g], output[g], myRef, n, type, stream[g]);
+ }
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamSynchronize(stream[g]));
+ max_error = max(max_error, *localError[g]);
+ }
+
+ double mb = (double)bytes * 1.e-6;
+ double algbw = mb / ms;
+ double busbw = algbw * (double)(gpus - 1);
+ printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
+ n*word, max_error, ms, algbw, busbw);
+ }
+
+ for(int g=0; g<gpus; ++g) {
+ CUDACHECK(cudaSetDevice(list[g]));
+ CUDACHECK(cudaStreamDestroy(stream[g]));
+ ncclCommDestroy(comm[g]);
+ CUDACHECK(cudaFree(input[g]));
+ CUDACHECK(cudaFree(output[g]));
+ CUDACHECK(cudaFreeHost(localError[g]));
+ }
+
+ free(localError);
+ free(output);
+ free(input);
+ free(comm);
+ free(stream);
+ CUDACHECK(cudaFreeHost(refout));
+ exit(EXIT_SUCCESS);
+}
+