Add scan testsv1.3.0-1

author: Sylvain Jeaugey <sjeaugey@nvidia.com> 2016-09-20 21:52:15 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com> 2016-09-22 21:58:33 +0300
commit: ca330b110ae76ace344182ab83a028911111cc36 (patch)
tree: 2225d406a3eca4ec735213e1b86e4ecfa2eafd22
parent: 6c77476cc1aeb38a7666550813baaedde3dd32d1 (diff)
6 files changed, 1210 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 35e5eef..177230b 100644
--- a/Makefile
+++ b/Makefile
@@ -130,7 +130,11 @@ MPI_INC ?= $(MPI_HOME)/include
 MPI_LIB ?= $(MPI_HOME)/lib
 MPIFLAGS   := -I$(MPI_INC) -L$(MPI_LIB) -lmpi
 
-TESTS       := all_gather_test all_reduce_test broadcast_test reduce_test reduce_scatter_test
+TESTS       := all_gather_test     all_gather_scan \
+               all_reduce_test     all_reduce_scan \
+               broadcast_test      broadcast_scan \
+               reduce_test         reduce_scan \
+               reduce_scatter_test reduce_scatter_scan
 MPITESTS    := mpi_test
 
 TSTINC     := -I$(NCCL_INC) -Itest/include
diff --git a/test/single/all_gather_scan.cu b/test/single/all_gather_scan.cu
new file mode 100644
index 0000000..becf315
--- /dev/null
+++ b/test/single/all_gather_scan.cu
@@ -0,0 +1,239 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+  printf("\n"
+         "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+         "Where:\n"
+#ifdef CUDA_HAS_HALF
+         "    type   =   [char|int|half|float|double|int64|uint64]\n"
+#else
+         "    type   =   [char|int|float|double|int64|uint64]\n"
+#endif
+         "    n_min  >   0\n"
+         "    n_max  >=  n_min\n"
+         "    delta  >   0\n\n", bin);
+  return;
+}
+
+int main(int argc, char* argv[]) {
+  int nvis = 0;
+  CUDACHECK(cudaGetDeviceCount(&nvis));
+  if (nvis == 0) {
+    printf("No GPUs found\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  ncclDataType_t type;
+  int n_min;
+  int n_max;
+  int delta;
+  int gpus;
+  int* list = NULL;
+
+  if (argc < 4) {
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  type = strToType(argv[1]);
+  if (type == nccl_NUM_TYPES) {
+    printf("Invalid <type> '%s'\n", argv[1]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_min = strToPosInt(argv[2]);
+  if (n_min < 1) {
+    printf("Invalid <n_min> '%s'\n", argv[2]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_max = strToPosInt(argv[3]);
+  if (n_max < n_min) {
+    printf("Invalid <n_max> '%s'\n", argv[3]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  if (argc > 4) {
+    delta = strToPosInt(argv[4]);
+    if (delta < 1) {
+      printf("Invalid <delta> '%s'\n", argv[4]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+  }
+
+  if (argc > 5) {
+    gpus = strToPosInt(argv[5]);
+    if (gpus < 1) {
+      printf("Invalid <gpus> '%s'\n", argv[5]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    gpus = nvis;
+  }
+
+  list = (int*)malloc(gpus*sizeof(int));
+
+  if (argc > 6 && argc != 6+gpus) {
+    printf("If given, GPU list must be fully specified.\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    if(argc > 6) {
+      list[g] = strToNonNeg(argv[6+g]);
+      if (list[g] < 0) {
+        printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      } else if (list[g] >= nvis) {
+        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      }
+    } else {
+      list[g] = g % nvis;
+    }
+  }
+
+  size_t word = wordSize(type);
+  size_t max_input = n_max * word;
+  size_t max_output = max_input * gpus;
+  void* refout;
+  CUDACHECK(cudaMallocHost(&refout, max_output));
+
+  void **input, **output;
+  double** localError;
+  ncclComm_t* comm;
+  cudaStream_t* stream;
+
+  input = (void**)malloc(gpus*sizeof(void*));
+  output = (void**)malloc(gpus*sizeof(void*));
+  localError = (double**)malloc(gpus*sizeof(double*));
+  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+  for(int g=0; g<gpus; ++g) {
+    char busid[32] = {0};
+    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaMalloc(&input[g],  max_input));
+    CUDACHECK(cudaMalloc(&output[g], max_output));
+    CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
+    CUDACHECK(cudaStreamCreate(&stream[g]));
+    makeRandom(input[g], n_max, type, 42+g);
+
+    CUDACHECK(cudaMemcpy((char*)refout+max_input*g, input[g], max_input, cudaMemcpyDeviceToHost));
+  }
+
+  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+  printf("       BYTES ERROR       MSEC     BW\n");
+
+  for(int n=n_min; n<=n_max; n+=delta) {
+    size_t out_bytes = word * n * gpus;
+
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaMemsetAsync(output[g], 0, out_bytes, stream[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      NCCLCHECK(ncclAllGather(input[g], n, type, output[g], comm[g], stream[g]));
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+        (stop - start).count() * 1000.0;
+
+    double max_error = 0.0;
+    for(int slice=0; slice<gpus; ++slice) {
+      void* refSlice = (void*)((char*)refout + slice*max_input);
+      for(int g=0; g<gpus; ++g) {
+        CUDACHECK(cudaSetDevice(list[g]));
+        void* mySlice = (void*)((char*)output[g] + slice*n*word);
+        maxDiff(localError[g], mySlice, refSlice, n, type, stream[g]);
+      }
+      for(int g=0; g<gpus; ++g) {
+        CUDACHECK(cudaSetDevice(list[g]));
+        CUDACHECK(cudaStreamSynchronize(stream[g]));
+        max_error = max(max_error, *localError[g]);
+      }
+    }
+
+    double mb = (double)(n*word * (gpus-1)) * 1.e-6;
+    double algbw = mb / ms;
+    printf("%12lu %5.0le %10.3lf %6.2lf\n",
+        n*word, max_error, ms, algbw);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaStreamDestroy(stream[g]));
+    ncclCommDestroy(comm[g]);
+    CUDACHECK(cudaFree(input[g]));
+    CUDACHECK(cudaFree(output[g]));
+    CUDACHECK(cudaFreeHost(localError[g]));
+  }
+
+  free(localError);
+  free(output);
+  free(input);
+  free(comm);
+  free(stream);
+  CUDACHECK(cudaFreeHost(refout));
+  exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/all_reduce_scan.cu b/test/single/all_reduce_scan.cu
new file mode 100644
index 0000000..f93a099
--- /dev/null
+++ b/test/single/all_reduce_scan.cu
@@ -0,0 +1,247 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+  printf("\n"
+         "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+         "Where:\n"
+#ifdef CUDA_HAS_HALF
+         "    type   =   [char|int|half|float|double|int64|uint64]\n"
+#else
+         "    type   =   [char|int|float|double|int64|uint64]\n"
+#endif
+         "    op     =   [sum|prod|max|min]\n"
+         "    n_min  >   0\n"
+         "    n_max  >=  n_min\n"
+         "    delta  >   0\n\n", bin);
+  return;
+}
+
+int main(int argc, char* argv[]) {
+  int nvis = 0;
+  CUDACHECK(cudaGetDeviceCount(&nvis));
+  if (nvis == 0) {
+    printf("No GPUs found\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  ncclDataType_t type;
+  ncclRedOp_t op;
+  int n_min;
+  int n_max;
+  int delta;
+  int gpus;
+  int* list = NULL;
+
+  if (argc < 5) {
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  type = strToType(argv[1]);
+  if (type == nccl_NUM_TYPES) {
+    printf("Invalid <type> '%s'\n", argv[1]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  op = strToOp(argv[2]);
+  if (op == nccl_NUM_OPS) {
+    printf("Invalid <op> '%s'\n", argv[2]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_min = strToPosInt(argv[3]);
+  if (n_min < 1) {
+    printf("Invalid <n_min> '%s'\n", argv[3]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_max = strToPosInt(argv[4]);
+  if (n_max < n_min) {
+    printf("Invalid <n_max> '%s'\n", argv[4]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  if (argc > 5) {
+    delta = strToPosInt(argv[5]);
+    if (delta < 1) {
+      printf("Invalid <delta> '%s'\n", argv[5]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+  }
+
+  if (argc > 6) {
+    gpus = strToPosInt(argv[6]);
+    if (gpus < 1) {
+      printf("Invalid <gpus> '%s'\n", argv[6]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    gpus = nvis;
+  }
+
+  list = (int*)malloc(gpus*sizeof(int));
+
+  if (argc > 7 && argc != 7+gpus) {
+    printf("If given, GPU list must be fully specified.\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    if(argc > 7) {
+      list[g] = strToNonNeg(argv[7+g]);
+      if (list[g] < 0) {
+        printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      } else if (list[g] >= nvis) {
+        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      }
+    } else {
+      list[g] = g % nvis;
+    }
+  }
+
+  size_t word = wordSize(type);
+  size_t max_size = n_max * word;
+  void* refout;
+  CUDACHECK(cudaMallocHost(&refout, max_size));
+
+  void **input, **output;
+  double** localError;
+  ncclComm_t* comm;
+  cudaStream_t* stream;
+
+  input = (void**)malloc(gpus*sizeof(void*));
+  output = (void**)malloc(gpus*sizeof(void*));
+  localError = (double**)malloc(gpus*sizeof(double*));
+  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+  for(int g=0; g<gpus; ++g) {
+    char busid[32] = {0};
+    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaMalloc(&input[g],  max_size));
+    CUDACHECK(cudaMalloc(&output[g], max_size));
+    CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
+    CUDACHECK(cudaStreamCreate(&stream[g]));
+    makeRandom(input[g], n_max, type, 42+g);
+
+    if (g == 0)
+      CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
+    else
+      accVec(refout, input[g], n_max, type, op);
+  }
+
+  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+  printf("       BYTES ERROR       MSEC  ALGBW  BUSBW\n");
+
+  for(int n=n_min; n<=n_max; n+=delta) {
+    size_t bytes = word * n;
+
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      NCCLCHECK(ncclAllReduce(input[g], output[g], n, type, op, comm[g], stream[g]));
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+        (stop - start).count() * 1000.0;
+
+    double max_error = 0.0;
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      maxDiff(localError[g], output[g], refout, n, type, stream[g]);
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+      max_error = max(max_error, *localError[g]);
+    }
+
+    double mb = (double)bytes * 1.e-6;
+    double algbw = mb / ms;
+    double busbw = algbw * (double)(2*gpus - 2) / (double)gpus;
+    printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
+        n*word, max_error, ms, algbw, busbw);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaStreamDestroy(stream[g]));
+    ncclCommDestroy(comm[g]);
+    CUDACHECK(cudaFree(input[g]));
+    CUDACHECK(cudaFree(output[g]));
+    CUDACHECK(cudaFreeHost(localError[g]));
+  }
+
+  free(localError);
+  free(output);
+  free(input);
+  free(comm);
+  free(stream);
+  CUDACHECK(cudaFreeHost(refout));
+  exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/broadcast_scan.cu b/test/single/broadcast_scan.cu
new file mode 100644
index 0000000..ea11c7d
--- /dev/null
+++ b/test/single/broadcast_scan.cu
@@ -0,0 +1,232 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+  printf("\n"
+         "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+         "Where:\n"
+#ifdef CUDA_HAS_HALF
+         "    type   =   [char|int|half|float|double|int64|uint64]\n"
+#else
+         "    type   =   [char|int|float|double|int64|uint64]\n"
+#endif
+         "    n_min  >   0\n"
+         "    n_max  >=  n_min\n"
+         "    delta  >   0\n\n", bin);
+  return;
+}
+
+int main(int argc, char* argv[]) {
+  int nvis = 0;
+  CUDACHECK(cudaGetDeviceCount(&nvis));
+  if (nvis == 0) {
+    printf("No GPUs found\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  ncclDataType_t type;
+  int n_min;
+  int n_max;
+  int delta;
+  int gpus;
+  int* list = NULL;
+
+  if (argc < 4) {
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  type = strToType(argv[1]);
+  if (type == nccl_NUM_TYPES) {
+    printf("Invalid <type> '%s'\n", argv[1]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_min = strToPosInt(argv[2]);
+  if (n_min < 1) {
+    printf("Invalid <n_min> '%s'\n", argv[2]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_max = strToPosInt(argv[3]);
+  if (n_max < n_min) {
+    printf("Invalid <n_max> '%s'\n", argv[3]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  if (argc > 4) {
+    delta = strToPosInt(argv[4]);
+    if (delta < 1) {
+      printf("Invalid <delta> '%s'\n", argv[4]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+  }
+
+  if (argc > 5) {
+    gpus = strToPosInt(argv[5]);
+    if (gpus < 1) {
+      printf("Invalid <gpus> '%s'\n", argv[5]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    gpus = nvis;
+  }
+
+  list = (int*)malloc(gpus*sizeof(int));
+
+  if (argc > 6 && argc != 6+gpus) {
+    printf("If given, GPU list must be fully specified.\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    if(argc > 6) {
+      list[g] = strToNonNeg(argv[6+g]);
+      if (list[g] < 0) {
+        printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      } else if (list[g] >= nvis) {
+        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      }
+    } else {
+      list[g] = g % nvis;
+    }
+  }
+
+  size_t word = wordSize(type);
+  size_t max_size = n_max * word;
+  void* refout;
+  CUDACHECK(cudaMallocHost(&refout, max_size));
+
+  void** io;
+  double* localError;
+  ncclComm_t* comm;
+  cudaStream_t* stream;
+
+  io = (void**)malloc(gpus*sizeof(void*));
+  CUDACHECK(cudaMallocHost(&localError, gpus*sizeof(double)));
+  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+  for(int g=0; g<gpus; ++g) {
+    char busid[32] = {0};
+    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaStreamCreate(&stream[g]));
+    CUDACHECK(cudaMalloc(&io[g], max_size));
+    if(g == 0) {
+      makeRandom(io[g], n_max, type, 42+g);
+      CUDACHECK(cudaMemcpy(refout, io[g], max_size, cudaMemcpyDeviceToHost));
+    }
+  }
+
+  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+  printf("       BYTES ERROR       MSEC     BW\n");
+
+  for(int n=n_min; n<=n_max; n+=delta) {
+    size_t bytes = word * n;
+
+    for(int g=1; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaMemsetAsync(io[g], 0, bytes, stream[g]));
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[0]));
+    }
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      NCCLCHECK(ncclBcast(io[g], n, type, 0, comm[g], stream[g]));
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+        (stop - start).count() * 1000.0;
+
+    for(int g=1; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      maxDiff(localError+g, io[g], refout, n, type, stream[g]);
+    }
+    double maxError = 0.0;
+    for(int g=1; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+      maxError = max(maxError, localError[g]);
+    }
+
+    double mb = (double)bytes * 1.e-6;
+    double algbw = mb / ms;
+    printf("%12lu %5.0le %10.3lf %6.2lf\n",
+        n*word, maxError, ms, algbw);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaStreamDestroy(stream[g]));
+    ncclCommDestroy(comm[g]);
+    CUDACHECK(cudaFree(io[g]));
+  }
+
+  free(io);
+  free(comm);
+  free(stream);
+  CUDACHECK(cudaFreeHost(refout));
+  CUDACHECK(cudaFreeHost(localError));
+  exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/reduce_scan.cu b/test/single/reduce_scan.cu
new file mode 100644
index 0000000..f42643e
--- /dev/null
+++ b/test/single/reduce_scan.cu
@@ -0,0 +1,238 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+  printf("\n"
+         "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+         "Where:\n"
+#ifdef CUDA_HAS_HALF
+         "    type   =   [char|int|half|float|double|int64|uint64]\n"
+#else
+         "    type   =   [char|int|float|double|int64|uint64]\n"
+#endif
+         "    op     =   [sum|prod|max|min]\n"
+         "    n_min  >   0\n"
+         "    n_max  >=  n_min\n"
+         "    delta  >   0\n\n", bin);
+  return;
+}
+
+int main(int argc, char* argv[]) {
+  int nvis = 0;
+  CUDACHECK(cudaGetDeviceCount(&nvis));
+  if (nvis == 0) {
+    printf("No GPUs found\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  ncclDataType_t type;
+  ncclRedOp_t op;
+  int n_min;
+  int n_max;
+  int delta;
+  int gpus;
+  int* list = NULL;
+
+  if (argc < 5) {
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  type = strToType(argv[1]);
+  if (type == nccl_NUM_TYPES) {
+    printf("Invalid <type> '%s'\n", argv[1]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  op = strToOp(argv[2]);
+  if (op == nccl_NUM_OPS) {
+    printf("Invalid <op> '%s'\n", argv[2]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_min = strToPosInt(argv[3]);
+  if (n_min < 1) {
+    printf("Invalid <n_min> '%s'\n", argv[3]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_max = strToPosInt(argv[4]);
+  if (n_max < n_min) {
+    printf("Invalid <n_max> '%s'\n", argv[4]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  if (argc > 5) {
+    delta = strToPosInt(argv[5]);
+    if (delta < 1) {
+      printf("Invalid <delta> '%s'\n", argv[5]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+  }
+
+  if (argc > 6) {
+    gpus = strToPosInt(argv[6]);
+    if (gpus < 1) {
+      printf("Invalid <gpus> '%s'\n", argv[6]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    gpus = nvis;
+  }
+
+  list = (int*)malloc(gpus*sizeof(int));
+
+  if (argc > 7 && argc != 7+gpus) {
+    printf("If given, GPU list must be fully specified.\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    if(argc > 7) {
+      list[g] = strToNonNeg(argv[7+g]);
+      if (list[g] < 0) {
+        printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      } else if (list[g] >= nvis) {
+        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      }
+    } else {
+      list[g] = g % nvis;
+    }
+  }
+
+  size_t word = wordSize(type);
+  size_t max_size = n_max * word;
+  void* refout;
+  CUDACHECK(cudaMallocHost(&refout, max_size));
+
+  void** input;
+  void* output; // always goes on rank 0
+  double* maxError;
+  ncclComm_t* comm;
+  cudaStream_t* stream;
+
+  input = (void**)malloc(gpus*sizeof(void*));
+  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+  for(int g=0; g<gpus; ++g) {
+    char busid[32] = {0};
+    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaStreamCreate(&stream[g]));
+    CUDACHECK(cudaMalloc(&input[g],  max_size));
+    makeRandom(input[g], n_max, type, 42+g);
+
+    if (g == 0) {
+      CUDACHECK(cudaMalloc(&output, max_size));
+      CUDACHECK(cudaMallocHost(&maxError, sizeof(double)));
+      CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
+    } else {
+      accVec(refout, input[g], n_max, type, op);
+    }
+  }
+
+  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+  printf("       BYTES ERROR       MSEC     BW\n");
+
+  for(int n=n_min; n<=n_max; n+=delta) {
+    size_t bytes = word * n;
+
+    CUDACHECK(cudaSetDevice(list[0]));
+    CUDACHECK(cudaMemsetAsync(output, 0, bytes, stream[0]));
+    for(int g=0; g<gpus; ++g)
+      CUDACHECK(cudaStreamSynchronize(stream[0]));
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      NCCLCHECK(ncclReduce(input[g], output, n, type, op, 0, comm[g], stream[g]));
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+        (stop - start).count() * 1000.0;
+
+    CUDACHECK(cudaSetDevice(list[0]));
+    maxDiff(maxError, output, refout, n, type, stream[0]);
+    CUDACHECK(cudaStreamSynchronize(stream[0]));
+
+    double mb = (double)bytes * 1.e-6;
+    double algbw = mb / ms;
+    printf("%12lu %5.0le %10.3lf %6.2lf\n",
+        n*word, *maxError, ms, algbw);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaStreamDestroy(stream[g]));
+    ncclCommDestroy(comm[g]);
+    CUDACHECK(cudaFree(input[g]));
+    if(g == 0) {
+      CUDACHECK(cudaFree(output));
+      CUDACHECK(cudaFreeHost(maxError));
+    }
+  }
+
+  free(input);
+  free(comm);
+  free(stream);
+  CUDACHECK(cudaFreeHost(refout));
+  exit(EXIT_SUCCESS);
+}
+
diff --git a/test/single/reduce_scatter_scan.cu b/test/single/reduce_scatter_scan.cu
new file mode 100644
index 0000000..8c37508
--- /dev/null
+++ b/test/single/reduce_scatter_scan.cu
@@ -0,0 +1,249 @@
+/*************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ************************************************************************/
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <float.h>
+
+#include "nccl.h"
+#include "test_utilities.h"
+#include <nvToolsExt.h>
+
+void showUsage(const char* bin) {
+  printf("\n"
+         "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
+         "Where:\n"
+#ifdef CUDA_HAS_HALF
+         "    type   =   [char|int|half|float|double|int64|uint64]\n"
+#else
+         "    type   =   [char|int|float|double|int64|uint64]\n"
+#endif
+         "    op     =   [sum|prod|max|min]\n"
+         "    n_min  >   0\n"
+         "    n_max  >=  n_min\n"
+         "    delta  >   0\n\n", bin);
+  return;
+}
+
+int main(int argc, char* argv[]) {
+  int nvis = 0;
+  CUDACHECK(cudaGetDeviceCount(&nvis));
+  if (nvis == 0) {
+    printf("No GPUs found\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  ncclDataType_t type;
+  ncclRedOp_t op;
+  int n_min;
+  int n_max;
+  int delta;
+  int gpus;
+  int* list = NULL;
+
+  if (argc < 5) {
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  type = strToType(argv[1]);
+  if (type == nccl_NUM_TYPES) {
+    printf("Invalid <type> '%s'\n", argv[1]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  op = strToOp(argv[2]);
+  if (op == nccl_NUM_OPS) {
+    printf("Invalid <op> '%s'\n", argv[2]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_min = strToPosInt(argv[3]);
+  if (n_min < 1) {
+    printf("Invalid <n_min> '%s'\n", argv[3]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  n_max = strToPosInt(argv[4]);
+  if (n_max < n_min) {
+    printf("Invalid <n_max> '%s'\n", argv[4]);
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  if (argc > 5) {
+    delta = strToPosInt(argv[5]);
+    if (delta < 1) {
+      printf("Invalid <delta> '%s'\n", argv[5]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
+  }
+
+  if (argc > 6) {
+    gpus = strToPosInt(argv[6]);
+    if (gpus < 1) {
+      printf("Invalid <gpus> '%s'\n", argv[6]);
+      showUsage(argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  } else {
+    gpus = nvis;
+  }
+
+  list = (int*)malloc(gpus*sizeof(int));
+
+  if (argc > 7 && argc != 7+gpus) {
+    printf("If given, GPU list must be fully specified.\n");
+    showUsage(argv[0]);
+    exit(EXIT_FAILURE);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    if(argc > 7) {
+      list[g] = strToNonNeg(argv[7+g]);
+      if (list[g] < 0) {
+        printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      } else if (list[g] >= nvis) {
+        printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
+        showUsage(argv[0]);
+        exit(EXIT_FAILURE);
+      }
+    } else {
+      list[g] = g % nvis;
+    }
+  }
+
+  size_t word = wordSize(type);
+  size_t max_output = n_max * word;
+  size_t max_input = gpus * max_output;
+  void* refout;
+  CUDACHECK(cudaMallocHost(&refout, max_input)); // contains entire reduction
+
+  void **input, **output;
+  double** localError;
+  ncclComm_t* comm;
+  cudaStream_t* stream;
+
+  input = (void**)malloc(gpus*sizeof(void*));
+  output = (void**)malloc(gpus*sizeof(void*));
+  localError = (double**)malloc(gpus*sizeof(double*));
+  comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
+  stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
+
+  for(int g=0; g<gpus; ++g) {
+    char busid[32] = {0};
+    CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
+    printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
+
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaMalloc(&input[g],  max_input));
+    CUDACHECK(cudaMalloc(&output[g], max_output));
+    CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
+    CUDACHECK(cudaStreamCreate(&stream[g]));
+    makeRandom(input[g], n_max*gpus, type, 42+g);
+
+    if (g == 0)
+      CUDACHECK(cudaMemcpy(refout, input[g], max_input, cudaMemcpyDeviceToHost));
+    else
+      accVec(refout, input[g], n_max*gpus, type, op);
+  }
+
+  NCCLCHECK(ncclCommInitAll(comm, gpus, list));
+
+  printf("       BYTES ERROR       MSEC  ALGBW  BUSBW\n");
+
+  for(int n=n_min; n<=n_max; n+=delta) {
+    size_t bytes = word * n;
+
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      NCCLCHECK(ncclReduceScatter(input[g], output[g], n, type, op, comm[g], stream[g]));
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+    }
+    auto stop = std::chrono::high_resolution_clock::now();
+    double ms = std::chrono::duration_cast<std::chrono::duration<double>>
+        (stop - start).count() * 1000.0;
+
+    double max_error = 0.0;
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      void* myRef = (void*)((char*)refout + g*bytes);
+      maxDiff(localError[g], output[g], myRef, n, type, stream[g]);
+    }
+    for(int g=0; g<gpus; ++g) {
+      CUDACHECK(cudaSetDevice(list[g]));
+      CUDACHECK(cudaStreamSynchronize(stream[g]));
+      max_error = max(max_error, *localError[g]);
+    }
+
+    double mb = (double)bytes * 1.e-6;
+    double algbw = mb / ms;
+    double busbw = algbw * (double)(gpus - 1);
+    printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
+        n*word, max_error, ms, algbw, busbw);
+  }
+
+  for(int g=0; g<gpus; ++g) {
+    CUDACHECK(cudaSetDevice(list[g]));
+    CUDACHECK(cudaStreamDestroy(stream[g]));
+    ncclCommDestroy(comm[g]);
+    CUDACHECK(cudaFree(input[g]));
+    CUDACHECK(cudaFree(output[g]));
+    CUDACHECK(cudaFreeHost(localError[g]));
+  }
+
+  free(localError);
+  free(output);
+  free(input);
+  free(comm);
+  free(stream);
+  CUDACHECK(cudaFreeHost(refout));
+  exit(EXIT_SUCCESS);
+}
+
author	Sylvain Jeaugey <sjeaugey@nvidia.com>	2016-09-20 21:52:15 +0300
committer	Sylvain Jeaugey <sjeaugey@nvidia.com>	2016-09-22 21:58:33 +0300
commit	ca330b110ae76ace344182ab83a028911111cc36 (patch)
tree	2225d406a3eca4ec735213e1b86e4ecfa2eafd22
parent	6c77476cc1aeb38a7666550813baaedde3dd32d1 (diff)