Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'test/single/all_reduce_test.cu')
-rw-r--r--test/single/all_reduce_test.cu301
1 files changed, 0 insertions, 301 deletions
diff --git a/test/single/all_reduce_test.cu b/test/single/all_reduce_test.cu
deleted file mode 100644
index 1935a38..0000000
--- a/test/single/all_reduce_test.cu
+++ /dev/null
@@ -1,301 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-int csv = false;
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = true;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
- const ncclRedOp_t op, ncclComm_t* comms, const std::vector<int>& dList) {
- // initialize data
- T* buffer = (T*)malloc(N * sizeof(T));
- T* result = (T*)malloc(N * sizeof(T));
- memset(buffer, 0, N * sizeof(T));
- memset(result, 0, N * sizeof(T));
-
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamCreate(s+i));
- CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
- Randomize(sendbuff[i], N, i);
- if(i == 0) {
- CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost));
- } else {
- Accumulate<T>(result, sendbuff[i], N, op);
- }
- }
-
- // warm up GPU
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
-// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
- {
- int n = N;
- printf((csv) ? "%i,%i,%s,%s," : "%12i %12i %6s %6s",
- (int) (n * sizeof(T)), n, TypeName(type).c_str(),
- OperationName(op).c_str());
-
- // do out-of-place reduction first
- nvtxRangePushA("out of place");
- auto start = std::chrono::high_resolution_clock::now();
- //for (int i=0; i<100; i++) {
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
- comms[i], s[i]));
- }
- //}
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
- nvtxRangePop();
-
- nvtxRangePushA("out of place bookkeeping");
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count(); // / 100.0;
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev;
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(recvbuff[i], result, N);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le",
- elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- nvtxRangePop();
- }
-
-
-// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
- {
- int n = N;
- // now do in-place reduction
- nvtxRangePushA("in place");
- auto start = std::chrono::high_resolution_clock::now();
- //for (int i=0; i<100; i++) {
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
- comms[i], s[i]));
- }
- //}
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
- nvtxRangePop();
-
- nvtxRangePushA("in place bookkeeping");
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count(); // / 100.0;
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev;
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(sendbuff[i], result, N);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le\n",
- elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- nvtxRangePop();
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamDestroy(s[i]));
- }
- free(s);
- free(buffer);
- free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms,
- const std::vector<int>& dList) {
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- T** sendbuff = (T**)malloc(nDev * sizeof(T*));
- T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T)));
- CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
- }
-
- for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
-// for (ncclRedOp_t op : { ncclSum }) {
- RunTest<T>(sendbuff, recvbuff, N, type, op, comms, dList);
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaFree(sendbuff[i]));
- CUDACHECK(cudaFree(recvbuff[i]));
- }
-
- free(sendbuff);
- free(recvbuff);
-}
-
-void usage() {
- printf("Tests nccl AllReduce with user supplied arguments.\n"
- " Usage: all_reduce_test <data size in bytes> [number of GPUs] "
- "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
- int nVis = 0;
- CUDACHECK(cudaGetDeviceCount(&nVis));
-
- int N = 0;
- if (argc > 1) {
- int t = sscanf(argv[1], "%d", &N);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- } else {
- printf("Error: must specify at least data size in bytes!\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- int nDev = nVis;
- if (argc > 2) {
- int t = sscanf(argv[2], "%d", &nDev);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- std::vector<int> dList(nDev);
- for (int i = 0; i < nDev; ++i)
- dList[i] = i % nVis;
-
- if (argc > 3) {
- if (argc - 3 != nDev) {
- printf("Error: insufficient number of GPUs in list\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < nDev; ++i) {
- int t = sscanf(argv[3 + i], "%d", dList.data() + i);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[2 + i]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- }
-
- ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
- NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
- if (!csv) {
- printf("# Using devices\n");
- for (int g = 0; g < nDev; ++g) {
- int cudaDev;
- int rank;
- cudaDeviceProp prop;
- NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
- NCCLCHECK(ncclCommUserRank(comms[g], &rank));
- CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
- printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
- prop.pciBusID, prop.name);
- }
- printf("\n");
-
- printf("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", "");
- printf("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op",
- "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
- }
- else {
- printf("B,N,type,op,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n");
- }
-
- RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
- RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
- RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
- RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
- RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
- RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
- RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
- printf("\n");
-
- for(int i=0; i<nDev; ++i)
- ncclCommDestroy(comms[i]);
- free(comms);
-
- char* str = getenv("NCCL_TESTS_MIN_BW");
- double check_avg_bw = str ? atof(str) : -1;
- avg_bw /= avg_count;
-
- printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
- printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
- printf("\n");
- if (errors || avg_bw < check_avg_bw)
- exit(EXIT_FAILURE);
- else
- exit(EXIT_SUCCESS);
-}
-