diff options
Diffstat (limited to 'test/single/all_reduce_test.cu')
-rw-r--r-- | test/single/all_reduce_test.cu | 301 |
1 files changed, 0 insertions, 301 deletions
diff --git a/test/single/all_reduce_test.cu b/test/single/all_reduce_test.cu deleted file mode 100644 index 1935a38..0000000 --- a/test/single/all_reduce_test.cu +++ /dev/null @@ -1,301 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. - * - * See LICENCE.txt for license information - ************************************************************************/ - -#include <chrono> -#include <cstdio> -#include <cstdlib> -#include <string> -#include <vector> - -#include "nccl.h" -#include "test_utilities.h" -#include <nvToolsExt.h> - -int csv = false; -int errors = 0; -double avg_bw = 0.0; -int avg_count = 0; -bool is_reduction = true; - -template<typename T> -void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type, - const ncclRedOp_t op, ncclComm_t* comms, const std::vector<int>& dList) { - // initialize data - T* buffer = (T*)malloc(N * sizeof(T)); - T* result = (T*)malloc(N * sizeof(T)); - memset(buffer, 0, N * sizeof(T)); - memset(result, 0, N * sizeof(T)); - - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamCreate(s+i)); - CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T))); - Randomize(sendbuff[i], N, i); - if(i == 0) { - CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost)); - } else { - Accumulate<T>(result, sendbuff[i], N, op); - } - } - - // warm up GPU - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i])); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - -// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1) - { - int n = N; - printf((csv) ? "%i,%i,%s,%s," : "%12i %12i %6s %6s", - (int) (n * sizeof(T)), n, TypeName(type).c_str(), - OperationName(op).c_str()); - - // do out-of-place reduction first - nvtxRangePushA("out of place"); - auto start = std::chrono::high_resolution_clock::now(); - //for (int i=0; i<100; i++) { - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op, - comms[i], s[i])); - } - //} - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - nvtxRangePop(); - - nvtxRangePushA("out of place bookkeeping"); - double elapsedSec = - std::chrono::duration_cast<std::chrono::duration<double>>( - stop - start).count(); // / 100.0; - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev; - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta<T>(recvbuff[i], result, N); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le", - elapsedSec * 1.0E3, algbw, busbw, maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - nvtxRangePop(); - } - - -// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1) - { - int n = N; - // now do in-place reduction - nvtxRangePushA("in place"); - auto start = std::chrono::high_resolution_clock::now(); - //for (int i=0; i<100; i++) { - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op, - comms[i], s[i])); - } - //} - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamSynchronize(s[i])); - } - - auto stop = std::chrono::high_resolution_clock::now(); - nvtxRangePop(); - - nvtxRangePushA("in place bookkeeping"); - double elapsedSec = - std::chrono::duration_cast<std::chrono::duration<double>>( - stop - start).count(); // / 100.0; - double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec; - double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev; - - double maxDelta = 0.0; - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - double tmpDelta = CheckDelta<T>(sendbuff[i], result, N); - maxDelta = std::max(tmpDelta, maxDelta); - } - - printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le\n", - elapsedSec * 1.0E3, algbw, busbw, maxDelta); - - if (maxDelta > deltaMaxValue(type, is_reduction)) errors++; - avg_bw += busbw; - avg_count++; - - nvtxRangePop(); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaStreamDestroy(s[i])); - } - free(s); - free(buffer); - free(result); -} - -template<typename T> -void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms, - const std::vector<int>& dList) { - int nDev = 0; - NCCLCHECK(ncclCommCount(comms[0], &nDev)); - T** sendbuff = (T**)malloc(nDev * sizeof(T*)); - T** recvbuff = (T**)malloc(nDev * sizeof(T*)); - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T))); - CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T))); - } - - for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) { -// for (ncclRedOp_t op : { ncclSum }) { - RunTest<T>(sendbuff, recvbuff, N, type, op, comms, dList); - } - - for (int i = 0; i < nDev; ++i) { - CUDACHECK(cudaSetDevice(dList[i])); - CUDACHECK(cudaFree(sendbuff[i])); - CUDACHECK(cudaFree(recvbuff[i])); - } - - free(sendbuff); - free(recvbuff); -} - -void usage() { - printf("Tests nccl AllReduce with user supplied arguments.\n" - " Usage: all_reduce_test <data size in bytes> [number of GPUs] " - "[GPU 0] [GPU 1] ...\n\n"); -} - -int main(int argc, char* argv[]) { - int nVis = 0; - CUDACHECK(cudaGetDeviceCount(&nVis)); - - int N = 0; - if (argc > 1) { - int t = sscanf(argv[1], "%d", &N); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } else { - printf("Error: must specify at least data size in bytes!\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - int nDev = nVis; - if (argc > 2) { - int t = sscanf(argv[2], "%d", &nDev); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[1]); - usage(); - exit(EXIT_FAILURE); - } - } - std::vector<int> dList(nDev); - for (int i = 0; i < nDev; ++i) - dList[i] = i % nVis; - - if (argc > 3) { - if (argc - 3 != nDev) { - printf("Error: insufficient number of GPUs in list\n\n"); - usage(); - exit(EXIT_FAILURE); - } - - for (int i = 0; i < nDev; ++i) { - int t = sscanf(argv[3 + i], "%d", dList.data() + i); - if (t == 0) { - printf("Error: %s is not an integer!\n\n", argv[2 + i]); - usage(); - exit(EXIT_FAILURE); - } - } - } - - ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev); - NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data())); - - if (!csv) { - printf("# Using devices\n"); - for (int g = 0; g < nDev; ++g) { - int cudaDev; - int rank; - cudaDeviceProp prop; - NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev)); - NCCLCHECK(ncclCommUserRank(comms[g], &rank)); - CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); - printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev, - prop.pciBusID, prop.name); - } - printf("\n"); - - printf("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", ""); - printf("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op", - "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res"); - } - else { - printf("B,N,type,op,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n"); - } - - RunTests<char>(N / sizeof(char), ncclChar, comms, dList); - RunTests<int>(N / sizeof(int), ncclInt, comms, dList); -#ifdef CUDA_HAS_HALF - RunTests<half>(N / sizeof(half), ncclHalf, comms, dList); -#endif - RunTests<float>(N / sizeof(float), ncclFloat, comms, dList); - RunTests<double>(N / sizeof(double), ncclDouble, comms, dList); - RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList); - RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList); - - printf("\n"); - - for(int i=0; i<nDev; ++i) - ncclCommDestroy(comms[i]); - free(comms); - - char* str = getenv("NCCL_TESTS_MIN_BW"); - double check_avg_bw = str ? atof(str) : -1; - avg_bw /= avg_count; - - printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK"); - printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK")); - printf("\n"); - if (errors || avg_bw < check_avg_bw) - exit(EXIT_FAILURE); - else - exit(EXIT_SUCCESS); -} - |