author     Nathan Luehr <nluehr@nvidia.com>       2016-01-22 03:30:05 +0300
committer  Przemek Tredak <ptredak@nvidia.com>    2016-01-29 22:09:05 +0300
commit     27583533805dde8fd5e4514d25531b8347fd985b (patch)
tree       1adb7db3bc2efdd11f85a7515f5b8c9b1e8d6cf7 /test
parent     fe1a9567155c8965425f884c552496ebd960ff42 (diff)
Added NCCL error checking to tests.
Also cleaned up makefile so that tests and lib are not built unnecessarily.
Change-Id: Ia0c596cc2213628de2f066be97615c09bb1bb262
Reviewed-on: http://git-master/r/999627
Reviewed-by: Przemek Tredak <ptredak@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
Diffstat (limited to 'test')
-rw-r--r--  test/include/test_utilities.h       25
-rw-r--r--  test/mpi/mpi_test.cu                18
-rw-r--r--  test/single/all_gather_test.cu      20
-rw-r--r--  test/single/all_reduce_test.cu      20
-rw-r--r--  test/single/broadcast_test.cu       16
-rw-r--r--  test/single/reduce_scatter_test.cu  24
-rw-r--r--  test/single/reduce_test.cu          22
7 files changed, 73 insertions, 72 deletions
diff --git a/test/include/test_utilities.h b/test/include/test_utilities.h
index c929a9e..fb34d19 100644
--- a/test/include/test_utilities.h
+++ b/test/include/test_utilities.h
@@ -32,14 +32,23 @@
 #include <curand.h>
 
-#define CUDACHECK(cmd) do {                         \
-  cudaError_t e = cmd;                              \
-  if( e != cudaSuccess ) {                          \
-    printf("Cuda failure %s:%d '%s'\n",             \
-        __FILE__,__LINE__,cudaGetErrorString(e));   \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(false)
+#define CUDACHECK(cmd) do {                         \
+  cudaError_t e = cmd;                              \
+  if( e != cudaSuccess ) {                          \
+    printf("Cuda failure %s:%d '%s'\n",             \
+        __FILE__,__LINE__,cudaGetErrorString(e));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define NCCLCHECK(cmd) do {                         \
+  ncclResult_t r = cmd;                             \
+  if (r!= ncclSuccess) {                            \
+    printf("NCCL failure %s:%d '%s'\n",             \
+        __FILE__,__LINE__,ncclGetErrorString(r));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
 
 template<typename T>
 void Randomize(T* const dest, const int N, const int randomSeed);
diff --git a/test/mpi/mpi_test.cu b/test/mpi/mpi_test.cu
index 87465e5..54ebbce 100644
--- a/test/mpi/mpi_test.cu
+++ b/test/mpi/mpi_test.cu
@@ -32,15 +32,7 @@
 #include "nccl.h"
 #include "mpi.h"
-
-#define CUDACHECK(cmd) do {                         \
-  cudaError_t e = cmd;                              \
-  if( e != cudaSuccess ) {                          \
-    printf("Cuda failure %s:%d '%s'\n",             \
-        __FILE__,__LINE__,cudaGetErrorString(e));   \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(false)
+#include "test_utilities.h"
 
 #define SIZE 128
 #define NITERS 1
@@ -48,7 +40,7 @@ int main(int argc, char *argv[]) {
   ncclUniqueId commId;
   int size, rank;
-  int ret;
+  ncclResult_t ret;
 
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &size);
@@ -66,11 +58,11 @@ int main(int argc, char *argv[]) {
 
   // NCCL Communicator creation
   ncclComm_t comm;
-  ncclGetUniqueId(&commId);
+  NCCLCHECK(ncclGetUniqueId(&commId));
   MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD);
   ret = ncclCommInitRank(&comm, size, commId, rank);
   if (ret != ncclSuccess) {
-    printf("NCCL Init failed : %d\n", ret);
+    printf("NCCL Init failed (%d) '%s'\n", ret, ncclGetErrorString(ret));
     exit(1);
   }
@@ -93,7 +85,7 @@ int main(int argc, char *argv[]) {
   // Run allreduce
   int errors = 0;
   for (int i=0; i<NITERS; i++) {
-    ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream);
+    NCCLCHECK(ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream));
   }
 
   // Check results
diff --git a/test/single/all_gather_test.cu b/test/single/all_gather_test.cu
index a9e1c1e..0925b7e 100644
--- a/test/single/all_gather_test.cu
+++ b/test/single/all_gather_test.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
   T* buffer = (T*)malloc(nDev * N * sizeof(T));
   T* result = (T*)malloc(nDev * N * sizeof(T));
@@ -61,8 +61,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
-        (void*)recvbuff[i], comms[i], s[i]);
+    NCCLCHECK(ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
+        (void*)recvbuff[i], comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -79,8 +79,8 @@
 
     for (int i = 0; i < nDev; ++i) {
       CUDACHECK(cudaSetDevice(dList[i]));
-      ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
-          s[i]);
+      NCCLCHECK(ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
+          s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -121,7 +121,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
@@ -199,15 +199,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g=0; g<nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
diff --git a/test/single/all_reduce_test.cu b/test/single/all_reduce_test.cu
index a2fcb3d..2e14335 100644
--- a/test/single/all_reduce_test.cu
+++ b/test/single/all_reduce_test.cu
@@ -48,7 +48,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   memset(result, 0, N * sizeof(T));
 
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   for (int i = 0; i < nDev; ++i) {
@@ -66,7 +66,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]);
+    NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -87,8 +87,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
     for (int i = 0; i < nDev; ++i) {
       CUDACHECK(cudaSetDevice(dList[i]));
-      ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-          comms[i], s[i]);
+      NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
+          comms[i], s[i]));
     }
     //}
@@ -130,8 +130,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
     for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(dList[i]));
-      ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-          comms[i], s[i]);
+      NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
+          comms[i], s[i]));
     }
     //}
@@ -176,7 +176,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
@@ -256,7 +256,7 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   if (!csv) {
     printf("# Using devices\n");
@@ -264,8 +264,8 @@ int main(int argc, char* argv[]) {
       int cudaDev;
       int rank;
       cudaDeviceProp prop;
-      ncclCommCuDevice(comms[g], &cudaDev);
-      ncclCommUserRank(comms[g], &rank);
+      NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+      NCCLCHECK(ncclCommUserRank(comms[g], &rank));
       CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
       printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
           prop.pciBusID, prop.name);
diff --git a/test/single/broadcast_test.cu b/test/single/broadcast_test.cu
index 9c85a1f..9801d04 100644
--- a/test/single/broadcast_test.cu
+++ b/test/single/broadcast_test.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
     ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
   T* buffer = (T*)malloc(N * sizeof(T));
   T* result = (T*)malloc(N * sizeof(T));
@@ -65,7 +65,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]);
+    NCCLCHECK(ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -83,7 +83,7 @@ void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
     for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(dList[i]));
-      ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]);
+      NCCLCHECK(ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -123,7 +123,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** buff = (T**)malloc(nDev * sizeof(T*));
 
   for (int i = 0; i < nDev; ++i) {
@@ -199,15 +199,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);;
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g = 0; g < nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
diff --git a/test/single/reduce_scatter_test.cu b/test/single/reduce_scatter_test.cu
index da205d5..4fc3292 100644
--- a/test/single/reduce_scatter_test.cu
+++ b/test/single/reduce_scatter_test.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,7 +41,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     const ncclRedOp_t op, ncclComm_t* const comms, const std::vector<int>& dList) {
   // initialize data
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
   T* buffer = (T*)malloc(N * nDev * sizeof(T));
@@ -66,8 +66,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
-        std::min(N, 1024 * 1024), type, op, comms[i], s[i]);
+    NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
+        std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -86,8 +86,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(dList[i]));
-      ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
-          op, comms[i], s[i]);
+      NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
+          op, comms[i], s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -122,8 +122,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(dList[i]));
-      ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
-          op, comms[i], s[i]);
+      NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
+          op, comms[i], s[i]));
     }
 
     for (int i = 0; i < nDev; ++i) {
@@ -163,7 +163,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
@@ -243,15 +243,15 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   printf("# Using devices\n");
   for (int g = 0; g < nDev; ++g) {
     int cudaDev;
     int rank;
     cudaDeviceProp prop;
-    ncclCommCuDevice(comms[g], &cudaDev);
-    ncclCommUserRank(comms[g], &rank);
+    NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+    NCCLCHECK(ncclCommUserRank(comms[g], &rank));
     CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
         prop.pciBusID, prop.name);
diff --git a/test/single/reduce_test.cu b/test/single/reduce_test.cu
index 42b1e9b..9500c18 100644
--- a/test/single/reduce_test.cu
+++ b/test/single/reduce_test.cu
@@ -50,7 +50,7 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   memset(result, 0, N * sizeof(T));
 
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
   for (int i = 0; i < nDev; ++i) {
@@ -68,8 +68,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
   // warm up GPU
   for (int i = 0; i < nDev; ++i) {
     CUDACHECK(cudaSetDevice(dList[i]));
-    ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
-        type, op, root, comms[i], s[i]);
+    NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
+        type, op, root, comms[i], s[i]));
   }
 
   for (int i = 0; i < nDev; ++i) {
@@ -90,8 +90,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
     for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(dList[i]));
-      ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
-          root, comms[i], s[i]);
+      NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
+          root, comms[i], s[i]));
     }
     //}
@@ -129,8 +129,8 @@ void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
     //for (int i=0; i<100; i++) {
     for (int i = 0; i < nDev; ++i) {
      CUDACHECK(cudaSetDevice(dList[i]));
-      ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
-          root, comms[i], s[i]);
+      NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
+          root, comms[i], s[i]));
     }
     //}
@@ -171,7 +171,7 @@ template<typename T>
 void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
     const std::vector<int>& dList) {
   int nDev = 0;
-  ncclCommCount(comms[0], &nDev);
+  NCCLCHECK(ncclCommCount(comms[0], &nDev));
   T** sendbuff = (T**)malloc(nDev * sizeof(T*));
   T** recvbuff = (T**)malloc(nDev * sizeof(T*));
@@ -253,7 +253,7 @@ int main(int argc, char* argv[]) {
   }
 
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
-  ncclCommInitAll(comms, nDev, dList.data());
+  NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
 
   if (!csv) {
     printf("# Using devices\n");
@@ -261,8 +261,8 @@ int main(int argc, char* argv[]) {
      int cudaDev;
      int rank;
      cudaDeviceProp prop;
-      ncclCommCuDevice(comms[g], &cudaDev);
-      ncclCommUserRank(comms[g], &rank);
+      NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
+      NCCLCHECK(ncclCommUserRank(comms[g], &rank));
       CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
       printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
           prop.pciBusID, prop.name);
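
For context, the pattern this commit applies throughout the tests is to wrap every NCCL call in the new NCCLCHECK macro from test/include/test_utilities.h, exactly as CUDA runtime calls were already wrapped in CUDACHECK. Below is a minimal standalone sketch of the resulting style; the single-device communicator and the buffer size N are illustrative assumptions, not taken from the tests themselves.

// Sketch of the error-checking style introduced by this commit.
// CUDACHECK and NCCLCHECK come from test/include/test_utilities.h;
// the one-GPU setup and N = 1024 are illustrative assumptions.
#include <cuda_runtime.h>
#include "nccl.h"
#include "test_utilities.h"

int main() {
  const int N = 1024;
  int dev = 0;
  ncclComm_t comm;
  float *sendbuff, *recvbuff;
  cudaStream_t stream;

  // A failed call now prints file:line plus the NCCL error string and
  // exits, instead of the ncclResult_t being silently discarded.
  NCCLCHECK(ncclCommInitAll(&comm, 1, &dev));

  CUDACHECK(cudaSetDevice(dev));
  CUDACHECK(cudaMalloc((void**)&sendbuff, N * sizeof(float)));
  CUDACHECK(cudaMalloc((void**)&recvbuff, N * sizeof(float)));
  CUDACHECK(cudaStreamCreate(&stream));

  NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, N,
      ncclFloat, ncclSum, comm, stream));
  CUDACHECK(cudaStreamSynchronize(stream));

  CUDACHECK(cudaFree(sendbuff));
  CUDACHECK(cudaFree(recvbuff));
  ncclCommDestroy(comm);
  return 0;
}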