diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2016-09-20 21:52:15 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2016-09-22 21:58:33 +0300 |
commit | ca330b110ae76ace344182ab83a028911111cc36 (patch) | |
tree | 2225d406a3eca4ec735213e1b86e4ecfa2eafd22 /test | |
parent | 6c77476cc1aeb38a7666550813baaedde3dd32d1 (diff) |
Add scan testsv1.3.0-1
Diffstat (limited to 'test')
-rw-r--r-- | test/single/all_gather_scan.cu | 239 | ||||
-rw-r--r-- | test/single/all_reduce_scan.cu | 247 | ||||
-rw-r--r-- | test/single/broadcast_scan.cu | 232 | ||||
-rw-r--r-- | test/single/reduce_scan.cu | 238 | ||||
-rw-r--r-- | test/single/reduce_scatter_scan.cu | 249 |
5 files changed, 1205 insertions, 0 deletions
diff --git a/test/single/all_gather_scan.cu b/test/single/all_gather_scan.cu new file mode 100644 index 0000000..becf315 --- /dev/null +++ b/test/single/all_gather_scan.cu @@ -0,0 +1,239 @@ +/************************************************************************* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ************************************************************************/ + +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <string> +#include <float.h> + +#include "nccl.h" +#include "test_utilities.h" +#include <nvToolsExt.h> + +void showUsage(const char* bin) { + printf("\n" + "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n" + "Where:\n" +#ifdef CUDA_HAS_HALF + " type = [char|int|half|float|double|int64|uint64]\n" +#else + " type = [char|int|float|double|int64|uint64]\n" +#endif + " n_min > 0\n" + " n_max >= n_min\n" + " delta > 0\n\n", bin); + return; +} + +int main(int argc, char* argv[]) { + int nvis = 0; + CUDACHECK(cudaGetDeviceCount(&nvis)); + if (nvis == 0) { + printf("No GPUs found\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + ncclDataType_t type; + int n_min; + int n_max; + int delta; + int gpus; + int* list = NULL; + + if (argc < 4) { + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + type = strToType(argv[1]); + if (type == nccl_NUM_TYPES) { + printf("Invalid <type> '%s'\n", argv[1]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_min = strToPosInt(argv[2]); + if (n_min < 1) { + printf("Invalid <n_min> '%s'\n", argv[2]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_max = strToPosInt(argv[3]); + if (n_max < n_min) { + printf("Invalid <n_max> '%s'\n", argv[3]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + if (argc > 4) { + delta = strToPosInt(argv[4]); + if (delta < 1) { + printf("Invalid <delta> '%s'\n", argv[4]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; + } + + if (argc > 5) { + gpus = strToPosInt(argv[5]); + if (gpus < 1) { + printf("Invalid <gpus> '%s'\n", argv[5]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + gpus = nvis; + } + + list = (int*)malloc(gpus*sizeof(int)); + + if (argc > 6 && argc != 6+gpus) { + printf("If given, GPU list must be fully specified.\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + for(int g=0; g<gpus; ++g) { + if(argc > 6) { + list[g] = strToNonNeg(argv[6+g]); + if (list[g] < 0) { + printf("Invalid GPU%d '%s'\n", g, argv[6+g]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } else if (list[g] >= nvis) { + printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + list[g] = g % nvis; + } + } + + size_t word = wordSize(type); + size_t max_input = n_max * word; + size_t max_output = max_input * gpus; + void* refout; + CUDACHECK(cudaMallocHost(&refout, max_output)); + + void **input, **output; + double** localError; + ncclComm_t* comm; + cudaStream_t* stream; + + input = (void**)malloc(gpus*sizeof(void*)); + output = (void**)malloc(gpus*sizeof(void*)); + localError = (double**)malloc(gpus*sizeof(double*)); + comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); + stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); + + for(int g=0; g<gpus; ++g) { + char busid[32] = {0}; + CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g])); + printf("# Rank %d using device %d [%s]\n", g, list[g], busid); + + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaMalloc(&input[g], max_input)); + CUDACHECK(cudaMalloc(&output[g], max_output)); + CUDACHECK(cudaMallocHost(&localError[g], sizeof(double))); + CUDACHECK(cudaStreamCreate(&stream[g])); + makeRandom(input[g], n_max, type, 42+g); + + CUDACHECK(cudaMemcpy((char*)refout+max_input*g, input[g], max_input, cudaMemcpyDeviceToHost)); + } + + NCCLCHECK(ncclCommInitAll(comm, gpus, list)); + + printf(" BYTES ERROR MSEC BW\n"); + + for(int n=n_min; n<=n_max; n+=delta) { + size_t out_bytes = word * n * gpus; + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaMemsetAsync(output[g], 0, out_bytes, stream[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + + auto start = std::chrono::high_resolution_clock::now(); + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + NCCLCHECK(ncclAllGather(input[g], n, type, output[g], comm[g], stream[g])); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + auto stop = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration_cast<std::chrono::duration<double>> + (stop - start).count() * 1000.0; + + double max_error = 0.0; + for(int slice=0; slice<gpus; ++slice) { + void* refSlice = (void*)((char*)refout + slice*max_input); + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + void* mySlice = (void*)((char*)output[g] + slice*n*word); + maxDiff(localError[g], mySlice, refSlice, n, type, stream[g]); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + max_error = max(max_error, *localError[g]); + } + } + + double mb = (double)(n*word * (gpus-1)) * 1.e-6; + double algbw = mb / ms; + printf("%12lu %5.0le %10.3lf %6.2lf\n", + n*word, max_error, ms, algbw); + } + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamDestroy(stream[g])); + ncclCommDestroy(comm[g]); + CUDACHECK(cudaFree(input[g])); + CUDACHECK(cudaFree(output[g])); + CUDACHECK(cudaFreeHost(localError[g])); + } + + free(localError); + free(output); + free(input); + free(comm); + free(stream); + CUDACHECK(cudaFreeHost(refout)); + exit(EXIT_SUCCESS); +} + diff --git a/test/single/all_reduce_scan.cu b/test/single/all_reduce_scan.cu new file mode 100644 index 0000000..f93a099 --- /dev/null +++ b/test/single/all_reduce_scan.cu @@ -0,0 +1,247 @@ +/************************************************************************* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ************************************************************************/ + +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <string> +#include <float.h> + +#include "nccl.h" +#include "test_utilities.h" +#include <nvToolsExt.h> + +void showUsage(const char* bin) { + printf("\n" + "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n" + "Where:\n" +#ifdef CUDA_HAS_HALF + " type = [char|int|half|float|double|int64|uint64]\n" +#else + " type = [char|int|float|double|int64|uint64]\n" +#endif + " op = [sum|prod|max|min]\n" + " n_min > 0\n" + " n_max >= n_min\n" + " delta > 0\n\n", bin); + return; +} + +int main(int argc, char* argv[]) { + int nvis = 0; + CUDACHECK(cudaGetDeviceCount(&nvis)); + if (nvis == 0) { + printf("No GPUs found\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + ncclDataType_t type; + ncclRedOp_t op; + int n_min; + int n_max; + int delta; + int gpus; + int* list = NULL; + + if (argc < 5) { + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + type = strToType(argv[1]); + if (type == nccl_NUM_TYPES) { + printf("Invalid <type> '%s'\n", argv[1]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + op = strToOp(argv[2]); + if (op == nccl_NUM_OPS) { + printf("Invalid <op> '%s'\n", argv[2]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_min = strToPosInt(argv[3]); + if (n_min < 1) { + printf("Invalid <n_min> '%s'\n", argv[3]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_max = strToPosInt(argv[4]); + if (n_max < n_min) { + printf("Invalid <n_max> '%s'\n", argv[4]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + if (argc > 5) { + delta = strToPosInt(argv[5]); + if (delta < 1) { + printf("Invalid <delta> '%s'\n", argv[5]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; + } + + if (argc > 6) { + gpus = strToPosInt(argv[6]); + if (gpus < 1) { + printf("Invalid <gpus> '%s'\n", argv[6]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + gpus = nvis; + } + + list = (int*)malloc(gpus*sizeof(int)); + + if (argc > 7 && argc != 7+gpus) { + printf("If given, GPU list must be fully specified.\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + for(int g=0; g<gpus; ++g) { + if(argc > 7) { + list[g] = strToNonNeg(argv[7+g]); + if (list[g] < 0) { + printf("Invalid GPU%d '%s'\n", g, argv[7+g]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } else if (list[g] >= nvis) { + printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + list[g] = g % nvis; + } + } + + size_t word = wordSize(type); + size_t max_size = n_max * word; + void* refout; + CUDACHECK(cudaMallocHost(&refout, max_size)); + + void **input, **output; + double** localError; + ncclComm_t* comm; + cudaStream_t* stream; + + input = (void**)malloc(gpus*sizeof(void*)); + output = (void**)malloc(gpus*sizeof(void*)); + localError = (double**)malloc(gpus*sizeof(double*)); + comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); + stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); + + for(int g=0; g<gpus; ++g) { + char busid[32] = {0}; + CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g])); + printf("# Rank %d using device %d [%s]\n", g, list[g], busid); + + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaMalloc(&input[g], max_size)); + CUDACHECK(cudaMalloc(&output[g], max_size)); + CUDACHECK(cudaMallocHost(&localError[g], sizeof(double))); + CUDACHECK(cudaStreamCreate(&stream[g])); + makeRandom(input[g], n_max, type, 42+g); + + if (g == 0) + CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost)); + else + accVec(refout, input[g], n_max, type, op); + } + + NCCLCHECK(ncclCommInitAll(comm, gpus, list)); + + printf(" BYTES ERROR MSEC ALGBW BUSBW\n"); + + for(int n=n_min; n<=n_max; n+=delta) { + size_t bytes = word * n; + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + + auto start = std::chrono::high_resolution_clock::now(); + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + NCCLCHECK(ncclAllReduce(input[g], output[g], n, type, op, comm[g], stream[g])); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + auto stop = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration_cast<std::chrono::duration<double>> + (stop - start).count() * 1000.0; + + double max_error = 0.0; + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + maxDiff(localError[g], output[g], refout, n, type, stream[g]); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + max_error = max(max_error, *localError[g]); + } + + double mb = (double)bytes * 1.e-6; + double algbw = mb / ms; + double busbw = algbw * (double)(2*gpus - 2) / (double)gpus; + printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n", + n*word, max_error, ms, algbw, busbw); + } + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamDestroy(stream[g])); + ncclCommDestroy(comm[g]); + CUDACHECK(cudaFree(input[g])); + CUDACHECK(cudaFree(output[g])); + CUDACHECK(cudaFreeHost(localError[g])); + } + + free(localError); + free(output); + free(input); + free(comm); + free(stream); + CUDACHECK(cudaFreeHost(refout)); + exit(EXIT_SUCCESS); +} + diff --git a/test/single/broadcast_scan.cu b/test/single/broadcast_scan.cu new file mode 100644 index 0000000..ea11c7d --- /dev/null +++ b/test/single/broadcast_scan.cu @@ -0,0 +1,232 @@ +/************************************************************************* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ************************************************************************/ + +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <string> +#include <float.h> + +#include "nccl.h" +#include "test_utilities.h" +#include <nvToolsExt.h> + +void showUsage(const char* bin) { + printf("\n" + "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n" + "Where:\n" +#ifdef CUDA_HAS_HALF + " type = [char|int|half|float|double|int64|uint64]\n" +#else + " type = [char|int|float|double|int64|uint64]\n" +#endif + " n_min > 0\n" + " n_max >= n_min\n" + " delta > 0\n\n", bin); + return; +} + +int main(int argc, char* argv[]) { + int nvis = 0; + CUDACHECK(cudaGetDeviceCount(&nvis)); + if (nvis == 0) { + printf("No GPUs found\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + ncclDataType_t type; + int n_min; + int n_max; + int delta; + int gpus; + int* list = NULL; + + if (argc < 4) { + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + type = strToType(argv[1]); + if (type == nccl_NUM_TYPES) { + printf("Invalid <type> '%s'\n", argv[1]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_min = strToPosInt(argv[2]); + if (n_min < 1) { + printf("Invalid <n_min> '%s'\n", argv[2]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_max = strToPosInt(argv[3]); + if (n_max < n_min) { + printf("Invalid <n_max> '%s'\n", argv[3]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + if (argc > 4) { + delta = strToPosInt(argv[4]); + if (delta < 1) { + printf("Invalid <delta> '%s'\n", argv[4]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; + } + + if (argc > 5) { + gpus = strToPosInt(argv[5]); + if (gpus < 1) { + printf("Invalid <gpus> '%s'\n", argv[5]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + gpus = nvis; + } + + list = (int*)malloc(gpus*sizeof(int)); + + if (argc > 6 && argc != 6+gpus) { + printf("If given, GPU list must be fully specified.\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + for(int g=0; g<gpus; ++g) { + if(argc > 6) { + list[g] = strToNonNeg(argv[6+g]); + if (list[g] < 0) { + printf("Invalid GPU%d '%s'\n", g, argv[6+g]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } else if (list[g] >= nvis) { + printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + list[g] = g % nvis; + } + } + + size_t word = wordSize(type); + size_t max_size = n_max * word; + void* refout; + CUDACHECK(cudaMallocHost(&refout, max_size)); + + void** io; + double* localError; + ncclComm_t* comm; + cudaStream_t* stream; + + io = (void**)malloc(gpus*sizeof(void*)); + CUDACHECK(cudaMallocHost(&localError, gpus*sizeof(double))); + comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); + stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); + + for(int g=0; g<gpus; ++g) { + char busid[32] = {0}; + CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g])); + printf("# Rank %d using device %d [%s]\n", g, list[g], busid); + + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamCreate(&stream[g])); + CUDACHECK(cudaMalloc(&io[g], max_size)); + if(g == 0) { + makeRandom(io[g], n_max, type, 42+g); + CUDACHECK(cudaMemcpy(refout, io[g], max_size, cudaMemcpyDeviceToHost)); + } + } + + NCCLCHECK(ncclCommInitAll(comm, gpus, list)); + + printf(" BYTES ERROR MSEC BW\n"); + + for(int n=n_min; n<=n_max; n+=delta) { + size_t bytes = word * n; + + for(int g=1; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaMemsetAsync(io[g], 0, bytes, stream[g])); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[0])); + } + + auto start = std::chrono::high_resolution_clock::now(); + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + NCCLCHECK(ncclBcast(io[g], n, type, 0, comm[g], stream[g])); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + auto stop = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration_cast<std::chrono::duration<double>> + (stop - start).count() * 1000.0; + + for(int g=1; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + maxDiff(localError+g, io[g], refout, n, type, stream[g]); + } + double maxError = 0.0; + for(int g=1; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + maxError = max(maxError, localError[g]); + } + + double mb = (double)bytes * 1.e-6; + double algbw = mb / ms; + printf("%12lu %5.0le %10.3lf %6.2lf\n", + n*word, maxError, ms, algbw); + } + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamDestroy(stream[g])); + ncclCommDestroy(comm[g]); + CUDACHECK(cudaFree(io[g])); + } + + free(io); + free(comm); + free(stream); + CUDACHECK(cudaFreeHost(refout)); + CUDACHECK(cudaFreeHost(localError)); + exit(EXIT_SUCCESS); +} + diff --git a/test/single/reduce_scan.cu b/test/single/reduce_scan.cu new file mode 100644 index 0000000..f42643e --- /dev/null +++ b/test/single/reduce_scan.cu @@ -0,0 +1,238 @@ +/************************************************************************* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ************************************************************************/ + +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <string> +#include <float.h> + +#include "nccl.h" +#include "test_utilities.h" +#include <nvToolsExt.h> + +void showUsage(const char* bin) { + printf("\n" + "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n" + "Where:\n" +#ifdef CUDA_HAS_HALF + " type = [char|int|half|float|double|int64|uint64]\n" +#else + " type = [char|int|float|double|int64|uint64]\n" +#endif + " op = [sum|prod|max|min]\n" + " n_min > 0\n" + " n_max >= n_min\n" + " delta > 0\n\n", bin); + return; +} + +int main(int argc, char* argv[]) { + int nvis = 0; + CUDACHECK(cudaGetDeviceCount(&nvis)); + if (nvis == 0) { + printf("No GPUs found\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + ncclDataType_t type; + ncclRedOp_t op; + int n_min; + int n_max; + int delta; + int gpus; + int* list = NULL; + + if (argc < 5) { + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + type = strToType(argv[1]); + if (type == nccl_NUM_TYPES) { + printf("Invalid <type> '%s'\n", argv[1]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + op = strToOp(argv[2]); + if (op == nccl_NUM_OPS) { + printf("Invalid <op> '%s'\n", argv[2]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_min = strToPosInt(argv[3]); + if (n_min < 1) { + printf("Invalid <n_min> '%s'\n", argv[3]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_max = strToPosInt(argv[4]); + if (n_max < n_min) { + printf("Invalid <n_max> '%s'\n", argv[4]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + if (argc > 5) { + delta = strToPosInt(argv[5]); + if (delta < 1) { + printf("Invalid <delta> '%s'\n", argv[5]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; + } + + if (argc > 6) { + gpus = strToPosInt(argv[6]); + if (gpus < 1) { + printf("Invalid <gpus> '%s'\n", argv[6]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + gpus = nvis; + } + + list = (int*)malloc(gpus*sizeof(int)); + + if (argc > 7 && argc != 7+gpus) { + printf("If given, GPU list must be fully specified.\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + for(int g=0; g<gpus; ++g) { + if(argc > 7) { + list[g] = strToNonNeg(argv[7+g]); + if (list[g] < 0) { + printf("Invalid GPU%d '%s'\n", g, argv[7+g]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } else if (list[g] >= nvis) { + printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + list[g] = g % nvis; + } + } + + size_t word = wordSize(type); + size_t max_size = n_max * word; + void* refout; + CUDACHECK(cudaMallocHost(&refout, max_size)); + + void** input; + void* output; // always goes on rank 0 + double* maxError; + ncclComm_t* comm; + cudaStream_t* stream; + + input = (void**)malloc(gpus*sizeof(void*)); + comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); + stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); + + for(int g=0; g<gpus; ++g) { + char busid[32] = {0}; + CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g])); + printf("# Rank %d using device %d [%s]\n", g, list[g], busid); + + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamCreate(&stream[g])); + CUDACHECK(cudaMalloc(&input[g], max_size)); + makeRandom(input[g], n_max, type, 42+g); + + if (g == 0) { + CUDACHECK(cudaMalloc(&output, max_size)); + CUDACHECK(cudaMallocHost(&maxError, sizeof(double))); + CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost)); + } else { + accVec(refout, input[g], n_max, type, op); + } + } + + NCCLCHECK(ncclCommInitAll(comm, gpus, list)); + + printf(" BYTES ERROR MSEC BW\n"); + + for(int n=n_min; n<=n_max; n+=delta) { + size_t bytes = word * n; + + CUDACHECK(cudaSetDevice(list[0])); + CUDACHECK(cudaMemsetAsync(output, 0, bytes, stream[0])); + for(int g=0; g<gpus; ++g) + CUDACHECK(cudaStreamSynchronize(stream[0])); + + auto start = std::chrono::high_resolution_clock::now(); + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + NCCLCHECK(ncclReduce(input[g], output, n, type, op, 0, comm[g], stream[g])); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + auto stop = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration_cast<std::chrono::duration<double>> + (stop - start).count() * 1000.0; + + CUDACHECK(cudaSetDevice(list[0])); + maxDiff(maxError, output, refout, n, type, stream[0]); + CUDACHECK(cudaStreamSynchronize(stream[0])); + + double mb = (double)bytes * 1.e-6; + double algbw = mb / ms; + printf("%12lu %5.0le %10.3lf %6.2lf\n", + n*word, *maxError, ms, algbw); + } + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamDestroy(stream[g])); + ncclCommDestroy(comm[g]); + CUDACHECK(cudaFree(input[g])); + if(g == 0) { + CUDACHECK(cudaFree(output)); + CUDACHECK(cudaFreeHost(maxError)); + } + } + + free(input); + free(comm); + free(stream); + CUDACHECK(cudaFreeHost(refout)); + exit(EXIT_SUCCESS); +} + diff --git a/test/single/reduce_scatter_scan.cu b/test/single/reduce_scatter_scan.cu new file mode 100644 index 0000000..8c37508 --- /dev/null +++ b/test/single/reduce_scatter_scan.cu @@ -0,0 +1,249 @@ +/************************************************************************* + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ************************************************************************/ + +#include <chrono> +#include <cstdio> +#include <cstdlib> +#include <string> +#include <float.h> + +#include "nccl.h" +#include "test_utilities.h" +#include <nvToolsExt.h> + +void showUsage(const char* bin) { + printf("\n" + "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n" + "Where:\n" +#ifdef CUDA_HAS_HALF + " type = [char|int|half|float|double|int64|uint64]\n" +#else + " type = [char|int|float|double|int64|uint64]\n" +#endif + " op = [sum|prod|max|min]\n" + " n_min > 0\n" + " n_max >= n_min\n" + " delta > 0\n\n", bin); + return; +} + +int main(int argc, char* argv[]) { + int nvis = 0; + CUDACHECK(cudaGetDeviceCount(&nvis)); + if (nvis == 0) { + printf("No GPUs found\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + ncclDataType_t type; + ncclRedOp_t op; + int n_min; + int n_max; + int delta; + int gpus; + int* list = NULL; + + if (argc < 5) { + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + type = strToType(argv[1]); + if (type == nccl_NUM_TYPES) { + printf("Invalid <type> '%s'\n", argv[1]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + op = strToOp(argv[2]); + if (op == nccl_NUM_OPS) { + printf("Invalid <op> '%s'\n", argv[2]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_min = strToPosInt(argv[3]); + if (n_min < 1) { + printf("Invalid <n_min> '%s'\n", argv[3]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + n_max = strToPosInt(argv[4]); + if (n_max < n_min) { + printf("Invalid <n_max> '%s'\n", argv[4]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + if (argc > 5) { + delta = strToPosInt(argv[5]); + if (delta < 1) { + printf("Invalid <delta> '%s'\n", argv[5]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10; + } + + if (argc > 6) { + gpus = strToPosInt(argv[6]); + if (gpus < 1) { + printf("Invalid <gpus> '%s'\n", argv[6]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + gpus = nvis; + } + + list = (int*)malloc(gpus*sizeof(int)); + + if (argc > 7 && argc != 7+gpus) { + printf("If given, GPU list must be fully specified.\n"); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + + for(int g=0; g<gpus; ++g) { + if(argc > 7) { + list[g] = strToNonNeg(argv[7+g]); + if (list[g] < 0) { + printf("Invalid GPU%d '%s'\n", g, argv[7+g]); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } else if (list[g] >= nvis) { + printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis); + showUsage(argv[0]); + exit(EXIT_FAILURE); + } + } else { + list[g] = g % nvis; + } + } + + size_t word = wordSize(type); + size_t max_output = n_max * word; + size_t max_input = gpus * max_output; + void* refout; + CUDACHECK(cudaMallocHost(&refout, max_input)); // contains entire reduction + + void **input, **output; + double** localError; + ncclComm_t* comm; + cudaStream_t* stream; + + input = (void**)malloc(gpus*sizeof(void*)); + output = (void**)malloc(gpus*sizeof(void*)); + localError = (double**)malloc(gpus*sizeof(double*)); + comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t)); + stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t)); + + for(int g=0; g<gpus; ++g) { + char busid[32] = {0}; + CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g])); + printf("# Rank %d using device %d [%s]\n", g, list[g], busid); + + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaMalloc(&input[g], max_input)); + CUDACHECK(cudaMalloc(&output[g], max_output)); + CUDACHECK(cudaMallocHost(&localError[g], sizeof(double))); + CUDACHECK(cudaStreamCreate(&stream[g])); + makeRandom(input[g], n_max*gpus, type, 42+g); + + if (g == 0) + CUDACHECK(cudaMemcpy(refout, input[g], max_input, cudaMemcpyDeviceToHost)); + else + accVec(refout, input[g], n_max*gpus, type, op); + } + + NCCLCHECK(ncclCommInitAll(comm, gpus, list)); + + printf(" BYTES ERROR MSEC ALGBW BUSBW\n"); + + for(int n=n_min; n<=n_max; n+=delta) { + size_t bytes = word * n; + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + + auto start = std::chrono::high_resolution_clock::now(); + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + NCCLCHECK(ncclReduceScatter(input[g], output[g], n, type, op, comm[g], stream[g])); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + } + auto stop = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration_cast<std::chrono::duration<double>> + (stop - start).count() * 1000.0; + + double max_error = 0.0; + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + void* myRef = (void*)((char*)refout + g*bytes); + maxDiff(localError[g], output[g], myRef, n, type, stream[g]); + } + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamSynchronize(stream[g])); + max_error = max(max_error, *localError[g]); + } + + double mb = (double)bytes * 1.e-6; + double algbw = mb / ms; + double busbw = algbw * (double)(gpus - 1); + printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n", + n*word, max_error, ms, algbw, busbw); + } + + for(int g=0; g<gpus; ++g) { + CUDACHECK(cudaSetDevice(list[g])); + CUDACHECK(cudaStreamDestroy(stream[g])); + ncclCommDestroy(comm[g]); + CUDACHECK(cudaFree(input[g])); + CUDACHECK(cudaFree(output[g])); + CUDACHECK(cudaFreeHost(localError[g])); + } + + free(localError); + free(output); + free(input); + free(comm); + free(stream); + CUDACHECK(cudaFreeHost(refout)); + exit(EXIT_SUCCESS); +} + |