diff options
author | Kyle Fernandes, ne Jacobs <kj333@cam.ac.uk> | 2016-11-18 02:27:20 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2016-11-18 02:33:34 +0300 |
commit | 5f2b32e45b7d31ec942de27369a1308a0afe8fb0 (patch) | |
tree | 761f62f22038833b75934afd7302f337a03880c8 | |
parent | 534b9a169778040dbe7562854893b74bb9387e46 (diff) |
Add Fortran bindings
-rw-r--r-- | Makefile | 30 | ||||
-rw-r--r-- | fortran/Makefile | 81 | ||||
-rw-r--r-- | fortran/src/cudafor.f90 | 164 | ||||
-rw-r--r-- | fortran/src/ncclfor.f90 | 305 | ||||
-rw-r--r-- | fortran/test/allgather_arr_out.f90 | 155 | ||||
-rw-r--r-- | fortran/test/allgather_ptr_out.f90 | 164 | ||||
-rw-r--r-- | fortran/test/allreduce_arr_out.f90 | 158 | ||||
-rw-r--r-- | fortran/test/allreduce_ptr_out.f90 | 159 | ||||
-rw-r--r-- | fortran/test/broadcast_arr.f90 | 130 | ||||
-rw-r--r-- | fortran/test/broadcast_ptr.f90 | 135 | ||||
-rw-r--r-- | fortran/test/reduce_arr_out.f90 | 168 | ||||
-rw-r--r-- | fortran/test/reduce_ptr_out.f90 | 168 | ||||
-rw-r--r-- | fortran/test/reducescatter_arr_out.f90 | 158 | ||||
-rw-r--r-- | fortran/test/reducescatter_ptr_out.f90 | 167 |
14 files changed, 2133 insertions, 9 deletions
@@ -11,6 +11,7 @@ KEEP ?= 0 DEBUG ?= 0 PROFAPI ?= 0 BUILDDIR ?= build +BUILDDIR := $(abspath $(BUILDDIR)) CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -21,7 +22,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ -gencode=arch=compute_52,code=sm_52 \ -gencode=arch=compute_52,code=compute_52 -CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden +CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96 # Use addprefix so that we can specify more than one path LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt @@ -59,7 +60,7 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -.PHONY : lib clean test mpitest install deb debian debclean +.PHONY : lib clean test mpitest install deb debian debclean forlib fortest forclean .DEFAULT : lib INCEXPORTS := nccl.h @@ -82,19 +83,19 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) -include $(DEPFILES) $(LIBDIR)/$(LIBTARGET) : $(LIBOBJ) - @printf "Linking %-25s\n" $@ + @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(INCDIR)/%.h : src/%.h - @printf "Grabbing %-25s > %-25s\n" $< $@ + @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) cp -f $< $@ $(OBJDIR)/%.o : src/%.cu - @printf "Compiling %-25s > %-25s\n" $< $@ + @printf "Compiling %-35s > %s\n" $< $@ mkdir -p $(OBJDIR) $(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@ @$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp) @@ -147,7 +148,7 @@ MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS)) test : $(TESTBINS) $(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP) - @printf "Building %-25s > %-24s\n" $< $@ + @printf "Building %-35s > %s\n" $< $@ mkdir -p $(TSTDIR) $(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt @$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp) @@ -159,7 +160,7 @@ $(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP) mpitest : $(MPITESTBINS) $(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP) - @printf "Building %-25s > %-24s\n" $< $@ + @printf "Building %-35s > %s\n" $< $@ mkdir -p $(MPITSTDIR) $(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand @$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp) @@ -193,7 +194,7 @@ debclean : rm -Rf $(DEBIANDIR) $(DEBIANDIR)/% : debian/%.in - @printf "Generating %-25s > %-24s\n" $< $@ + @printf "Generating %-35s > %s\n" $< $@ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ @@ -205,7 +206,18 @@ $(DEBIANDIR)/% : debian/%.in $< > $@ $(DEBIANDIR)/% : debian/% - @printf "Grabbing %-25s > %-25s\n" $< $@ + @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(DEBIANDIR) cp -f $< $@ +#### FORTRAN BINDINGS #### + +export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR + +forlib : lib + $(MAKE) -C fortran lib +fortest : forlib + $(MAKE) -C fortran test +forclean : + $(MAKE) -C fortran clean + diff --git a/fortran/Makefile b/fortran/Makefile new file mode 100644 index 0000000..b60b016 --- /dev/null +++ b/fortran/Makefile @@ -0,0 +1,81 @@ +FC := gfortran +FCNAME := $(notdir $(FC)) + +BUILDDIR ?= ../build +INCDIR := $(BUILDDIR)/include +LIBDIR := $(BUILDDIR)/lib +OBJDIR := $(BUILDDIR)/obj + +LIBNAME := libncclfor.so +LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME)) +LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME)) +LIBLINK += $(patsubst lib%.so,-l%,$(LIBNAME)) + +LIBCUDAFOR := libcudafor.so + +ifneq ($(filter pgf%, $(FCNAME)), ) +# PGI compiler (pgfortran, pgf90, pgf95) +FCMODFLAGS := -module $(INCDIR) +FCPREFLAGS := -Mpreprocess +FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR) +FCFLAGS := -fast -O3 +else +# non-PGI compilers do not have CUDA support, compile our own CUDA lib +CUDAFORDEP := $(LIBDIR)/$(LIBCUDAFOR) +CUDALINK := -L$(CUDA_LIB) -lcudart +CUDAFORLINK := -lcudafor +ifeq ($(FCNAME), gfortran) +FCMODFLAGS := -J$(INCDIR) +FCPREFLAGS += -cpp +FCFLAGS += -ffree-line-length-none +else ifeq ($(FCNAME), ifort) +FCMODFLAGS := -module $(INCDIR) +FCPREFLAGS += -fpp +endif +endif + +ifeq ($(VERBOSE), 0) +.SILENT: +endif + +lib: $(CUDAFORDEP) + $(MAKE) $(LIBDIR)/$(LIBTARGET) + +$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o + @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ + mkdir -p $(LIBDIR) + $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET) + ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) + ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) + +$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o + @printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@ + mkdir -p $(LIBDIR) + $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR) + +$(OBJDIR)/%.o: src/%.f90 + @printf "Building %-35s > %s\n" $< $@ + mkdir -p $(OBJDIR) + mkdir -p $(INCDIR) + $(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@ + +TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out +ifneq ($(filter pgf%, $(FCNAME)), ) +TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out +endif + +TESTDIR := $(BUILDDIR)/test/fortran +TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS)) + +test: lib $(TESTBINS) + +$(TESTDIR)/%: test/%.f90 lib + @printf "Building %-35s > %s\n" $< $@ + @mkdir -p $(TESTDIR) + $(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@ + +clean: + rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME) + rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod + rm -rf $(TESTDIR)/ + diff --git a/fortran/src/cudafor.f90 b/fortran/src/cudafor.f90 new file mode 100644 index 0000000..8c6934d --- /dev/null +++ b/fortran/src/cudafor.f90 @@ -0,0 +1,164 @@ +#ifndef _CUDA + +!Start cudaFor module +module cudaFor +use iso_c_binding +implicit none +private +public :: c_devptr +public :: cudaMemcpyKind, & + cudaMemcpyHostToHost, & + cudaMemcpyHostToDevice, & + cudaMemcpyDeviceToHost, & + cudaMemcpyDeviceToDevice, & + cudaMemcpyDefault +public :: cuda_stream_kind +public :: cudaGetDeviceCount +public :: cudaSetDevice +public :: cudaMalloc +public :: cudaMemcpy +public :: cudaFree +public :: cudaStreamCreate +public :: cudaStreamSynchronize +public :: cudaStreamDestroy + +!Start types + +!Start c_devptr +type, bind(c) :: c_devptr +type(c_ptr) :: member +end type c_devptr +!End c_devptr + +!Start cudaMemcpyKind +type, bind(c) :: cudaMemcpyKind +integer(c_int) :: member +end type cudaMemcpyKind + +type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), & + cudaMemcpyHostToDevice = cudaMemcpyKind(1), & + cudaMemcpyDeviceToHost = cudaMemcpyKind(2), & + cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), & + cudaMemcpyDefault = cudaMemcpyKind(4) +!End cudaMemcpyKind + +!Start cuda_stream_kind +integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t +!End cuda_stream_kind + +!End types + +!Start interfaces + +!Start cudaGetDeviceCount +interface cudaGetDeviceCount +integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount") +import :: c_int +implicit none +integer(c_int) :: count +end function cudaGetDeviceCount +end interface cudaGetDeviceCount +!End cudaGetDeviceCount + +!Start cudaSetDevice +interface cudaSetDevice +integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice") +import :: c_int +implicit none +integer(c_int), value :: device +end function cudaSetDevice +end interface cudaSetDevice +!End cudaSetDevice + +!Start cudaMalloc +interface cudaMalloc +integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc") +import :: c_int, c_size_t +import :: c_devptr +implicit none +type(c_devptr) :: devPtr +integer(c_size_t), value :: size +end function cudaMalloc +end interface cudaMalloc +!End cudaMalloc + +!Start cudaMemcpy +interface cudaMemcpy + +!Start cudaMemcpyH2D +integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy") +import :: c_ptr, c_int, c_size_t +import :: c_devptr, cudaMemcpyKind +implicit none +type(c_devptr), value :: dst +type(c_ptr), value :: src +integer(c_size_t), value :: count +type(cudaMemcpyKind), value :: kind +end function cudaMemcpyH2D +!End cudaMemcpyH2D + +!Start cudaMemcpyD2H +integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy") +import :: c_ptr, c_int, c_size_t +import :: c_devptr, cudaMemcpyKind +implicit none +type(c_ptr), value :: dst +type(c_devptr), value :: src +integer(c_size_t), value :: count +type(cudaMemcpyKind), value :: kind +end function cudaMemcpyD2H +!End cudaMemcpyD2H + +end interface cudaMemcpy +!End cudaMemcpy + +!Start cudaFree +interface cudaFree +integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree") +import :: c_int +import :: c_devptr +implicit none +type(c_devptr), value :: devPtr +end function cudaFree +end interface cudaFree +!End cudaFree + +!Start cudaStreamCreate +interface cudaStreamCreate +integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate") +import :: c_int +import :: cuda_stream_kind +implicit none +integer(cuda_stream_kind) :: pStream +end function cudaStreamCreate +end interface cudaStreamCreate +!End cudaStreamCreate + +!Start cudaStreamSynchronize +interface cudaStreamSynchronize +integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize") +import :: c_int +import :: cuda_stream_kind +implicit none +integer(cuda_stream_kind), value :: stream +end function cudaStreamSynchronize +end interface cudaStreamSynchronize +!End cudaStreamSynchronize + +!Start cudaStreamDestroy +interface cudaStreamDestroy +integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy") +import :: c_int +import :: cuda_stream_kind +implicit none +integer(cuda_stream_kind), value :: stream +end function cudaStreamDestroy +end interface cudaStreamDestroy +!End cudaStreamDestroy + +!End interfaces + +end module cudaFor +!End cudaFor module + +#endif diff --git a/fortran/src/ncclfor.f90 b/fortran/src/ncclfor.f90 new file mode 100644 index 0000000..db98f3d --- /dev/null +++ b/fortran/src/ncclfor.f90 @@ -0,0 +1,305 @@ +!Start defines +#define NCCL_UNIQUE_ID_BYTES 128 +!End defines + +!Start ncclFor module +module ncclFor +use iso_c_binding +use cudaFor +implicit none +private +public :: ncclUniqueId +public :: ncclComm +public :: ncclResult, & + ncclSuccess, & + ncclUnhandledCudaError, & + ncclSystemError, & + ncclInternalError, & + ncclInvalidDevicePointer, & + ncclInvalidRank, & + ncclUnsupportedDeviceCount, & + ncclDeviceNotFound, & + ncclInvalidDeviceIndex, & + ncclLibWrapperNotSet, & + ncclCudaMallocFailed, & + ncclRankMismatch, & + ncclInvalidArgument, & + ncclInvalidType, & + ncclInvalidOperation, & + nccl_NUM_RESULTS +public :: ncclDataType, & + ncclChar, & + ncclInt, & +#ifdef CUDA_HAS_HALF + ncclHalf, & +#endif + ncclFloat, & + ncclDouble, & + ncclInt64, & + ncclUInt64, & + nccl_NUM_TYPES +public :: ncclRedOp, & + ncclSum, & + ncclProd, & + ncclMax, & + ncclMin, & + nccl_NUM_OPS +public :: ncclGetUniqueId +public :: ncclCommInitRank +public :: ncclCommInitAll +public :: ncclCommCuDevice +public :: ncclCommUserRank +public :: ncclCommCount +public :: ncclCommDestroy +public :: ncclReduce +public :: ncclAllReduce +public :: ncclReduceScatter +public :: ncclBcast +public :: ncclAllGather + +!Start types + +!Start ncclUniqueId +type, bind(c) :: ncclUniqueId +character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES) +end type ncclUniqueId +!End ncclUniqueId + +!Start ncclComm +type, bind(c) :: ncclComm +type(c_ptr) :: member +end type ncclComm +!End ncclComm + +!Start ncclResult +type, bind(c) :: ncclResult +integer(c_int) :: member +end type ncclResult + +type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), & + ncclUnhandledCudaError = ncclResult( 1), & + ncclSystemError = ncclResult( 2), & + ncclInternalError = ncclResult( 3), & + ncclInvalidDevicePointer = ncclResult( 4), & + ncclInvalidRank = ncclResult( 5), & + ncclUnsupportedDeviceCount = ncclResult( 6), & + ncclDeviceNotFound = ncclResult( 7), & + ncclInvalidDeviceIndex = ncclResult( 8), & + ncclLibWrapperNotSet = ncclResult( 9), & + ncclCudaMallocFailed = ncclResult(10), & + ncclRankMismatch = ncclResult(11), & + ncclInvalidArgument = ncclResult(12), & + ncclInvalidType = ncclResult(13), & + ncclInvalidOperation = ncclResult(14), & + nccl_NUM_RESULTS = ncclResult(15) +!End ncclResult + +!Start ncclDataType +type, bind(c) :: ncclDataType +integer(c_int) :: member +end type ncclDataType + +type(ncclDataType), parameter :: ncclChar = ncclDataType(0), & + ncclInt = ncclDataType(1), & +#ifdef CUDA_HAS_HALF + ncclHalf = ncclDataType(2), & +#endif + ncclFloat = ncclDataType(3), & + ncclDouble = ncclDataType(4), & + ncclInt64 = ncclDataType(5), & + ncclUInt64 = ncclDataType(6), & + nccl_NUM_TYPES = ncclDataType(7) +!End ncclDataType + +!Start ncclRedOp +type, bind(c) :: ncclRedOp +integer(c_int) :: member +end type ncclRedOp + +type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), & + ncclProd = ncclRedOp(1), & + ncclMax = ncclRedOp(2), & + ncclMin = ncclRedOp(3), & + nccl_NUM_OPS = ncclRedOp(4) +!End ncclRedOp + +!End types + +!Start interfaces + +!Start ncclGetUniqueId +interface ncclGetUniqueId +type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId') +import :: ncclResult, ncclUniqueId +implicit none +type(ncclUniqueId) :: uniqueId +end function ncclGetUniqueId +end interface ncclGetUniqueId +!End ncclGetUniqueId + +!Start ncclCommInitRank +interface ncclCommInitRank +type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank') +import :: c_int +import :: ncclResult, ncclUniqueId, ncclComm +implicit none +type(ncclComm) :: comm(*) +integer(c_int), value :: ndev +type(ncclUniqueId), value :: commId +integer(c_int), value :: rank +end function ncclCommInitRank +end interface ncclCommInitRank +!End ncclCommInitRank + +!Start ncclCommInitAll +interface ncclCommInitAll +type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll') +import :: c_int +import :: ncclResult, ncclComm +implicit none +type(ncclComm) :: comm(*) +integer(c_int), value :: ndev +integer(c_int) :: devlist(*) +end function ncclCommInitAll +end interface ncclCommInitAll +!End ncclCommInitAll + +!Start ncclCommCuDevice +interface ncclCommCuDevice +type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice') +import :: c_int +import :: ncclResult, ncclComm +implicit none +type(ncclComm), value :: comm +integer(c_int) :: devid +end function ncclCommCuDevice +end interface ncclCommCuDevice +!End ncclCommCuDevice + +!Start ncclCommUserRank +interface ncclCommUserRank +type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank') +import :: c_int +import :: ncclResult, ncclComm +implicit none +type(ncclComm), value :: comm +integer(c_int) :: rank +end function ncclCommUserRank +end interface ncclCommUserRank +!End ncclCommUserRank + +!Start ncclCommCount +interface ncclCommCount +type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount') +import :: c_int +import :: ncclResult, ncclComm +implicit none +type(ncclComm), value :: comm +integer(c_int) :: count +end function ncclCommCount +end interface ncclCommCount +!End ncclCommCount + +!Start ncclCommDestroy +interface ncclCommDestroy +subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy') +import :: ncclComm +implicit none +type(ncclComm), value :: comm +end subroutine ncclCommDestroy +end interface ncclCommDestroy +!End ncclCommDestroy + +!Start ncclReduce +interface ncclReduce +type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce') +import :: c_int +import :: c_devptr, cuda_stream_kind +import :: ncclResult, ncclComm, ncclDataType, ncclRedOp +implicit none +type(c_devptr), value :: sendbuff +type(c_devptr), value :: recvbuff +integer(c_int), value :: count +type(ncclDataType), value :: datatype +type(ncclRedOp), value :: op +integer(c_int), value :: root +type(ncclComm), value :: comm +integer(cuda_stream_kind), value :: stream +end function ncclReduce +end interface ncclReduce +!End ncclReduce + +!Start ncclAllReduce +interface ncclAllReduce +type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce') +import :: c_int +import :: c_devptr, cuda_stream_kind +import :: ncclResult, ncclComm, ncclDataType, ncclRedOp +implicit none +type(c_devptr), value :: sendbuff +type(c_devptr), value :: recvbuff +integer(c_int), value :: count +type(ncclDataType), value :: datatype +type(ncclRedOp), value :: op +type(ncclComm), value :: comm +integer(cuda_stream_kind), value :: stream +end function ncclAllReduce +end interface ncclAllReduce +!End ncclAllReduce + +!Start ncclReduceScatter +interface ncclReduceScatter +type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter') +import :: c_int +import :: c_devptr, cuda_stream_kind +import :: ncclResult, ncclComm, ncclDataType, ncclRedOp +implicit none +type(c_devptr), value :: sendbuff +type(c_devptr), value :: recvbuff +integer(c_int), value :: recvcount +type(ncclDataType), value :: datatype +type(ncclRedOp), value :: op +type(ncclComm), value :: comm +integer(cuda_stream_kind), value :: stream +end function ncclReduceScatter +end interface ncclReduceScatter +!End ncclReduceScatter + +!Start ncclBcast +interface ncclBcast +type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast') +import :: c_int +import :: c_devptr, cuda_stream_kind +import :: ncclResult, ncclComm, ncclDataType +implicit none +type(c_devptr), value :: buff +integer(c_int), value :: count +type(ncclDataType), value :: datatype +integer(c_int), value :: root +type(ncclComm), value :: comm +integer(cuda_stream_kind), value :: stream +end function ncclBcast +end interface ncclBcast +!End ncclBcast + +!Start ncclAllGather +interface ncclAllGather +type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather') +import :: c_int +import :: c_devptr, cuda_stream_kind +import :: ncclResult, ncclComm, ncclDataType +implicit none +type(c_devptr), value :: sendbuff +integer(c_int), value :: count +type(ncclDataType), value :: datatype +type(c_devptr), value :: recvbuff +type(ncclComm), value :: comm +integer(cuda_stream_kind), value :: stream +end function ncclAllGather +end interface ncclAllGather +!End ncclAllGather + +!End interfaces + +end module ncclFor +!End nccl module diff --git a/fortran/test/allgather_arr_out.f90 b/fortran/test/allgather_arr_out.f90 new file mode 100644 index 0000000..61fd35b --- /dev/null +++ b/fortran/test/allgather_arr_out.f90 @@ -0,0 +1,155 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev +type(ncclDataType) :: dataType +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable :: hostBuff(:, :) +real(real32), allocatable, device :: sendBuff(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +real(real32), allocatable, device :: recvBuff(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 + stat = cudaGetDeviceCount(nDev) + + dataType = ncclFloat + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl * nDev, nDev + 1)) + + call random_number(hostBuff) + + print "(a)", "before allgather:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err + end do + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(sendBuff(nEl)) + sendBuffPtr(i) = c_devloc(sendBuff) + sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(recvBuff(nEl * nDev)) + recvBuffPtr(i) = c_devloc(recvBuff) + recvBuff = hostBuff(:, i) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev]) + hostBuff(:, i) = recvBuff + end do + + print "(a)", "" + print "(a)", "after allgather:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) + hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff + end do + + err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a)", "" + print "(a, e11.4e2)", "maximum error in sendbuff = ", err + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev]) + deallocate(recvBuff) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) + deallocate(sendBuff) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/allgather_ptr_out.f90 b/fortran/test/allgather_ptr_out.f90 new file mode 100644 index 0000000..b407664 --- /dev/null +++ b/fortran/test/allgather_ptr_out.f90 @@ -0,0 +1,164 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev +type(ncclDataType) :: dataType +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable, target :: hostBuff(:, :) +type(c_ptr), allocatable :: hostBuffPtr(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 + stat = cudaGetDeviceCount(nDev) + + dataType = ncclFloat + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl * nDev, nDev + 1)) + + call random_number(hostBuff) + + print "(a)", "before allgather:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err + end do + + allocate(hostBuffPtr(nDev)) + + do i = 1, nDev + hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1)) + end do + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) + stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) + end do + + do i = 1, nDev + hostBuffPtr(i) = c_loc(hostBuff(1, i)) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev) + stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost) + end do + + print "(a)", "" + print "(a)", "after allgather:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err + end do + + do i = 1, nDev + hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + end do + + err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a)", "" + print "(a, e11.4e2)", "maximum error in sendbuff = ", err + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(recvBuffPtr(i)) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(sendBuffPtr(i)) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/allreduce_arr_out.f90 b/fortran/test/allreduce_arr_out.f90 new file mode 100644 index 0000000..4804ceb --- /dev/null +++ b/fortran/test/allreduce_arr_out.f90 @@ -0,0 +1,158 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev +type(ncclDataType) :: dataType +type(ncclRedOp) :: redOp +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable :: hostBuff(:, :) +real(real32), allocatable, device :: sendBuff(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +real(real32), allocatable, device :: recvBuff(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 + stat = cudaGetDeviceCount(nDev) + + dataType = ncclFloat + redOp = ncclProd + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl, nDev + 2)) + + call random_number(hostBuff(:, 1:nDev + 1)) + + hostBuff(:, nDev + 2) = hostBuff(:, 1) + do i = 2, nDev + hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) + end do + + print "(a)", "before allreduce:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(sendBuff(nEl)) + sendBuffPtr(i) = c_devloc(sendBuff) + sendBuff = hostBuff(:, i) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(recvBuff(nEl)) + recvBuffPtr(i) = c_devloc(recvBuff) + recvBuff = hostBuff(:, i) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + print "(a)", "" + print "(a)", "after allreduce:" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) + hostBuff(:, nDev + 1) = recvBuff + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + print "(a)", "" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) + hostBuff(:, nDev + 1) = sendBuff + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) + deallocate(recvBuff) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) + deallocate(sendBuff) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/allreduce_ptr_out.f90 b/fortran/test/allreduce_ptr_out.f90 new file mode 100644 index 0000000..39b2f2b --- /dev/null +++ b/fortran/test/allreduce_ptr_out.f90 @@ -0,0 +1,159 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev +type(ncclDataType) :: dataType +type(ncclRedOp) :: redOp +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable, target :: hostBuff(:, :) +type(c_ptr), allocatable :: hostBuffPtr(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 + stat = cudaGetDeviceCount(nDev) + + dataType = ncclFloat + redOp = ncclProd + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl, nDev + 2)) + + call random_number(hostBuff(:, 1:nDev + 1)) + + hostBuff(:, nDev + 2) = hostBuff(:, 1) + do i = 2, nDev + hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) + end do + + print "(a)", "before allreduce:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + allocate(hostBuffPtr(nDev + 1)) + + do i = 1, nDev + 1 + hostBuffPtr(i) = c_loc(hostBuff(1, i)) + end do + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) + stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) + stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + print "(a)", "" + print "(a)", "after allreduce:" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + print "(a)", "" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(recvBuffPtr(i)) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(sendBuffPtr(i)) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/broadcast_arr.f90 b/fortran/test/broadcast_arr.f90 new file mode 100644 index 0000000..137c679 --- /dev/null +++ b/fortran/test/broadcast_arr.f90 @@ -0,0 +1,130 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev, root +type(ncclDataType) :: dataType +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable :: hostBuff(:, :) +real(real32), allocatable, device :: devBuff(:) +type(c_devptr), allocatable :: devBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 +! root = 0 + stat = cudaGetDeviceCount(nDev) + root = nDev - 1 + + dataType = ncclFloat + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl, nDev + 1)) + + call random_number(hostBuff(:, 1:nDev)) + + hostBuff(:, nDev + 1) = hostBuff(:, root + 1) + + print "(a)", "before broadcast:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err + end do + + allocate(devBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(devBuff(nEl)) + devBuffPtr(i) = c_devloc(devBuff) + devBuff = hostBuff(:, i) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(devBuffPtr(i), devBuff, [nEl]) + hostBuff(:, i) = devBuff + end do + + print "(a)", "" + print "(a)", "after broadcast:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(devBuffPtr(i), devBuff, [nEl]) + deallocate(devBuff) + end do + + deallocate(devBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/broadcast_ptr.f90 b/fortran/test/broadcast_ptr.f90 new file mode 100644 index 0000000..0918519 --- /dev/null +++ b/fortran/test/broadcast_ptr.f90 @@ -0,0 +1,135 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev, root +type(ncclDataType) :: dataType +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable, target :: hostBuff(:, :) +type(c_ptr), allocatable :: hostBuffPtr(:) +type(c_devptr), allocatable :: devBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 +! root = 0 + stat = cudaGetDeviceCount(nDev) + root = nDev - 1 + + dataType = ncclFloat + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl, nDev + 1)) + + call random_number(hostBuff(:, 1:nDev)) + + hostBuff(:, nDev + 1) = hostBuff(:, root + 1) + + print "(a)", "before broadcast:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err + end do + + allocate(hostBuffPtr(nDev)) + + do i = 1, nDev + hostBuffPtr(i) = c_loc(hostBuff(1, i)) + end do + + allocate(devBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) + stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + end do + + print "(a)", "" + print "(a)", "after broadcast:" + do i = 1, nDev + err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32)) + print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(devBuffPtr(i)) + end do + + deallocate(devBuffPtr) + + deallocate(hostBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/reduce_arr_out.f90 b/fortran/test/reduce_arr_out.f90 new file mode 100644 index 0000000..a332cf5 --- /dev/null +++ b/fortran/test/reduce_arr_out.f90 @@ -0,0 +1,168 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev, root +type(ncclDataType) :: dataType +type(ncclRedOp) :: redOp +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable :: hostBuff(:, :) +real(real32), allocatable, device :: sendBuff(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +real(real32), allocatable, device :: recvBuff(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 +! root = 0 + stat = cudaGetDeviceCount(nDev) + root = nDev - 1 + + dataType = ncclFloat + redOp = ncclProd + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl, nDev + 2)) + + call random_number(hostBuff(:, 1:nDev + 1)) + + hostBuff(:, nDev + 2) = hostBuff(:, 1) + do i = 2, nDev + hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) + end do + + print "(a)", "before reduce:" + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(sendBuff(nEl)) + sendBuffPtr(i) = c_devloc(sendBuff) + sendBuff = hostBuff(:, i) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(recvBuff(nEl)) + recvBuffPtr(i) = c_devloc(recvBuff) + recvBuff = hostBuff(:, i) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + stat = cudaSetDevice(devList(root + 1)) + call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl]) + hostBuff(:, nDev + 1) = recvBuff + + print "(a)", "" + print "(a)", "after reduce:" + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err + + print "(a)", "" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) + hostBuff(:, nDev + 1) = sendBuff + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err + end do + + print "(a)", "" + do i = 1, nDev + if (i - 1 /= root) then + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) + hostBuff(:, nDev + 1) = recvBuff + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err + end if + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) + deallocate(recvBuff) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl]) + deallocate(sendBuff) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/reduce_ptr_out.f90 b/fortran/test/reduce_ptr_out.f90 new file mode 100644 index 0000000..46f3e12 --- /dev/null +++ b/fortran/test/reduce_ptr_out.f90 @@ -0,0 +1,168 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev, root +type(ncclDataType) :: dataType +type(ncclRedOp) :: redOp +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable, target :: hostBuff(:, :) +type(c_ptr), allocatable :: hostBuffPtr(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 +! root = 0 + stat = cudaGetDeviceCount(nDev) + root = nDev - 1 + + dataType = ncclFloat + redOp = ncclProd + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl, nDev + 2)) + + call random_number(hostBuff(:, 1:nDev + 1)) + + hostBuff(:, nDev + 2) = hostBuff(:, 1) + do i = 2, nDev + hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) + end do + + print "(a)", "before reduce:" + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err + + allocate(hostBuffPtr(nDev + 1)) + + do i = 1, nDev + 1 + hostBuffPtr(i) = c_loc(hostBuff(1, i)) + end do + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) + stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) + stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + stat = cudaSetDevice(devList(root + 1)) + stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + + print "(a)", "" + print "(a)", "after reduce:" + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err + + print "(a)", "" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err + end do + + print "(a)", "" + do i = 1, nDev + if (i - 1 /= root) then + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err + end if + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(recvBuffPtr(i)) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(sendBuffPtr(i)) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/reducescatter_arr_out.f90 b/fortran/test/reducescatter_arr_out.f90 new file mode 100644 index 0000000..cd14157 --- /dev/null +++ b/fortran/test/reducescatter_arr_out.f90 @@ -0,0 +1,158 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev +type(ncclDataType) :: dataType +type(ncclRedOp) :: redOp +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable :: hostBuff(:, :) +real(real32), allocatable, device :: sendBuff(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +real(real32), allocatable, device :: recvBuff(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 + stat = cudaGetDeviceCount(nDev) + + dataType = ncclFloat + redOp = ncclProd + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl * nDev, nDev + 2)) + + call random_number(hostBuff(:, 1:nDev + 1)) + + hostBuff(:, nDev + 2) = hostBuff(:, 1) + do i = 2, nDev + hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) + end do + + print "(a)", "before reducescatter:" + do i = 1, nDev + err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(sendBuff(nEl * nDev)) + sendBuffPtr(i) = c_devloc(sendBuff) + sendBuff = hostBuff(:, i) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + allocate(recvBuff(nEl)) + recvBuffPtr(i) = c_devloc(recvBuff) + recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + print "(a)", "" + print "(a)", "after reducescatter:" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) + hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff + err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + print "(a)", "" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev]) + hostBuff(:, nDev + 1) = sendBuff + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl]) + deallocate(recvBuff) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev]) + deallocate(sendBuff) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test diff --git a/fortran/test/reducescatter_ptr_out.f90 b/fortran/test/reducescatter_ptr_out.f90 new file mode 100644 index 0000000..e35e85d --- /dev/null +++ b/fortran/test/reducescatter_ptr_out.f90 @@ -0,0 +1,167 @@ +program test +use iso_c_binding +use iso_fortran_env +use cudaFor +use ncclFor +implicit none +integer(int32) :: stat, i +real(real32) :: err +integer(int32) :: nEl, nDev +type(ncclDataType) :: dataType +type(ncclRedOp) :: redOp +type(ncclComm), allocatable :: comm(:) +integer(int32), allocatable :: devList(:) +type(ncclResult) :: res +integer(int32) :: cudaDev, rank +integer(cuda_stream_kind), allocatable :: stream(:) +integer(int32) :: time(8) +integer(int32), allocatable :: seed(:) +real(real32), allocatable, target :: hostBuff(:, :) +type(c_ptr), allocatable :: hostBuffPtr(:) +type(c_devptr), allocatable :: sendBuffPtr(:) +type(c_devptr), allocatable :: recvBuffPtr(:) + + nEl = 2621440 + +! nDev = 2 + stat = cudaGetDeviceCount(nDev) + + dataType = ncclFloat + redOp = ncclProd + + allocate(comm(nDev)) + allocate(devList(nDev)) + + do i = 1, nDev + devList(i) = i - 1 + end do + + res = ncclCommInitAll(comm, nDev, devList) + + do i = 1, nDev + res = ncclCommCuDevice(comm(i), cudaDev) + res = ncclCommUserRank(comm(i), rank) + end do + + allocate(stream(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamCreate(stream(i)) + end do + + call date_and_time(values = time) + call random_seed(size = i) + allocate(seed(i)) + call random_seed(get = seed) + seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed + call random_seed(put = seed) + + allocate(hostBuff(nEl * nDev, nDev + 2)) + + call random_number(hostBuff(:, 1:nDev + 1)) + + hostBuff(:, nDev + 2) = hostBuff(:, 1) + do i = 2, nDev + hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i) + end do + + print "(a)", "before reducescatter:" + do i = 1, nDev + err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + allocate(hostBuffPtr(nDev + 1)) + + do i = 1, nDev + 1 + hostBuffPtr(i) = c_loc(hostBuff(1, i)) + end do + + allocate(sendBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev) + stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice) + end do + + do i = 1, nDev + hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1)) + end do + + allocate(recvBuffPtr(nDev)) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1))) + stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i)) + end do + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamSynchronize(stream(i)) + end do + + print "(a)", "" + print "(a)", "after reduceScatter:" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost) + err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err + end do + + do i = 1, nDev + 1 + hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1)) + end do + + print "(a)", "" + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost) + err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32)) + print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err + end do + print "(a)", "" + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(recvBuffPtr(i)) + end do + + deallocate(recvBuffPtr) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaFree(sendBuffPtr(i)) + end do + + deallocate(sendBuffPtr) + + deallocate(hostBuffPtr) + + deallocate(hostBuff) + + deallocate(seed) + + do i = 1, nDev + stat = cudaSetDevice(devList(i)) + stat = cudaStreamDestroy(stream(i)) + end do + + deallocate(stream) + + do i = 1, nDev + call ncclCommDestroy(comm(i)) + end do + + deallocate(devList) + deallocate(comm) + +end program test |