Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKyle Fernandes, ne Jacobs <kj333@cam.ac.uk>2016-11-18 02:27:20 +0300
committerSylvain Jeaugey <sjeaugey@nvidia.com>2016-11-18 02:33:34 +0300
commit5f2b32e45b7d31ec942de27369a1308a0afe8fb0 (patch)
tree761f62f22038833b75934afd7302f337a03880c8
parent534b9a169778040dbe7562854893b74bb9387e46 (diff)
Add Fortran bindings
-rw-r--r--Makefile30
-rw-r--r--fortran/Makefile81
-rw-r--r--fortran/src/cudafor.f90164
-rw-r--r--fortran/src/ncclfor.f90305
-rw-r--r--fortran/test/allgather_arr_out.f90155
-rw-r--r--fortran/test/allgather_ptr_out.f90164
-rw-r--r--fortran/test/allreduce_arr_out.f90158
-rw-r--r--fortran/test/allreduce_ptr_out.f90159
-rw-r--r--fortran/test/broadcast_arr.f90130
-rw-r--r--fortran/test/broadcast_ptr.f90135
-rw-r--r--fortran/test/reduce_arr_out.f90168
-rw-r--r--fortran/test/reduce_ptr_out.f90168
-rw-r--r--fortran/test/reducescatter_arr_out.f90158
-rw-r--r--fortran/test/reducescatter_ptr_out.f90167
14 files changed, 2133 insertions, 9 deletions
diff --git a/Makefile b/Makefile
index 2556d07..64243e8 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,7 @@ KEEP ?= 0
DEBUG ?= 0
PROFAPI ?= 0
BUILDDIR ?= build
+BUILDDIR := $(abspath $(BUILDDIR))
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
@@ -21,7 +22,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_52,code=compute_52
-CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
+CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
# Use addprefix so that we can specify more than one path
LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
@@ -59,7 +60,7 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
-.PHONY : lib clean test mpitest install deb debian debclean
+.PHONY : lib clean test mpitest install deb debian debclean forlib fortest forclean
.DEFAULT : lib
INCEXPORTS := nccl.h
@@ -82,19 +83,19 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
- @printf "Linking %-25s\n" $@
+ @printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
$(INCDIR)/%.h : src/%.h
- @printf "Grabbing %-25s > %-25s\n" $< $@
+ @printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
cp -f $< $@
$(OBJDIR)/%.o : src/%.cu
- @printf "Compiling %-25s > %-25s\n" $< $@
+ @printf "Compiling %-35s > %s\n" $< $@
mkdir -p $(OBJDIR)
$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
@@ -147,7 +148,7 @@ MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
test : $(TESTBINS)
$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
- @printf "Building %-25s > %-24s\n" $< $@
+ @printf "Building %-35s > %s\n" $< $@
mkdir -p $(TSTDIR)
$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
@@ -159,7 +160,7 @@ $(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
mpitest : $(MPITESTBINS)
$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
- @printf "Building %-25s > %-24s\n" $< $@
+ @printf "Building %-35s > %s\n" $< $@
mkdir -p $(MPITSTDIR)
$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
@@ -193,7 +194,7 @@ debclean :
rm -Rf $(DEBIANDIR)
$(DEBIANDIR)/% : debian/%.in
- @printf "Generating %-25s > %-24s\n" $< $@
+ @printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@@ -205,7 +206,18 @@ $(DEBIANDIR)/% : debian/%.in
$< > $@
$(DEBIANDIR)/% : debian/%
- @printf "Grabbing %-25s > %-25s\n" $< $@
+ @printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(DEBIANDIR)
cp -f $< $@
+#### FORTRAN BINDINGS ####
+
+export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
+
+forlib : lib
+ $(MAKE) -C fortran lib
+fortest : forlib
+ $(MAKE) -C fortran test
+forclean :
+ $(MAKE) -C fortran clean
+
diff --git a/fortran/Makefile b/fortran/Makefile
new file mode 100644
index 0000000..b60b016
--- /dev/null
+++ b/fortran/Makefile
@@ -0,0 +1,81 @@
+FC := gfortran
+FCNAME := $(notdir $(FC))
+
+BUILDDIR ?= ../build
+INCDIR := $(BUILDDIR)/include
+LIBDIR := $(BUILDDIR)/lib
+OBJDIR := $(BUILDDIR)/obj
+
+LIBNAME := libncclfor.so
+LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
+LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
+LIBLINK += $(patsubst lib%.so,-l%,$(LIBNAME))
+
+LIBCUDAFOR := libcudafor.so
+
+ifneq ($(filter pgf%, $(FCNAME)), )
+# PGI compiler (pgfortran, pgf90, pgf95)
+FCMODFLAGS := -module $(INCDIR)
+FCPREFLAGS := -Mpreprocess
+FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
+FCFLAGS := -fast -O3
+else
+# non-PGI compilers do not have CUDA support, compile our own CUDA lib
+CUDAFORDEP := $(LIBDIR)/$(LIBCUDAFOR)
+CUDALINK := -L$(CUDA_LIB) -lcudart
+CUDAFORLINK := -lcudafor
+ifeq ($(FCNAME), gfortran)
+FCMODFLAGS := -J$(INCDIR)
+FCPREFLAGS += -cpp
+FCFLAGS += -ffree-line-length-none
+else ifeq ($(FCNAME), ifort)
+FCMODFLAGS := -module $(INCDIR)
+FCPREFLAGS += -fpp
+endif
+endif
+
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+lib: $(CUDAFORDEP)
+ $(MAKE) $(LIBDIR)/$(LIBTARGET)
+
+$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
+ @printf "Linking %-35s > %s\n" $(LIBTARGET) $@
+ mkdir -p $(LIBDIR)
+ $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
+ ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
+ ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
+
+$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
+ @printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@
+ mkdir -p $(LIBDIR)
+ $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
+
+$(OBJDIR)/%.o: src/%.f90
+ @printf "Building %-35s > %s\n" $< $@
+ mkdir -p $(OBJDIR)
+ mkdir -p $(INCDIR)
+ $(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
+
+TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
+ifneq ($(filter pgf%, $(FCNAME)), )
+TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
+endif
+
+TESTDIR := $(BUILDDIR)/test/fortran
+TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
+
+test: lib $(TESTBINS)
+
+$(TESTDIR)/%: test/%.f90 lib
+ @printf "Building %-35s > %s\n" $< $@
+ @mkdir -p $(TESTDIR)
+ $(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
+
+clean:
+ rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
+ rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
+ rm -rf $(TESTDIR)/
+
diff --git a/fortran/src/cudafor.f90 b/fortran/src/cudafor.f90
new file mode 100644
index 0000000..8c6934d
--- /dev/null
+++ b/fortran/src/cudafor.f90
@@ -0,0 +1,164 @@
+#ifndef _CUDA
+
+!Start cudaFor module
+module cudaFor
+use iso_c_binding
+implicit none
+private
+public :: c_devptr
+public :: cudaMemcpyKind, &
+ cudaMemcpyHostToHost, &
+ cudaMemcpyHostToDevice, &
+ cudaMemcpyDeviceToHost, &
+ cudaMemcpyDeviceToDevice, &
+ cudaMemcpyDefault
+public :: cuda_stream_kind
+public :: cudaGetDeviceCount
+public :: cudaSetDevice
+public :: cudaMalloc
+public :: cudaMemcpy
+public :: cudaFree
+public :: cudaStreamCreate
+public :: cudaStreamSynchronize
+public :: cudaStreamDestroy
+
+!Start types
+
+!Start c_devptr
+type, bind(c) :: c_devptr
+type(c_ptr) :: member
+end type c_devptr
+!End c_devptr
+
+!Start cudaMemcpyKind
+type, bind(c) :: cudaMemcpyKind
+integer(c_int) :: member
+end type cudaMemcpyKind
+
+type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), &
+ cudaMemcpyHostToDevice = cudaMemcpyKind(1), &
+ cudaMemcpyDeviceToHost = cudaMemcpyKind(2), &
+ cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
+ cudaMemcpyDefault = cudaMemcpyKind(4)
+!End cudaMemcpyKind
+
+!Start cuda_stream_kind
+integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
+!End cuda_stream_kind
+
+!End types
+
+!Start interfaces
+
+!Start cudaGetDeviceCount
+interface cudaGetDeviceCount
+integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
+import :: c_int
+implicit none
+integer(c_int) :: count
+end function cudaGetDeviceCount
+end interface cudaGetDeviceCount
+!End cudaGetDeviceCount
+
+!Start cudaSetDevice
+interface cudaSetDevice
+integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
+import :: c_int
+implicit none
+integer(c_int), value :: device
+end function cudaSetDevice
+end interface cudaSetDevice
+!End cudaSetDevice
+
+!Start cudaMalloc
+interface cudaMalloc
+integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
+import :: c_int, c_size_t
+import :: c_devptr
+implicit none
+type(c_devptr) :: devPtr
+integer(c_size_t), value :: size
+end function cudaMalloc
+end interface cudaMalloc
+!End cudaMalloc
+
+!Start cudaMemcpy
+interface cudaMemcpy
+
+!Start cudaMemcpyH2D
+integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
+import :: c_ptr, c_int, c_size_t
+import :: c_devptr, cudaMemcpyKind
+implicit none
+type(c_devptr), value :: dst
+type(c_ptr), value :: src
+integer(c_size_t), value :: count
+type(cudaMemcpyKind), value :: kind
+end function cudaMemcpyH2D
+!End cudaMemcpyH2D
+
+!Start cudaMemcpyD2H
+integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
+import :: c_ptr, c_int, c_size_t
+import :: c_devptr, cudaMemcpyKind
+implicit none
+type(c_ptr), value :: dst
+type(c_devptr), value :: src
+integer(c_size_t), value :: count
+type(cudaMemcpyKind), value :: kind
+end function cudaMemcpyD2H
+!End cudaMemcpyD2H
+
+end interface cudaMemcpy
+!End cudaMemcpy
+
+!Start cudaFree
+interface cudaFree
+integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
+import :: c_int
+import :: c_devptr
+implicit none
+type(c_devptr), value :: devPtr
+end function cudaFree
+end interface cudaFree
+!End cudaFree
+
+!Start cudaStreamCreate
+interface cudaStreamCreate
+integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
+import :: c_int
+import :: cuda_stream_kind
+implicit none
+integer(cuda_stream_kind) :: pStream
+end function cudaStreamCreate
+end interface cudaStreamCreate
+!End cudaStreamCreate
+
+!Start cudaStreamSynchronize
+interface cudaStreamSynchronize
+integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
+import :: c_int
+import :: cuda_stream_kind
+implicit none
+integer(cuda_stream_kind), value :: stream
+end function cudaStreamSynchronize
+end interface cudaStreamSynchronize
+!End cudaStreamSynchronize
+
+!Start cudaStreamDestroy
+interface cudaStreamDestroy
+integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
+import :: c_int
+import :: cuda_stream_kind
+implicit none
+integer(cuda_stream_kind), value :: stream
+end function cudaStreamDestroy
+end interface cudaStreamDestroy
+!End cudaStreamDestroy
+
+!End interfaces
+
+end module cudaFor
+!End cudaFor module
+
+#endif
diff --git a/fortran/src/ncclfor.f90 b/fortran/src/ncclfor.f90
new file mode 100644
index 0000000..db98f3d
--- /dev/null
+++ b/fortran/src/ncclfor.f90
@@ -0,0 +1,305 @@
+!Start defines
+#define NCCL_UNIQUE_ID_BYTES 128
+!End defines
+
+!Start ncclFor module
+module ncclFor
+use iso_c_binding
+use cudaFor
+implicit none
+private
+public :: ncclUniqueId
+public :: ncclComm
+public :: ncclResult, &
+ ncclSuccess, &
+ ncclUnhandledCudaError, &
+ ncclSystemError, &
+ ncclInternalError, &
+ ncclInvalidDevicePointer, &
+ ncclInvalidRank, &
+ ncclUnsupportedDeviceCount, &
+ ncclDeviceNotFound, &
+ ncclInvalidDeviceIndex, &
+ ncclLibWrapperNotSet, &
+ ncclCudaMallocFailed, &
+ ncclRankMismatch, &
+ ncclInvalidArgument, &
+ ncclInvalidType, &
+ ncclInvalidOperation, &
+ nccl_NUM_RESULTS
+public :: ncclDataType, &
+ ncclChar, &
+ ncclInt, &
+#ifdef CUDA_HAS_HALF
+ ncclHalf, &
+#endif
+ ncclFloat, &
+ ncclDouble, &
+ ncclInt64, &
+ ncclUInt64, &
+ nccl_NUM_TYPES
+public :: ncclRedOp, &
+ ncclSum, &
+ ncclProd, &
+ ncclMax, &
+ ncclMin, &
+ nccl_NUM_OPS
+public :: ncclGetUniqueId
+public :: ncclCommInitRank
+public :: ncclCommInitAll
+public :: ncclCommCuDevice
+public :: ncclCommUserRank
+public :: ncclCommCount
+public :: ncclCommDestroy
+public :: ncclReduce
+public :: ncclAllReduce
+public :: ncclReduceScatter
+public :: ncclBcast
+public :: ncclAllGather
+
+!Start types
+
+!Start ncclUniqueId
+type, bind(c) :: ncclUniqueId
+character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
+end type ncclUniqueId
+!End ncclUniqueId
+
+!Start ncclComm
+type, bind(c) :: ncclComm
+type(c_ptr) :: member
+end type ncclComm
+!End ncclComm
+
+!Start ncclResult
+type, bind(c) :: ncclResult
+integer(c_int) :: member
+end type ncclResult
+
+type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), &
+ ncclUnhandledCudaError = ncclResult( 1), &
+ ncclSystemError = ncclResult( 2), &
+ ncclInternalError = ncclResult( 3), &
+ ncclInvalidDevicePointer = ncclResult( 4), &
+ ncclInvalidRank = ncclResult( 5), &
+ ncclUnsupportedDeviceCount = ncclResult( 6), &
+ ncclDeviceNotFound = ncclResult( 7), &
+ ncclInvalidDeviceIndex = ncclResult( 8), &
+ ncclLibWrapperNotSet = ncclResult( 9), &
+ ncclCudaMallocFailed = ncclResult(10), &
+ ncclRankMismatch = ncclResult(11), &
+ ncclInvalidArgument = ncclResult(12), &
+ ncclInvalidType = ncclResult(13), &
+ ncclInvalidOperation = ncclResult(14), &
+ nccl_NUM_RESULTS = ncclResult(15)
+!End ncclResult
+
+!Start ncclDataType
+type, bind(c) :: ncclDataType
+integer(c_int) :: member
+end type ncclDataType
+
+type(ncclDataType), parameter :: ncclChar = ncclDataType(0), &
+ ncclInt = ncclDataType(1), &
+#ifdef CUDA_HAS_HALF
+ ncclHalf = ncclDataType(2), &
+#endif
+ ncclFloat = ncclDataType(3), &
+ ncclDouble = ncclDataType(4), &
+ ncclInt64 = ncclDataType(5), &
+ ncclUInt64 = ncclDataType(6), &
+ nccl_NUM_TYPES = ncclDataType(7)
+!End ncclDataType
+
+!Start ncclRedOp
+type, bind(c) :: ncclRedOp
+integer(c_int) :: member
+end type ncclRedOp
+
+type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), &
+ ncclProd = ncclRedOp(1), &
+ ncclMax = ncclRedOp(2), &
+ ncclMin = ncclRedOp(3), &
+ nccl_NUM_OPS = ncclRedOp(4)
+!End ncclRedOp
+
+!End types
+
+!Start interfaces
+
+!Start ncclGetUniqueId
+interface ncclGetUniqueId
+type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
+import :: ncclResult, ncclUniqueId
+implicit none
+type(ncclUniqueId) :: uniqueId
+end function ncclGetUniqueId
+end interface ncclGetUniqueId
+!End ncclGetUniqueId
+
+!Start ncclCommInitRank
+interface ncclCommInitRank
+type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
+import :: c_int
+import :: ncclResult, ncclUniqueId, ncclComm
+implicit none
+type(ncclComm) :: comm(*)
+integer(c_int), value :: ndev
+type(ncclUniqueId), value :: commId
+integer(c_int), value :: rank
+end function ncclCommInitRank
+end interface ncclCommInitRank
+!End ncclCommInitRank
+
+!Start ncclCommInitAll
+interface ncclCommInitAll
+type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm) :: comm(*)
+integer(c_int), value :: ndev
+integer(c_int) :: devlist(*)
+end function ncclCommInitAll
+end interface ncclCommInitAll
+!End ncclCommInitAll
+
+!Start ncclCommCuDevice
+interface ncclCommCuDevice
+type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm), value :: comm
+integer(c_int) :: devid
+end function ncclCommCuDevice
+end interface ncclCommCuDevice
+!End ncclCommCuDevice
+
+!Start ncclCommUserRank
+interface ncclCommUserRank
+type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm), value :: comm
+integer(c_int) :: rank
+end function ncclCommUserRank
+end interface ncclCommUserRank
+!End ncclCommUserRank
+
+!Start ncclCommCount
+interface ncclCommCount
+type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm), value :: comm
+integer(c_int) :: count
+end function ncclCommCount
+end interface ncclCommCount
+!End ncclCommCount
+
+!Start ncclCommDestroy
+interface ncclCommDestroy
+subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
+import :: ncclComm
+implicit none
+type(ncclComm), value :: comm
+end subroutine ncclCommDestroy
+end interface ncclCommDestroy
+!End ncclCommDestroy
+
+!Start ncclReduce
+interface ncclReduce
+type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
+implicit none
+type(c_devptr), value :: sendbuff
+type(c_devptr), value :: recvbuff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+type(ncclRedOp), value :: op
+integer(c_int), value :: root
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclReduce
+end interface ncclReduce
+!End ncclReduce
+
+!Start ncclAllReduce
+interface ncclAllReduce
+type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
+implicit none
+type(c_devptr), value :: sendbuff
+type(c_devptr), value :: recvbuff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+type(ncclRedOp), value :: op
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclAllReduce
+end interface ncclAllReduce
+!End ncclAllReduce
+
+!Start ncclReduceScatter
+interface ncclReduceScatter
+type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
+implicit none
+type(c_devptr), value :: sendbuff
+type(c_devptr), value :: recvbuff
+integer(c_int), value :: recvcount
+type(ncclDataType), value :: datatype
+type(ncclRedOp), value :: op
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclReduceScatter
+end interface ncclReduceScatter
+!End ncclReduceScatter
+
+!Start ncclBcast
+interface ncclBcast
+type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType
+implicit none
+type(c_devptr), value :: buff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+integer(c_int), value :: root
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclBcast
+end interface ncclBcast
+!End ncclBcast
+
+!Start ncclAllGather
+interface ncclAllGather
+type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType
+implicit none
+type(c_devptr), value :: sendbuff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+type(c_devptr), value :: recvbuff
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclAllGather
+end interface ncclAllGather
+!End ncclAllGather
+
+!End interfaces
+
+end module ncclFor
+!End nccl module
diff --git a/fortran/test/allgather_arr_out.f90 b/fortran/test/allgather_arr_out.f90
new file mode 100644
index 0000000..61fd35b
--- /dev/null
+++ b/fortran/test/allgather_arr_out.f90
@@ -0,0 +1,155 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+ stat = cudaGetDeviceCount(nDev)
+
+ dataType = ncclFloat
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl * nDev, nDev + 1))
+
+ call random_number(hostBuff)
+
+ print "(a)", "before allgather:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+ end do
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(sendBuff(nEl))
+ sendBuffPtr(i) = c_devloc(sendBuff)
+ sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(recvBuff(nEl * nDev))
+ recvBuffPtr(i) = c_devloc(recvBuff)
+ recvBuff = hostBuff(:, i)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
+ hostBuff(:, i) = recvBuff
+ end do
+
+ print "(a)", ""
+ print "(a)", "after allgather:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+ hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
+ end do
+
+ err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a)", ""
+ print "(a, e11.4e2)", "maximum error in sendbuff = ", err
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
+ deallocate(recvBuff)
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+ deallocate(sendBuff)
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/allgather_ptr_out.f90 b/fortran/test/allgather_ptr_out.f90
new file mode 100644
index 0000000..b407664
--- /dev/null
+++ b/fortran/test/allgather_ptr_out.f90
@@ -0,0 +1,164 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+ stat = cudaGetDeviceCount(nDev)
+
+ dataType = ncclFloat
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl * nDev, nDev + 1))
+
+ call random_number(hostBuff)
+
+ print "(a)", "before allgather:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+ end do
+
+ allocate(hostBuffPtr(nDev))
+
+ do i = 1, nDev
+ hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
+ end do
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+ stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+ end do
+
+ do i = 1, nDev
+ hostBuffPtr(i) = c_loc(hostBuff(1, i))
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
+ stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
+ end do
+
+ print "(a)", ""
+ print "(a)", "after allgather:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+ end do
+
+ do i = 1, nDev
+ hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+ end do
+
+ err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a)", ""
+ print "(a, e11.4e2)", "maximum error in sendbuff = ", err
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(recvBuffPtr(i))
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(sendBuffPtr(i))
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/allreduce_arr_out.f90 b/fortran/test/allreduce_arr_out.f90
new file mode 100644
index 0000000..4804ceb
--- /dev/null
+++ b/fortran/test/allreduce_arr_out.f90
@@ -0,0 +1,158 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+ stat = cudaGetDeviceCount(nDev)
+
+ dataType = ncclFloat
+ redOp = ncclProd
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl, nDev + 2))
+
+ call random_number(hostBuff(:, 1:nDev + 1))
+
+ hostBuff(:, nDev + 2) = hostBuff(:, 1)
+ do i = 2, nDev
+ hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+ end do
+
+ print "(a)", "before allreduce:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(sendBuff(nEl))
+ sendBuffPtr(i) = c_devloc(sendBuff)
+ sendBuff = hostBuff(:, i)
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(recvBuff(nEl))
+ recvBuffPtr(i) = c_devloc(recvBuff)
+ recvBuff = hostBuff(:, i)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ print "(a)", ""
+ print "(a)", "after allreduce:"
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+ hostBuff(:, nDev + 1) = recvBuff
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ print "(a)", ""
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+ hostBuff(:, nDev + 1) = sendBuff
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+ deallocate(recvBuff)
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+ deallocate(sendBuff)
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/allreduce_ptr_out.f90 b/fortran/test/allreduce_ptr_out.f90
new file mode 100644
index 0000000..39b2f2b
--- /dev/null
+++ b/fortran/test/allreduce_ptr_out.f90
@@ -0,0 +1,159 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+ stat = cudaGetDeviceCount(nDev)
+
+ dataType = ncclFloat
+ redOp = ncclProd
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl, nDev + 2))
+
+ call random_number(hostBuff(:, 1:nDev + 1))
+
+ hostBuff(:, nDev + 2) = hostBuff(:, 1)
+ do i = 2, nDev
+ hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+ end do
+
+ print "(a)", "before allreduce:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ allocate(hostBuffPtr(nDev + 1))
+
+ do i = 1, nDev + 1
+ hostBuffPtr(i) = c_loc(hostBuff(1, i))
+ end do
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+ stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+ stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ print "(a)", ""
+ print "(a)", "after allreduce:"
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ print "(a)", ""
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(recvBuffPtr(i))
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(sendBuffPtr(i))
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/broadcast_arr.f90 b/fortran/test/broadcast_arr.f90
new file mode 100644
index 0000000..137c679
--- /dev/null
+++ b/fortran/test/broadcast_arr.f90
@@ -0,0 +1,130 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: devBuff(:)
+type(c_devptr), allocatable :: devBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+! root = 0
+ stat = cudaGetDeviceCount(nDev)
+ root = nDev - 1
+
+ dataType = ncclFloat
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl, nDev + 1))
+
+ call random_number(hostBuff(:, 1:nDev))
+
+ hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
+
+ print "(a)", "before broadcast:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+ end do
+
+ allocate(devBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(devBuff(nEl))
+ devBuffPtr(i) = c_devloc(devBuff)
+ devBuff = hostBuff(:, i)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
+ hostBuff(:, i) = devBuff
+ end do
+
+ print "(a)", ""
+ print "(a)", "after broadcast:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
+ deallocate(devBuff)
+ end do
+
+ deallocate(devBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/broadcast_ptr.f90 b/fortran/test/broadcast_ptr.f90
new file mode 100644
index 0000000..0918519
--- /dev/null
+++ b/fortran/test/broadcast_ptr.f90
@@ -0,0 +1,135 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: devBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+! root = 0
+ stat = cudaGetDeviceCount(nDev)
+ root = nDev - 1
+
+ dataType = ncclFloat
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl, nDev + 1))
+
+ call random_number(hostBuff(:, 1:nDev))
+
+ hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
+
+ print "(a)", "before broadcast:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+ end do
+
+ allocate(hostBuffPtr(nDev))
+
+ do i = 1, nDev
+ hostBuffPtr(i) = c_loc(hostBuff(1, i))
+ end do
+
+ allocate(devBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+ stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+ end do
+
+ print "(a)", ""
+ print "(a)", "after broadcast:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+ print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(devBuffPtr(i))
+ end do
+
+ deallocate(devBuffPtr)
+
+ deallocate(hostBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/reduce_arr_out.f90 b/fortran/test/reduce_arr_out.f90
new file mode 100644
index 0000000..a332cf5
--- /dev/null
+++ b/fortran/test/reduce_arr_out.f90
@@ -0,0 +1,168 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+! root = 0
+ stat = cudaGetDeviceCount(nDev)
+ root = nDev - 1
+
+ dataType = ncclFloat
+ redOp = ncclProd
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl, nDev + 2))
+
+ call random_number(hostBuff(:, 1:nDev + 1))
+
+ hostBuff(:, nDev + 2) = hostBuff(:, 1)
+ do i = 2, nDev
+ hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+ end do
+
+ print "(a)", "before reduce:"
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(sendBuff(nEl))
+ sendBuffPtr(i) = c_devloc(sendBuff)
+ sendBuff = hostBuff(:, i)
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(recvBuff(nEl))
+ recvBuffPtr(i) = c_devloc(recvBuff)
+ recvBuff = hostBuff(:, i)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ stat = cudaSetDevice(devList(root + 1))
+ call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
+ hostBuff(:, nDev + 1) = recvBuff
+
+ print "(a)", ""
+ print "(a)", "after reduce:"
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+ print "(a)", ""
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+ hostBuff(:, nDev + 1) = sendBuff
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+ end do
+
+ print "(a)", ""
+ do i = 1, nDev
+ if (i - 1 /= root) then
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+ hostBuff(:, nDev + 1) = recvBuff
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
+ end if
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+ deallocate(recvBuff)
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+ deallocate(sendBuff)
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/reduce_ptr_out.f90 b/fortran/test/reduce_ptr_out.f90
new file mode 100644
index 0000000..46f3e12
--- /dev/null
+++ b/fortran/test/reduce_ptr_out.f90
@@ -0,0 +1,168 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+! root = 0
+ stat = cudaGetDeviceCount(nDev)
+ root = nDev - 1
+
+ dataType = ncclFloat
+ redOp = ncclProd
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl, nDev + 2))
+
+ call random_number(hostBuff(:, 1:nDev + 1))
+
+ hostBuff(:, nDev + 2) = hostBuff(:, 1)
+ do i = 2, nDev
+ hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+ end do
+
+ print "(a)", "before reduce:"
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+ allocate(hostBuffPtr(nDev + 1))
+
+ do i = 1, nDev + 1
+ hostBuffPtr(i) = c_loc(hostBuff(1, i))
+ end do
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+ stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+ stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ stat = cudaSetDevice(devList(root + 1))
+ stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+
+ print "(a)", ""
+ print "(a)", "after reduce:"
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+ print "(a)", ""
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+ end do
+
+ print "(a)", ""
+ do i = 1, nDev
+ if (i - 1 /= root) then
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
+ end if
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(recvBuffPtr(i))
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(sendBuffPtr(i))
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/reducescatter_arr_out.f90 b/fortran/test/reducescatter_arr_out.f90
new file mode 100644
index 0000000..cd14157
--- /dev/null
+++ b/fortran/test/reducescatter_arr_out.f90
@@ -0,0 +1,158 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+ stat = cudaGetDeviceCount(nDev)
+
+ dataType = ncclFloat
+ redOp = ncclProd
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl * nDev, nDev + 2))
+
+ call random_number(hostBuff(:, 1:nDev + 1))
+
+ hostBuff(:, nDev + 2) = hostBuff(:, 1)
+ do i = 2, nDev
+ hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+ end do
+
+ print "(a)", "before reducescatter:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(sendBuff(nEl * nDev))
+ sendBuffPtr(i) = c_devloc(sendBuff)
+ sendBuff = hostBuff(:, i)
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ allocate(recvBuff(nEl))
+ recvBuffPtr(i) = c_devloc(recvBuff)
+ recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ print "(a)", ""
+ print "(a)", "after reducescatter:"
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+ hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
+ err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ print "(a)", ""
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
+ hostBuff(:, nDev + 1) = sendBuff
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+ deallocate(recvBuff)
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
+ deallocate(sendBuff)
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test
diff --git a/fortran/test/reducescatter_ptr_out.f90 b/fortran/test/reducescatter_ptr_out.f90
new file mode 100644
index 0000000..e35e85d
--- /dev/null
+++ b/fortran/test/reducescatter_ptr_out.f90
@@ -0,0 +1,167 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+ nEl = 2621440
+
+! nDev = 2
+ stat = cudaGetDeviceCount(nDev)
+
+ dataType = ncclFloat
+ redOp = ncclProd
+
+ allocate(comm(nDev))
+ allocate(devList(nDev))
+
+ do i = 1, nDev
+ devList(i) = i - 1
+ end do
+
+ res = ncclCommInitAll(comm, nDev, devList)
+
+ do i = 1, nDev
+ res = ncclCommCuDevice(comm(i), cudaDev)
+ res = ncclCommUserRank(comm(i), rank)
+ end do
+
+ allocate(stream(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamCreate(stream(i))
+ end do
+
+ call date_and_time(values = time)
+ call random_seed(size = i)
+ allocate(seed(i))
+ call random_seed(get = seed)
+ seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+ call random_seed(put = seed)
+
+ allocate(hostBuff(nEl * nDev, nDev + 2))
+
+ call random_number(hostBuff(:, 1:nDev + 1))
+
+ hostBuff(:, nDev + 2) = hostBuff(:, 1)
+ do i = 2, nDev
+ hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+ end do
+
+ print "(a)", "before reducescatter:"
+ do i = 1, nDev
+ err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ allocate(hostBuffPtr(nDev + 1))
+
+ do i = 1, nDev + 1
+ hostBuffPtr(i) = c_loc(hostBuff(1, i))
+ end do
+
+ allocate(sendBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
+ stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
+ end do
+
+ do i = 1, nDev
+ hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
+ end do
+
+ allocate(recvBuffPtr(nDev))
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+ stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+ end do
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamSynchronize(stream(i))
+ end do
+
+ print "(a)", ""
+ print "(a)", "after reduceScatter:"
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+ err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+ end do
+
+ do i = 1, nDev + 1
+ hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
+ end do
+
+ print "(a)", ""
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
+ err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+ print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+ end do
+ print "(a)", ""
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(recvBuffPtr(i))
+ end do
+
+ deallocate(recvBuffPtr)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaFree(sendBuffPtr(i))
+ end do
+
+ deallocate(sendBuffPtr)
+
+ deallocate(hostBuffPtr)
+
+ deallocate(hostBuff)
+
+ deallocate(seed)
+
+ do i = 1, nDev
+ stat = cudaSetDevice(devList(i))
+ stat = cudaStreamDestroy(stream(i))
+ end do
+
+ deallocate(stream)
+
+ do i = 1, nDev
+ call ncclCommDestroy(comm(i))
+ end do
+
+ deallocate(devList)
+ deallocate(comm)
+
+end program test