Add Fortran bindings

author: Kyle Fernandes, ne Jacobs <kj333@cam.ac.uk> 2016-11-18 02:27:20 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com> 2016-11-18 02:33:34 +0300
commit: 5f2b32e45b7d31ec942de27369a1308a0afe8fb0 (patch)
tree: 761f62f22038833b75934afd7302f337a03880c8
parent: 534b9a169778040dbe7562854893b74bb9387e46 (diff)
14 files changed, 2133 insertions, 9 deletions
diff --git a/Makefile b/Makefile
index 2556d07..64243e8 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,7 @@ KEEP ?= 0
 DEBUG ?= 0
 PROFAPI ?= 0
 BUILDDIR ?= build
+BUILDDIR := $(abspath $(BUILDDIR))
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
@@ -21,7 +22,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_52,code=sm_52 \
                 -gencode=arch=compute_52,code=compute_52
 
-CXXFLAGS   := -I$(CUDA_INC) -fPIC -fvisibility=hidden 
+CXXFLAGS   := -I$(CUDA_INC) -fPIC -fvisibility=hidden
 NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
 # Use addprefix so that we can specify more than one path
 LDFLAGS    := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
@@ -59,7 +60,7 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 CXXFLAGS  += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
 
-.PHONY : lib clean test mpitest install deb debian debclean
+.PHONY : lib clean test mpitest install deb debian debclean forlib fortest forclean
 .DEFAULT : lib
 
 INCEXPORTS  := nccl.h
@@ -82,19 +83,19 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
 -include $(DEPFILES)
 
 $(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
-	@printf "Linking   %-25s\n" $@
+	@printf "Linking   %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
 
 $(INCDIR)/%.h : src/%.h
-	@printf "Grabbing  %-25s > %-25s\n" $< $@
+	@printf "Grabbing  %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
 	cp -f $< $@
 
 $(OBJDIR)/%.o : src/%.cu
-	@printf "Compiling %-25s > %-25s\n" $< $@
+	@printf "Compiling %-35s > %s\n" $< $@
 	mkdir -p $(OBJDIR)
 	$(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
 	@$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
@@ -147,7 +148,7 @@ MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
 test : $(TESTBINS)
 
 $(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
-	@printf "Building  %-25s > %-24s\n" $< $@
+	@printf "Building  %-35s > %s\n" $< $@
 	mkdir -p $(TSTDIR)
 	$(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
 	@$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
@@ -159,7 +160,7 @@ $(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
 mpitest : $(MPITESTBINS)
 
 $(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
-	@printf "Building  %-25s > %-24s\n" $< $@
+	@printf "Building  %-35s > %s\n" $< $@
 	mkdir -p $(MPITSTDIR)
 	$(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
 	@$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
@@ -193,7 +194,7 @@ debclean :
 	rm -Rf $(DEBIANDIR)
 
 $(DEBIANDIR)/% : debian/%.in
-	@printf "Generating %-25s > %-24s\n" $< $@
+	@printf "Generating %-35s > %s\n" $< $@
 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@@ -205,7 +206,18 @@ $(DEBIANDIR)/% : debian/%.in
 	    $< > $@
 
 $(DEBIANDIR)/% : debian/%
-	@printf "Grabbing  %-25s > %-25s\n" $< $@
+	@printf "Grabbing  %-35s > %s\n" $< $@
 	mkdir -p $(DEBIANDIR)
 	cp -f $< $@
 
+#### FORTRAN BINDINGS ####
+
+export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
+
+forlib : lib
+	$(MAKE) -C fortran lib
+fortest : forlib
+	$(MAKE) -C fortran test
+forclean :
+	$(MAKE) -C fortran clean
+
diff --git a/fortran/Makefile b/fortran/Makefile
new file mode 100644
index 0000000..b60b016
--- /dev/null
+++ b/fortran/Makefile
@@ -0,0 +1,81 @@
+FC := gfortran
+FCNAME := $(notdir $(FC))
+
+BUILDDIR ?= ../build
+INCDIR := $(BUILDDIR)/include
+LIBDIR := $(BUILDDIR)/lib
+OBJDIR := $(BUILDDIR)/obj
+
+LIBNAME    := libncclfor.so
+LIBSONAME  := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
+LIBTARGET  := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
+LIBLINK    += $(patsubst lib%.so,-l%,$(LIBNAME))
+
+LIBCUDAFOR := libcudafor.so
+
+ifneq ($(filter pgf%, $(FCNAME)), )
+# PGI compiler (pgfortran, pgf90, pgf95)
+FCMODFLAGS  := -module $(INCDIR)
+FCPREFLAGS  := -Mpreprocess
+FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
+FCFLAGS     := -fast -O3
+else
+# non-PGI compilers do not have CUDA support, compile our own CUDA lib
+CUDAFORDEP  := $(LIBDIR)/$(LIBCUDAFOR)
+CUDALINK    := -L$(CUDA_LIB) -lcudart
+CUDAFORLINK := -lcudafor
+ifeq ($(FCNAME), gfortran)
+FCMODFLAGS  := -J$(INCDIR)
+FCPREFLAGS  += -cpp
+FCFLAGS     += -ffree-line-length-none
+else ifeq ($(FCNAME), ifort)
+FCMODFLAGS  := -module $(INCDIR)
+FCPREFLAGS  += -fpp
+endif
+endif
+
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+lib: $(CUDAFORDEP)
+	$(MAKE) $(LIBDIR)/$(LIBTARGET)
+
+$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
+	@printf "Linking   %-35s > %s\n" $(LIBTARGET) $@
+	mkdir -p $(LIBDIR)
+	$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
+	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
+	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
+
+$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
+	@printf "Linking   %-35s > %s\n" $(LIBCUDAFOR) $@
+	mkdir -p $(LIBDIR)
+	$(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
+
+$(OBJDIR)/%.o: src/%.f90
+	@printf "Building  %-35s > %s\n" $< $@
+	mkdir -p $(OBJDIR)
+	mkdir -p $(INCDIR)
+	$(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
+
+TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
+ifneq ($(filter pgf%, $(FCNAME)), )
+TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
+endif
+
+TESTDIR  := $(BUILDDIR)/test/fortran
+TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
+
+test: lib $(TESTBINS)
+
+$(TESTDIR)/%: test/%.f90 lib
+	@printf "Building  %-35s > %s\n" $< $@
+	@mkdir -p $(TESTDIR)
+	$(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
+
+clean:
+	rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
+	rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
+	rm -rf $(TESTDIR)/
+
diff --git a/fortran/src/cudafor.f90 b/fortran/src/cudafor.f90
new file mode 100644
index 0000000..8c6934d
--- /dev/null
+++ b/fortran/src/cudafor.f90
@@ -0,0 +1,164 @@
+#ifndef _CUDA
+
+!Start cudaFor module
+module cudaFor
+use iso_c_binding
+implicit none
+private
+public :: c_devptr
+public :: cudaMemcpyKind,           &
+          cudaMemcpyHostToHost,     &
+          cudaMemcpyHostToDevice,   &
+          cudaMemcpyDeviceToHost,   &
+          cudaMemcpyDeviceToDevice, &
+          cudaMemcpyDefault
+public :: cuda_stream_kind
+public :: cudaGetDeviceCount
+public :: cudaSetDevice
+public :: cudaMalloc
+public :: cudaMemcpy
+public :: cudaFree
+public :: cudaStreamCreate
+public :: cudaStreamSynchronize
+public :: cudaStreamDestroy
+
+!Start types
+
+!Start c_devptr
+type, bind(c) :: c_devptr
+type(c_ptr) :: member
+end type c_devptr
+!End c_devptr
+
+!Start cudaMemcpyKind
+type, bind(c) :: cudaMemcpyKind
+integer(c_int) :: member
+end type cudaMemcpyKind
+
+type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost     = cudaMemcpyKind(0), &
+                                   cudaMemcpyHostToDevice   = cudaMemcpyKind(1), &
+                                   cudaMemcpyDeviceToHost   = cudaMemcpyKind(2), &
+                                   cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
+                                   cudaMemcpyDefault        = cudaMemcpyKind(4)
+!End cudaMemcpyKind
+
+!Start cuda_stream_kind
+integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
+!End cuda_stream_kind
+
+!End types
+
+!Start interfaces
+
+!Start cudaGetDeviceCount
+interface cudaGetDeviceCount
+integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
+import :: c_int
+implicit none
+integer(c_int) :: count
+end function cudaGetDeviceCount
+end interface cudaGetDeviceCount
+!End cudaGetDeviceCount
+
+!Start cudaSetDevice
+interface cudaSetDevice
+integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
+import :: c_int
+implicit none
+integer(c_int), value :: device
+end function cudaSetDevice
+end interface cudaSetDevice
+!End cudaSetDevice
+
+!Start cudaMalloc
+interface cudaMalloc
+integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
+import :: c_int, c_size_t
+import :: c_devptr
+implicit none
+type(c_devptr) :: devPtr
+integer(c_size_t), value :: size
+end function cudaMalloc
+end interface cudaMalloc
+!End cudaMalloc
+
+!Start cudaMemcpy
+interface cudaMemcpy
+
+!Start cudaMemcpyH2D
+integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
+import :: c_ptr, c_int, c_size_t
+import :: c_devptr, cudaMemcpyKind
+implicit none
+type(c_devptr), value :: dst
+type(c_ptr), value :: src
+integer(c_size_t), value :: count
+type(cudaMemcpyKind), value :: kind
+end function cudaMemcpyH2D
+!End cudaMemcpyH2D
+
+!Start cudaMemcpyD2H
+integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
+import :: c_ptr, c_int, c_size_t
+import :: c_devptr, cudaMemcpyKind
+implicit none
+type(c_ptr), value :: dst
+type(c_devptr), value :: src
+integer(c_size_t), value :: count
+type(cudaMemcpyKind), value :: kind
+end function cudaMemcpyD2H
+!End cudaMemcpyD2H
+
+end interface cudaMemcpy
+!End cudaMemcpy
+
+!Start cudaFree
+interface cudaFree
+integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
+import :: c_int
+import :: c_devptr
+implicit none
+type(c_devptr), value :: devPtr
+end function cudaFree
+end interface cudaFree
+!End cudaFree
+
+!Start cudaStreamCreate
+interface cudaStreamCreate
+integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
+import :: c_int
+import :: cuda_stream_kind
+implicit none
+integer(cuda_stream_kind) :: pStream
+end function cudaStreamCreate
+end interface cudaStreamCreate
+!End cudaStreamCreate
+
+!Start cudaStreamSynchronize
+interface cudaStreamSynchronize
+integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
+import :: c_int
+import :: cuda_stream_kind
+implicit none
+integer(cuda_stream_kind), value :: stream
+end function cudaStreamSynchronize
+end interface cudaStreamSynchronize
+!End cudaStreamSynchronize
+
+!Start cudaStreamDestroy
+interface cudaStreamDestroy
+integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
+import :: c_int
+import :: cuda_stream_kind
+implicit none
+integer(cuda_stream_kind), value :: stream
+end function cudaStreamDestroy
+end interface cudaStreamDestroy
+!End cudaStreamDestroy
+
+!End interfaces
+
+end module cudaFor
+!End cudaFor module
+
+#endif
diff --git a/fortran/src/ncclfor.f90 b/fortran/src/ncclfor.f90
new file mode 100644
index 0000000..db98f3d
--- /dev/null
+++ b/fortran/src/ncclfor.f90
@@ -0,0 +1,305 @@
+!Start defines
+#define NCCL_UNIQUE_ID_BYTES 128
+!End defines
+
+!Start ncclFor module
+module ncclFor
+use iso_c_binding
+use cudaFor
+implicit none
+private
+public :: ncclUniqueId
+public :: ncclComm
+public :: ncclResult,                 &
+          ncclSuccess,                &
+          ncclUnhandledCudaError,     &
+          ncclSystemError,            &
+          ncclInternalError,          &
+          ncclInvalidDevicePointer,   &
+          ncclInvalidRank,            &
+          ncclUnsupportedDeviceCount, &
+          ncclDeviceNotFound,         &
+          ncclInvalidDeviceIndex,     &
+          ncclLibWrapperNotSet,       &
+          ncclCudaMallocFailed,       &
+          ncclRankMismatch,           &
+          ncclInvalidArgument,        &
+          ncclInvalidType,            &
+          ncclInvalidOperation,       &
+          nccl_NUM_RESULTS
+public :: ncclDataType, &
+          ncclChar,     &
+          ncclInt,      &
+#ifdef CUDA_HAS_HALF
+          ncclHalf,     &
+#endif
+          ncclFloat,    &
+          ncclDouble,   &
+          ncclInt64,    &
+          ncclUInt64,   &
+          nccl_NUM_TYPES
+public :: ncclRedOp, &
+          ncclSum,   &
+          ncclProd,  &
+          ncclMax,   &
+          ncclMin,   &
+          nccl_NUM_OPS
+public :: ncclGetUniqueId
+public :: ncclCommInitRank
+public :: ncclCommInitAll
+public :: ncclCommCuDevice
+public :: ncclCommUserRank
+public :: ncclCommCount
+public :: ncclCommDestroy
+public :: ncclReduce
+public :: ncclAllReduce
+public :: ncclReduceScatter
+public :: ncclBcast
+public :: ncclAllGather
+
+!Start types
+
+!Start ncclUniqueId
+type, bind(c) :: ncclUniqueId
+character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
+end type ncclUniqueId
+!End ncclUniqueId
+
+!Start ncclComm
+type, bind(c) :: ncclComm
+type(c_ptr) :: member
+end type ncclComm
+!End ncclComm
+
+!Start ncclResult
+type, bind(c) :: ncclResult
+integer(c_int) :: member
+end type ncclResult
+
+type(ncclResult), parameter :: ncclSuccess                = ncclResult( 0), &
+                               ncclUnhandledCudaError     = ncclResult( 1), &
+                               ncclSystemError            = ncclResult( 2), &
+                               ncclInternalError          = ncclResult( 3), &
+                               ncclInvalidDevicePointer   = ncclResult( 4), &
+                               ncclInvalidRank            = ncclResult( 5), &
+                               ncclUnsupportedDeviceCount = ncclResult( 6), &
+                               ncclDeviceNotFound         = ncclResult( 7), &
+                               ncclInvalidDeviceIndex     = ncclResult( 8), &
+                               ncclLibWrapperNotSet       = ncclResult( 9), &
+                               ncclCudaMallocFailed       = ncclResult(10), &
+                               ncclRankMismatch           = ncclResult(11), &
+                               ncclInvalidArgument        = ncclResult(12), &
+                               ncclInvalidType            = ncclResult(13), &
+                               ncclInvalidOperation       = ncclResult(14), &
+                               nccl_NUM_RESULTS           = ncclResult(15)
+!End ncclResult
+
+!Start ncclDataType
+type, bind(c) :: ncclDataType
+integer(c_int) :: member
+end type ncclDataType
+
+type(ncclDataType), parameter :: ncclChar       = ncclDataType(0), &
+                                 ncclInt        = ncclDataType(1), &
+#ifdef CUDA_HAS_HALF
+                                 ncclHalf       = ncclDataType(2), &
+#endif
+                                 ncclFloat      = ncclDataType(3), &
+                                 ncclDouble     = ncclDataType(4), &
+                                 ncclInt64      = ncclDataType(5), &
+                                 ncclUInt64     = ncclDataType(6), &
+                                 nccl_NUM_TYPES = ncclDataType(7)
+!End ncclDataType
+
+!Start ncclRedOp
+type, bind(c) :: ncclRedOp
+integer(c_int) :: member
+end type ncclRedOp
+
+type(ncclRedOp), parameter :: ncclSum      = ncclRedOp(0), &
+                              ncclProd     = ncclRedOp(1), &
+                              ncclMax      = ncclRedOp(2), &
+                              ncclMin      = ncclRedOp(3), &
+                              nccl_NUM_OPS = ncclRedOp(4)
+!End ncclRedOp
+
+!End types
+
+!Start interfaces
+
+!Start ncclGetUniqueId
+interface ncclGetUniqueId
+type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
+import :: ncclResult, ncclUniqueId
+implicit none
+type(ncclUniqueId) :: uniqueId
+end function ncclGetUniqueId
+end interface ncclGetUniqueId
+!End ncclGetUniqueId
+
+!Start ncclCommInitRank
+interface ncclCommInitRank
+type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
+import :: c_int
+import :: ncclResult, ncclUniqueId, ncclComm
+implicit none
+type(ncclComm) :: comm(*)
+integer(c_int), value :: ndev
+type(ncclUniqueId), value :: commId
+integer(c_int), value :: rank
+end function ncclCommInitRank
+end interface ncclCommInitRank
+!End ncclCommInitRank
+
+!Start ncclCommInitAll
+interface ncclCommInitAll
+type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm) :: comm(*)
+integer(c_int), value :: ndev
+integer(c_int) :: devlist(*)
+end function ncclCommInitAll
+end interface ncclCommInitAll
+!End ncclCommInitAll
+
+!Start ncclCommCuDevice
+interface ncclCommCuDevice
+type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm), value :: comm
+integer(c_int) :: devid
+end function ncclCommCuDevice
+end interface ncclCommCuDevice
+!End ncclCommCuDevice
+
+!Start ncclCommUserRank
+interface ncclCommUserRank
+type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm), value :: comm
+integer(c_int) :: rank
+end function ncclCommUserRank
+end interface ncclCommUserRank
+!End ncclCommUserRank
+
+!Start ncclCommCount
+interface ncclCommCount
+type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
+import :: c_int
+import :: ncclResult, ncclComm
+implicit none
+type(ncclComm), value :: comm
+integer(c_int) :: count
+end function ncclCommCount
+end interface ncclCommCount
+!End ncclCommCount
+
+!Start ncclCommDestroy
+interface ncclCommDestroy
+subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
+import :: ncclComm
+implicit none
+type(ncclComm), value :: comm
+end subroutine ncclCommDestroy
+end interface ncclCommDestroy
+!End ncclCommDestroy
+
+!Start ncclReduce
+interface ncclReduce
+type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
+implicit none
+type(c_devptr), value :: sendbuff
+type(c_devptr), value :: recvbuff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+type(ncclRedOp), value :: op
+integer(c_int), value :: root
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclReduce
+end interface ncclReduce
+!End ncclReduce
+
+!Start ncclAllReduce
+interface ncclAllReduce
+type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
+implicit none
+type(c_devptr), value :: sendbuff
+type(c_devptr), value :: recvbuff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+type(ncclRedOp), value :: op
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclAllReduce
+end interface ncclAllReduce
+!End ncclAllReduce
+
+!Start ncclReduceScatter
+interface ncclReduceScatter
+type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
+implicit none
+type(c_devptr), value :: sendbuff
+type(c_devptr), value :: recvbuff
+integer(c_int), value :: recvcount
+type(ncclDataType), value :: datatype
+type(ncclRedOp), value :: op
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclReduceScatter
+end interface ncclReduceScatter
+!End ncclReduceScatter
+
+!Start ncclBcast
+interface ncclBcast
+type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType
+implicit none
+type(c_devptr), value :: buff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+integer(c_int), value :: root
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclBcast
+end interface ncclBcast
+!End ncclBcast
+
+!Start ncclAllGather
+interface ncclAllGather
+type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
+import :: c_int
+import :: c_devptr, cuda_stream_kind
+import :: ncclResult, ncclComm, ncclDataType
+implicit none
+type(c_devptr), value :: sendbuff
+integer(c_int), value :: count
+type(ncclDataType), value :: datatype
+type(c_devptr), value :: recvbuff
+type(ncclComm), value :: comm
+integer(cuda_stream_kind), value :: stream
+end function ncclAllGather
+end interface ncclAllGather
+!End ncclAllGather
+
+!End interfaces
+
+end module ncclFor
+!End nccl module
diff --git a/fortran/test/allgather_arr_out.f90 b/fortran/test/allgather_arr_out.f90
new file mode 100644
index 0000000..61fd35b
--- /dev/null
+++ b/fortran/test/allgather_arr_out.f90
@@ -0,0 +1,155 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+  stat = cudaGetDeviceCount(nDev)
+
+  dataType = ncclFloat
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl * nDev, nDev + 1))
+
+  call random_number(hostBuff)
+
+  print "(a)", "before allgather:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+  end do
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(sendBuff(nEl))
+    sendBuffPtr(i) = c_devloc(sendBuff)
+    sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(recvBuff(nEl * nDev))
+    recvBuffPtr(i) = c_devloc(recvBuff)
+    recvBuff = hostBuff(:, i)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
+    hostBuff(:, i) = recvBuff
+  end do
+
+  print "(a)", ""
+  print "(a)", "after allgather:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+    hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
+  end do
+
+  err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
+  print "(a)", ""
+  print "(a, e11.4e2)", "maximum error in sendbuff = ", err
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
+    deallocate(recvBuff)
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+    deallocate(sendBuff)
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/allgather_ptr_out.f90 b/fortran/test/allgather_ptr_out.f90
new file mode 100644
index 0000000..b407664
--- /dev/null
+++ b/fortran/test/allgather_ptr_out.f90
@@ -0,0 +1,164 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+  stat = cudaGetDeviceCount(nDev)
+
+  dataType = ncclFloat
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl * nDev, nDev + 1))
+
+  call random_number(hostBuff)
+
+  print "(a)", "before allgather:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+  end do
+
+  allocate(hostBuffPtr(nDev))
+
+  do i = 1, nDev
+    hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
+  end do
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+  end do
+
+  do i = 1, nDev
+    hostBuffPtr(i) = c_loc(hostBuff(1, i))
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
+    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
+  end do
+
+  print "(a)", ""
+  print "(a)", "after allgather:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
+  end do
+
+  do i = 1, nDev
+    hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+  end do
+
+  err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
+  print "(a)", ""
+  print "(a, e11.4e2)", "maximum error in sendbuff = ", err
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(recvBuffPtr(i))
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(sendBuffPtr(i))
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/allreduce_arr_out.f90 b/fortran/test/allreduce_arr_out.f90
new file mode 100644
index 0000000..4804ceb
--- /dev/null
+++ b/fortran/test/allreduce_arr_out.f90
@@ -0,0 +1,158 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+  stat = cudaGetDeviceCount(nDev)
+
+  dataType = ncclFloat
+  redOp = ncclProd
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl, nDev + 2))
+
+  call random_number(hostBuff(:, 1:nDev + 1))
+
+  hostBuff(:, nDev + 2) = hostBuff(:, 1)
+  do i = 2, nDev
+    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+  end do
+
+  print "(a)", "before allreduce:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(sendBuff(nEl))
+    sendBuffPtr(i) = c_devloc(sendBuff)
+    sendBuff = hostBuff(:, i)
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(recvBuff(nEl))
+    recvBuffPtr(i) = c_devloc(recvBuff)
+    recvBuff = hostBuff(:, i)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  print "(a)", ""
+  print "(a)", "after allreduce:"
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+    hostBuff(:, nDev + 1) = recvBuff
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  print "(a)", ""
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+    hostBuff(:, nDev + 1) = sendBuff
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+    deallocate(recvBuff)
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+    deallocate(sendBuff)
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/allreduce_ptr_out.f90 b/fortran/test/allreduce_ptr_out.f90
new file mode 100644
index 0000000..39b2f2b
--- /dev/null
+++ b/fortran/test/allreduce_ptr_out.f90
@@ -0,0 +1,159 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+  stat = cudaGetDeviceCount(nDev)
+
+  dataType = ncclFloat
+  redOp = ncclProd
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl, nDev + 2))
+
+  call random_number(hostBuff(:, 1:nDev + 1))
+
+  hostBuff(:, nDev + 2) = hostBuff(:, 1)
+  do i = 2, nDev
+    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+  end do
+
+  print "(a)", "before allreduce:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  allocate(hostBuffPtr(nDev + 1))
+
+  do i = 1, nDev + 1
+    hostBuffPtr(i) = c_loc(hostBuff(1, i))
+  end do
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  print "(a)", ""
+  print "(a)", "after allreduce:"
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  print "(a)", ""
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(recvBuffPtr(i))
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(sendBuffPtr(i))
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/broadcast_arr.f90 b/fortran/test/broadcast_arr.f90
new file mode 100644
index 0000000..137c679
--- /dev/null
+++ b/fortran/test/broadcast_arr.f90
@@ -0,0 +1,130 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: devBuff(:)
+type(c_devptr), allocatable :: devBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+!  root = 0
+  stat = cudaGetDeviceCount(nDev)
+  root = nDev - 1
+
+  dataType = ncclFloat
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl, nDev + 1))
+
+  call random_number(hostBuff(:, 1:nDev))
+
+  hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
+
+  print "(a)", "before broadcast:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+  end do
+
+  allocate(devBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(devBuff(nEl))
+    devBuffPtr(i) = c_devloc(devBuff)
+    devBuff = hostBuff(:, i)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
+    hostBuff(:, i) = devBuff
+  end do
+
+  print "(a)", ""
+  print "(a)", "after broadcast:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
+    deallocate(devBuff)
+  end do
+
+  deallocate(devBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/broadcast_ptr.f90 b/fortran/test/broadcast_ptr.f90
new file mode 100644
index 0000000..0918519
--- /dev/null
+++ b/fortran/test/broadcast_ptr.f90
@@ -0,0 +1,135 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: devBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+!  root = 0
+  stat = cudaGetDeviceCount(nDev)
+  root = nDev - 1
+
+  dataType = ncclFloat
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl, nDev + 1))
+
+  call random_number(hostBuff(:, 1:nDev))
+
+  hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
+
+  print "(a)", "before broadcast:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+  end do
+
+  allocate(hostBuffPtr(nDev))
+
+  do i = 1, nDev
+    hostBuffPtr(i) = c_loc(hostBuff(1, i))
+  end do
+
+  allocate(devBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+    stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+  end do
+
+  print "(a)", ""
+  print "(a)", "after broadcast:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
+    print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(devBuffPtr(i))
+  end do
+
+  deallocate(devBuffPtr)
+
+  deallocate(hostBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/reduce_arr_out.f90 b/fortran/test/reduce_arr_out.f90
new file mode 100644
index 0000000..a332cf5
--- /dev/null
+++ b/fortran/test/reduce_arr_out.f90
@@ -0,0 +1,168 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+!  root = 0
+  stat = cudaGetDeviceCount(nDev)
+  root = nDev - 1
+
+  dataType = ncclFloat
+  redOp = ncclProd
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl, nDev + 2))
+
+  call random_number(hostBuff(:, 1:nDev + 1))
+
+  hostBuff(:, nDev + 2) = hostBuff(:, 1)
+  do i = 2, nDev
+    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+  end do
+
+  print "(a)", "before reduce:"
+  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(sendBuff(nEl))
+    sendBuffPtr(i) = c_devloc(sendBuff)
+    sendBuff = hostBuff(:, i)
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(recvBuff(nEl))
+    recvBuffPtr(i) = c_devloc(recvBuff)
+    recvBuff = hostBuff(:, i)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  stat = cudaSetDevice(devList(root + 1))
+  call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
+  hostBuff(:, nDev + 1) = recvBuff
+
+  print "(a)", ""
+  print "(a)", "after reduce:"
+  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+  print "(a)", ""
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+    hostBuff(:, nDev + 1) = sendBuff
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+  end do
+
+  print "(a)", ""
+  do i = 1, nDev
+    if (i - 1 /= root) then
+      stat = cudaSetDevice(devList(i))
+      call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+      hostBuff(:, nDev + 1) = recvBuff
+      err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+      print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
+    end if
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+    deallocate(recvBuff)
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
+    deallocate(sendBuff)
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/reduce_ptr_out.f90 b/fortran/test/reduce_ptr_out.f90
new file mode 100644
index 0000000..46f3e12
--- /dev/null
+++ b/fortran/test/reduce_ptr_out.f90
@@ -0,0 +1,168 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev, root
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+!  root = 0
+  stat = cudaGetDeviceCount(nDev)
+  root = nDev - 1
+
+  dataType = ncclFloat
+  redOp = ncclProd
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl, nDev + 2))
+
+  call random_number(hostBuff(:, 1:nDev + 1))
+
+  hostBuff(:, nDev + 2) = hostBuff(:, 1)
+  do i = 2, nDev
+    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+  end do
+
+  print "(a)", "before reduce:"
+  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+  allocate(hostBuffPtr(nDev + 1))
+
+  do i = 1, nDev + 1
+    hostBuffPtr(i) = c_loc(hostBuff(1, i))
+  end do
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  stat = cudaSetDevice(devList(root + 1))
+  stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+
+  print "(a)", ""
+  print "(a)", "after reduce:"
+  err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
+  print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
+
+  print "(a)", ""
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+  end do
+
+  print "(a)", ""
+  do i = 1, nDev
+    if (i - 1 /= root) then
+      stat = cudaSetDevice(devList(i))
+      stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+      err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+      print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff of rank ", i - 1," = ", err
+    end if
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(recvBuffPtr(i))
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(sendBuffPtr(i))
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/reducescatter_arr_out.f90 b/fortran/test/reducescatter_arr_out.f90
new file mode 100644
index 0000000..cd14157
--- /dev/null
+++ b/fortran/test/reducescatter_arr_out.f90
@@ -0,0 +1,158 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable :: hostBuff(:, :)
+real(real32), allocatable, device :: sendBuff(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+real(real32), allocatable, device :: recvBuff(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+  stat = cudaGetDeviceCount(nDev)
+
+  dataType = ncclFloat
+  redOp = ncclProd
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl * nDev, nDev + 2))
+
+  call random_number(hostBuff(:, 1:nDev + 1))
+
+  hostBuff(:, nDev + 2) = hostBuff(:, 1)
+  do i = 2, nDev
+    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+  end do
+
+  print "(a)", "before reducescatter:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(sendBuff(nEl * nDev))
+    sendBuffPtr(i) = c_devloc(sendBuff)
+    sendBuff = hostBuff(:, i)
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    allocate(recvBuff(nEl))
+    recvBuffPtr(i) = c_devloc(recvBuff)
+    recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  print "(a)", ""
+  print "(a)", "after reducescatter:"
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+    hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
+    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  print "(a)", ""
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
+    hostBuff(:, nDev + 1) = sendBuff
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
+    deallocate(recvBuff)
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
+    deallocate(sendBuff)
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
diff --git a/fortran/test/reducescatter_ptr_out.f90 b/fortran/test/reducescatter_ptr_out.f90
new file mode 100644
index 0000000..e35e85d
--- /dev/null
+++ b/fortran/test/reducescatter_ptr_out.f90
@@ -0,0 +1,167 @@
+program test
+use iso_c_binding
+use iso_fortran_env
+use cudaFor
+use ncclFor
+implicit none
+integer(int32) :: stat, i
+real(real32) :: err
+integer(int32) :: nEl, nDev
+type(ncclDataType) :: dataType
+type(ncclRedOp) :: redOp
+type(ncclComm), allocatable :: comm(:)
+integer(int32), allocatable :: devList(:)
+type(ncclResult) :: res
+integer(int32) :: cudaDev, rank
+integer(cuda_stream_kind), allocatable :: stream(:)
+integer(int32) :: time(8)
+integer(int32), allocatable :: seed(:)
+real(real32), allocatable, target :: hostBuff(:, :)
+type(c_ptr), allocatable :: hostBuffPtr(:)
+type(c_devptr), allocatable :: sendBuffPtr(:)
+type(c_devptr), allocatable :: recvBuffPtr(:)
+
+  nEl = 2621440
+
+!  nDev = 2
+  stat = cudaGetDeviceCount(nDev)
+
+  dataType = ncclFloat
+  redOp = ncclProd
+
+  allocate(comm(nDev))
+  allocate(devList(nDev))
+
+  do i = 1, nDev
+    devList(i) = i - 1
+  end do
+
+  res = ncclCommInitAll(comm, nDev, devList)
+
+  do i = 1, nDev
+    res = ncclCommCuDevice(comm(i), cudaDev)
+    res = ncclCommUserRank(comm(i), rank)
+  end do
+
+  allocate(stream(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamCreate(stream(i))
+  end do
+
+  call date_and_time(values = time)
+  call random_seed(size = i)
+  allocate(seed(i))
+  call random_seed(get = seed)
+  seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
+  call random_seed(put = seed)
+
+  allocate(hostBuff(nEl * nDev, nDev + 2))
+
+  call random_number(hostBuff(:, 1:nDev + 1))
+
+  hostBuff(:, nDev + 2) = hostBuff(:, 1)
+  do i = 2, nDev
+    hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
+  end do
+
+  print "(a)", "before reducescatter:"
+  do i = 1, nDev
+    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  allocate(hostBuffPtr(nDev + 1))
+
+  do i = 1, nDev + 1
+    hostBuffPtr(i) = c_loc(hostBuff(1, i))
+  end do
+
+  allocate(sendBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
+    stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
+  end do
+
+  do i = 1, nDev
+    hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
+  end do
+
+  allocate(recvBuffPtr(nDev))
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
+    stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
+  end do
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamSynchronize(stream(i))
+  end do
+
+  print "(a)", ""
+  print "(a)", "after reduceScatter:"
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
+    err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
+  end do
+
+  do i = 1, nDev + 1
+    hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
+  end do
+
+  print "(a)", ""
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
+    err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
+    print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
+  end do
+  print "(a)", ""
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(recvBuffPtr(i))
+  end do
+
+  deallocate(recvBuffPtr)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaFree(sendBuffPtr(i))
+  end do
+
+  deallocate(sendBuffPtr)
+
+  deallocate(hostBuffPtr)
+
+  deallocate(hostBuff)
+
+  deallocate(seed)
+
+  do i = 1, nDev
+    stat = cudaSetDevice(devList(i))
+    stat = cudaStreamDestroy(stream(i))
+  end do
+
+  deallocate(stream)
+
+  do i = 1, nDev
+    call ncclCommDestroy(comm(i))
+  end do
+
+  deallocate(devList)
+  deallocate(comm)
+
+end program test
author	Kyle Fernandes, ne Jacobs <kj333@cam.ac.uk>	2016-11-18 02:27:20 +0300
committer	Sylvain Jeaugey <sjeaugey@nvidia.com>	2016-11-18 02:33:34 +0300
commit	5f2b32e45b7d31ec942de27369a1308a0afe8fb0 (patch)
tree	761f62f22038833b75934afd7302f337a03880c8
parent	534b9a169778040dbe7562854893b74bb9387e46 (diff)