github.com/marian-nmt/nccl.git
author     Boris Fomitchev <bfomitchev@nvidia.com>  2016-02-29 23:10:34 +0300
committer  Boris Fomitchev <bfomitchev@nvidia.com>  2016-02-29 23:10:34 +0300
commit     941d9da08cab04b5ffd7bb050dc1107b293f5d1d (patch)
tree       4a5e88b7cce29008cd3174a282f6f3a99b039c85
parent     5554a4c9f0d81a39f1a737c92ade1fe88de6760e (diff)
Updated package version, added manpage
-rw-r--r--  Makefile                      6
-rw-r--r--  debian/changelog              7
-rw-r--r--  debian/control                3
-rw-r--r--  debian/libnccl-dev.manpages   1
-rw-r--r--  debian/libnccl1.install       2
-rw-r--r--  debian/nccl.7               202
-rwxr-xr-x  debian/rules                  1
-rwxr-xr-x  scripts/build-package.sh      9
8 files changed, 224 insertions(+), 7 deletions(-)
diff --git a/Makefile b/Makefile
index 9afde64..c019d43 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ CUDACODE := -gencode=arch=compute_35,code=sm_35 \
BUILDDIR := build
NVCC := $(CUDA_HOME)/bin/nvcc
+
GPP := g++
CPPFLAGS := -I$(CUDA_HOME)/include
CXXFLAGS := -O3 -fPIC -fvisibility=hidden
@@ -58,8 +59,8 @@ INCEXPORTS := nccl.h
LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
LIBNAME := libnccl.so
VER_MAJOR := 1
-VER_MINOR := 0
-VER_PATCH := 2
+VER_MINOR := 1
+VER_PATCH := 0

TESTS := all_gather_test all_reduce_test broadcast_test reduce_test reduce_scatter_test
MPITESTS := mpi_test
@@ -136,4 +137,3 @@ install : lib
	@mkdir -p $(PREFIX)/include
	@cp -P -v build/lib/* $(PREFIX)/lib/
	@cp -v build/include/* $(PREFIX)/include/
-
diff --git a/debian/changelog b/debian/changelog
index 50dcaa4..8061ddd 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+nccl (1.1.0) trusty; urgency=medium
+
+  * Picked up the latest changes.
+  * Added a rule for manpage creation.
+
+ -- Boris Fomitchev <bfomitchev@nvidia.com> Tue, 16 Feb 2016 18:07:01 -0800
+
nccl (1.0.2-2) trusty; urgency=medium

  * Packages are now libnccl1 and libnccl-dev
diff --git a/debian/control b/debian/control
index 340c213..eeb436f 100644
--- a/debian/control
+++ b/debian/control
@@ -2,7 +2,7 @@ Source: nccl
Section: universe/libs
Maintainer: Boris Fomitchev <bfomitchev@nvidia.com>
Priority: optional
-Build-depends: debhelper(>=9), cuda-toolkit-7-0,
+Build-depends: debhelper(>=9), go-md2man, cuda-toolkit-7-0,
cuda-curand-dev-7-0, cuda-cublas-dev-7-0, cuda-cudart-dev-7-0, cuda-ld-conf-7-0

Standards-Version: 3.9.5
@@ -28,4 +28,3 @@ Description: NVIDIA Communication Collectives Library (NCCL) Development Files
been optimized to achieve high bandwidth over PCIe. NCCL supports up to eight
GPUs and can be used in either single- or multi-process (e.g., MPI)
applications.
-
diff --git a/debian/libnccl-dev.manpages b/debian/libnccl-dev.manpages
new file mode 100644
index 0000000..4bfc2cb
--- /dev/null
+++ b/debian/libnccl-dev.manpages
@@ -0,0 +1 @@
+debian/nccl.7
diff --git a/debian/libnccl1.install b/debian/libnccl1.install
index be7243e..13f19a3 100644
--- a/debian/libnccl1.install
+++ b/debian/libnccl1.install
@@ -1,2 +1,2 @@
lib/libnccl.so.1 usr/lib
-lib/libnccl.so.1.0.2 usr/lib
+lib/libnccl.so.1.1.0 usr/lib
diff --git a/debian/nccl.7 b/debian/nccl.7
new file mode 100644
index 0000000..da4d4a4
--- /dev/null
+++ b/debian/nccl.7
@@ -0,0 +1,202 @@
+.TH NCCL
+.PP
+Optimized primitives for collective multi\-GPU communication.
+
+.SH Introduction
+.PP
+NCCL (pronounced "Nickel") is a stand\-alone library of standard collective communication routines, such as all\-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single\- or multi\-process (e.g., MPI) applications.
+
+.SH What's inside
+.PP
+At present, the library implements the following collectives:
+\- all\-reduce
+\- all\-gather
+\- reduce\-scatter
+\- reduce
+\- broadcast
+
+.PP
+These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
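+
+.PP
+As a brief illustration of batching (a sketch only; the buffer name and the sizes n1, n2, n3, as well as the communicator and stream, are hypothetical): rather than issuing one call per small array, pack the arrays into a single device buffer and reduce them with one call.
+
+.RS
+
+.nf
+double* packed; // one device buffer holding all three small arrays
+cudaMalloc(\&packed, (n1+n2+n3)*sizeof(double));
+
+... // Copy each small array to its offset within packed.
+
+ncclAllReduce(packed, packed, n1+n2+n3,
+    ncclDouble, ncclSum, comm, stream); // One call instead of three.
+
+.fi
+.RE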
+
+.SH Requirements
+.PP
+NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi\-socket configurations are also supported.
+
+.PP
+Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
+
+.SH Build \& run
+.PP
+To build the library and tests:
+
+.PP
+.RS
+
+.nf
+$ cd nccl
+$ make CUDA\_HOME=<cuda install path> test
+
+.fi
+.RE
+
+.PP
+Test binaries are located in the subdirectories nccl/build/test and nccl/build/mpitest.
+
+.PP
+.RS
+
+.nf
+$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:./build/lib
+$ ./build/test/all\_reduce\_test
+Error: must specify at least data size in bytes!
+
+Tests nccl AllReduce with user supplied arguments.
+ Usage: all\_reduce\_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
+
+$ ./build/test/all\_reduce\_test 10000000
+# Using devices
+# Device 0 \-> 0 [0x0a] GeForce GTX TITAN X
+# Device 1 \-> 1 [0x09] GeForce GTX TITAN X
+# Device 2 \-> 2 [0x06] GeForce GTX TITAN X
+# Device 3 \-> 3 [0x05] GeForce GTX TITAN X
+
+#                                        out\-of\-place                in\-place
+#      bytes        N    type    op     time algbw busbw   res     time algbw busbw   res
+    10000000 10000000    char   sum    1.628  6.14  9.21 0e+00    1.932  5.18  7.77 0e+00
+    10000000 10000000    char  prod    1.629  6.14  9.21 0e+00    1.643  6.09  9.13 0e+00
+    10000000 10000000    char   max    1.621  6.17  9.25 0e+00    1.634  6.12  9.18 0e+00
+    10000000 10000000    char   min    1.633  6.12  9.19 0e+00    1.637  6.11  9.17 0e+00
+    10000000  2500000     int   sum    1.611  6.21  9.31 0e+00    1.626  6.15  9.23 0e+00
+    10000000  2500000     int  prod    1.613  6.20  9.30 0e+00    1.629  6.14  9.21 0e+00
+    10000000  2500000     int   max    1.619  6.18  9.26 0e+00    1.627  6.15  9.22 0e+00
+    10000000  2500000     int   min    1.619  6.18  9.27 0e+00    1.624  6.16  9.24 0e+00
+    10000000  5000000    half   sum    1.617  6.18  9.28 4e\-03    1.636  6.11  9.17 4e\-03
+    10000000  5000000    half  prod    1.618  6.18  9.27 1e\-03    1.657  6.03  9.05 1e\-03
+    10000000  5000000    half   max    1.608  6.22  9.33 0e+00    1.621  6.17  9.25 0e+00
+    10000000  5000000    half   min    1.610  6.21  9.32 0e+00    1.627  6.15  9.22 0e+00
+    10000000  2500000   float   sum    1.618  6.18  9.27 5e\-07    1.622  6.17  9.25 5e\-07
+    10000000  2500000   float  prod    1.614  6.20  9.29 1e\-07    1.628  6.14  9.21 1e\-07
+    10000000  2500000   float   max    1.616  6.19  9.28 0e+00    1.633  6.12  9.19 0e+00
+    10000000  2500000   float   min    1.613  6.20  9.30 0e+00    1.628  6.14  9.21 0e+00
+    10000000  1250000  double   sum    1.629  6.14  9.21 0e+00    1.628  6.14  9.21 0e+00
+    10000000  1250000  double  prod    1.619  6.18  9.26 2e\-16    1.628  6.14  9.21 2e\-16
+    10000000  1250000  double   max    1.613  6.20  9.30 0e+00    1.630  6.13  9.20 0e+00
+    10000000  1250000  double   min    1.622  6.16  9.25 0e+00    1.623  6.16  9.24 0e+00
+
+.fi
+.RE
+
+.PP
+To install, run \fB\fCmake PREFIX=<install dir> install\fR and add \fB\fC<install dir>/lib\fR to your \fB\fCLD\_LIBRARY\_PATH\fR.
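+
+.PP
+For example, assuming an install prefix of /usr/local (the path is illustrative; any writable directory works):
+
+.RS
+
+.nf
+$ make PREFIX=/usr/local install
+$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:/usr/local/lib
+
+.fi
+.RE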
+
+.SH Usage
+.PP
+NCCL follows the MPI collectives API fairly closely. Before any collective can be called, a communicator object must be initialized on each GPU. On a single\-process machine, all GPUs can be conveniently initialized using \fB\fCncclCommInitAll\fR. For multi\-process applications (e.g., with MPI), \fB\fCncclCommInitRank\fR must be called for each GPU. Internally, \fB\fCncclCommInitRank\fR performs a synchronization among all GPUs, so it must be invoked from a different host thread (or process) for each GPU. A brief single\-process example follows, and a multi\-process sketch appears after it; for the full MPI example see src/mpi\_test.cu. For details about the API, see nccl.h.
+
+.PP
+.RS
+
+.nf
+#include <nccl.h>
+
+typedef struct \{
+  double* sendBuff;
+  double* recvBuff;
+  int size;
+  cudaStream\_t stream;
+\} PerThreadData;
+
+int main(int argc, char* argv[])
+\{
+  int nGPUs;
+  cudaGetDeviceCount(\&nGPUs);
+  ncclComm\_t* comms = (ncclComm\_t*)malloc(sizeof(ncclComm\_t)*nGPUs);
+  ncclCommInitAll(comms, nGPUs); // Initialize communicators:
+                                 // one per GPU in this process.
+
+  PerThreadData* data;
+
+  ... // Allocate data and issue work to each GPU's
+      // stream to populate the sendBuffs.
+
+  for(int i=0; i<nGPUs; ++i) \{
+    cudaSetDevice(i); // Correct device must be set
+                      // prior to each collective call.
+    ncclAllReduce(data[i].sendBuff, data[i].recvBuff, data[i].size,
+        ncclDouble, ncclSum, comms[i], data[i].stream);
+  \}
+
+  ... // Issue work into data[*].stream to consume buffers, etc.
+\}
+
+.fi
+.RE
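+
+.PP
+For multi\-process use, each rank creates its communicator with \fB\fCncclCommInitRank\fR, passing a unique ID that must be shared out\-of\-band. The sketch below is illustrative only and assumes MPI is used to distribute the ID; the complete, tested example is src/mpi\_test.cu.
+
+.RS
+
+.nf
+#include <mpi.h>
+#include <nccl.h>
+
+int main(int argc, char* argv[])
+\{
+  int rank, nRanks;
+  MPI\_Init(\&argc, \&argv);
+  MPI\_Comm\_rank(MPI\_COMM\_WORLD, \&rank);
+  MPI\_Comm\_size(MPI\_COMM\_WORLD, \&nRanks);
+
+  ncclUniqueId id;
+  if (rank == 0) ncclGetUniqueId(\&id); // Rank 0 creates the ID...
+  MPI\_Bcast(\&id, sizeof(id), MPI\_BYTE, 0, MPI\_COMM\_WORLD); // ...every rank receives it.
+
+  cudaSetDevice(rank); // One GPU per rank, selected before init.
+  ncclComm\_t comm;
+  ncclCommInitRank(\&comm, nRanks, id, rank); // Synchronizes across ranks.
+
+  ... // Allocate buffers and launch collectives on comm, then:
+
+  ncclCommDestroy(comm);
+  MPI\_Finalize();
+\}
+
+.fi
+.RE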
+
+.SH Copyright and License
+.PP
+NCCL is provided under the license terms found in
+\[la]LICENSE.txt\[ra]. All source code and
+accompanying documentation are copyright (c) 2015\-2016, NVIDIA CORPORATION. All
+rights reserved.
diff --git a/debian/rules b/debian/rules
index 6467459..cd40502 100755
--- a/debian/rules
+++ b/debian/rules
@@ -5,4 +5,3 @@

override_dh_auto_install:
	PREFIX=debian/tmp dh_auto_install
-
diff --git a/scripts/build-package.sh b/scripts/build-package.sh
new file mode 100755
index 0000000..4d1996a
--- /dev/null
+++ b/scripts/build-package.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# The environment set up by the create-ubuntu-environments script is trusty, so we should match it.
+# Look at debian/gbp.conf for the default options passed to gbp.
+rm -r debian/tmp debian/*.log
+git commit -a -m 'Preparing for pbuild' && git push
+git clean -f
+
+gbp buildpackage --git-dist=trusty "$@"