Address review comments

author: Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2018-12-08 00:37:05 +0300
committer: Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2018-12-08 00:37:05 +0300
commit: 8b4b3dae1e4b1d1322c2ec677e051c85b4d788d7 (patch)
tree: 60f702ae578d4138cfb06b1a8512a85ddb67a677
parent: ed25aa02fca90a1a748ac806064a5630c653183e (diff)
4 files changed, 31 insertions, 7 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 498247e9..943e3e36 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,9 @@ message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")
 execute_process(COMMAND git submodule update --init --recursive --no-fetch
                 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 
+# Detect support CPU instrinsics for the current platform. This will 
+# only by used with BUILD_ARCH=native. For overridden BUILD_ARCH we 
+# minimally use -msse4.1. This seems to work with MKL.
 set(INTRINSICS "")
 if(BUILD_ARCH STREQUAL "native")
   message(STATUS "Checking support for CPU intrinsics")
@@ -75,8 +78,7 @@ else()
 
   # These are used in src/CMakeLists.txt on a per-target basis
   list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-                           -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;
-                           -Wno-unused-but-set-variable)
+                           -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
 
   # This warning does not exist prior to gcc 5.0
   if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
@@ -144,13 +146,16 @@ endif(USE_STATIC_LIBS)
     list(APPEND CUDA_NVCC_FLAGS -DBOOST_PP_VARIADICS=0; )
   endif()
 
+  # We compile NCCL ourselves, using the NVidia Makefile rather than CMake, this requires to pass a couple of parameters from
+  # Cmake. This is also fairly untested, let's hope it does not explode. 
+  # @TODO: Make sure it does not use pre-installed NCCL headers
   if(USE_NCCL)
     # define and set the include dir for the generated nccl.h header
     set(NCCL_HEADER_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/nccl/include")
     include_directories(${NCCL_HEADER_LOCATION})
 
     # set the path for the generated static lib
-    set(NCCL_STATIC "${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a")
+    set(NCCL_LIB_STATIC "${CMAKE_CURRENT_BINARY_DIR}/nccl/lib/libnccl_static.a")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NCCL")
 
     LIST(APPEND CUDA_NVCC_FLAGS -DUSE_NCCL; )
@@ -158,7 +163,9 @@ endif(USE_STATIC_LIBS)
     # disables compilation for sm_30 to avoid ptxas warning... that's general Kepler support. But K80s are supported for instance by sm_35
     set(GENCODE "-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
 
-    # sets output to build, passes cuda location from FindCUDA, sets c++ compiler to the same one CMake uses.
+    # We build using NVidia's custom makefile, for that we pass a number of variables from CMake.
+    # Sets output to the chosen build folder, i.e. where the binaries and objects are generated.
+    # Also passes CUDA location from FindCUDA, sets c++ compiler to the same one CMake uses.
     add_custom_command(OUTPUT ${NCCL_STATIC}
                        COMMAND make src.build
                        BUILDDIR=${CMAKE_CURRENT_BINARY_DIR}/nccl
@@ -166,9 +173,9 @@ endif(USE_STATIC_LIBS)
                        CUDA8_GENCODE=${GENCODE}
                        CXX=${CMAKE_CXX_COMPILER}
                        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party/nccl)
-    add_custom_target(nccl_target DEPENDS ${NCCL_STATIC})
+    add_custom_target(nccl_target DEPENDS ${NCCL_LIB_STATIC})
     add_library(nccl STATIC IMPORTED)
-    set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_STATIC})
+    set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_LIB_STATIC})
     add_dependencies(nccl nccl_target)
     set(EXT_LIBS ${EXT_LIBS} nccl)
 
diff --git a/src/3rd_party/cnpy/cnpy.cpp b/src/3rd_party/cnpy/cnpy.cpp
index be5e2bfd..9ad3c2fa 100755
--- a/src/3rd_party/cnpy/cnpy.cpp
+++ b/src/3rd_party/cnpy/cnpy.cpp
@@ -103,6 +103,9 @@ void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& sh
     word_size = atoi(str_ws.substr(0,loc2).c_str());
 }
 
+// make compiler happy, otherwise warns with "variable set but not used"
+#define _unused(x) ((void)(x))
+
 void cnpy::parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& global_header_size, unsigned int& global_header_offset)
 {
     std::vector<char> footer(22);
@@ -124,6 +127,13 @@ void cnpy::parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& globa
     assert(disk_start == 0);
     assert(nrecs_on_disk == nrecs);
     assert(comment_len == 0);
+
+    // make compiler happy, otherwise warns with "variable set but not used"
+    // on the other hand it seems having the asserts in here is useful. 
+    _unused(disk_no);
+    _unused(disk_start);
+    _unused(nrecs_on_disk);
+    _unused(comment_len);
 }
 
 cnpy::NpyArrayPtr load_the_npy_file(FILE* fp) {
diff --git a/src/tensors/cpu/device.cpp b/src/tensors/cpu/device.cpp
index c4710f5c..b0fdae92 100644
--- a/src/tensors/cpu/device.cpp
+++ b/src/tensors/cpu/device.cpp
@@ -15,6 +15,12 @@ Device::~Device() {
   size_ = 0;
 }
 
+// allocate function for tensor reserve() below. 
+// Needed for AVX512, while not available on all compilers. It seems clang
+// does not have aligned_alloc for all cstlib versions. If AVX512 is not used
+// a simple malloc is probably fine. 
+// Should generate a runtime error otherwise as we have a check in the AVX512 
+// functions which tests for alignment. 
 #ifdef _WIN32
 #define MALLOC(size) _aligned_alloc(size, alignment_)
 #elif __GNUC__
diff --git a/src/training/communicator_nccl.h b/src/training/communicator_nccl.h
index 16c49bdb..bf1f07df 100755
--- a/src/training/communicator_nccl.h
+++ b/src/training/communicator_nccl.h
@@ -4,7 +4,8 @@
 #include "3rd_party/threadpool.h"
 #include "tensors/gpu/cuda_helpers.h"
 
-// Generated by NCCL make files in build/nccl/include
+// Generated by NCCL make files in build/nccl/include;
+// include dir has been set in CMake files. NCCL add version number etc.
 #include "nccl.h"
 
 #include <cuda_runtime.h>
author	Marcin Junczys-Dowmunt <marcinjd@microsoft.com>	2018-12-08 00:37:05 +0300
committer	Marcin Junczys-Dowmunt <marcinjd@microsoft.com>	2018-12-08 00:37:05 +0300
commit	8b4b3dae1e4b1d1322c2ec677e051c85b4d788d7 (patch)
tree	60f702ae578d4138cfb06b1a8512a85ddb67a677
parent	ed25aa02fca90a1a748ac806064a5630c653183e (diff)