add -DDETERMINISTIC=ON/OFF flag (#912)

* Add -DDETERMINISTIC=ON/OFF flag to CMake
* Use -DDETERMINISTIC=on in GitHub/Azure workflows

Co-authored-by: Roman Grundkiewicz <rgrundkiewicz@gmail.com>
author: Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2022-02-08 13:57:20 +0300
committer: GitHub <noreply@github.com> 2022-02-08 13:57:20 +0300
commit: 05ba9e4c319db2317319227f5706f634340e0db4 (patch)
tree: 9266afe6d10ec1e75ab52d30e496a624d454237a
parent: a365bb5ce99135eab29ffe378e0c6c9fb9bf0c1b (diff)
6 files changed, 28 insertions, 3 deletions
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
index a7f233ca..4a0fa674 100644
--- a/.github/workflows/ubuntu.yml
+++ b/.github/workflows/ubuntu.yml
@@ -98,6 +98,7 @@ jobs:
           -DCOMPILE_SERVER=on \
           -DCOMPILE_TESTS=${{ matrix.unit_tests }} \
           -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
+          -DDETERMINISTIC=on \
           -DUSE_FBGEMM=${{ matrix.cpu }} \
           -DUSE_SENTENCEPIECE=on \
           -DUSE_STATIC_LIBS=on \
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index dd10c733..ee85f303 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -81,6 +81,7 @@ jobs:
           -DCOMPILE_CUDA="${{ matrix.gpu }}"
           -DCOMPILE_SERVER="FALSE"
           -DCOMPILE_TESTS="TRUE"
+          -DDETERMINISTIC="TRUE"
           -DUSE_FBGEMM="TRUE"
           -DUSE_MPI="FALSE"
           -DUSE_NCCL="FALSE"
@@ -110,6 +111,7 @@ jobs:
           -DCOMPILE_CUDA="${{ matrix.gpu }}"
           -DCOMPILE_SERVER="FALSE"
           -DCOMPILE_TESTS="TRUE"
+          -DDETERMINISTIC="TRUE"
           -DUSE_FBGEMM="TRUE"
           -DUSE_MPI="FALSE"
           -DUSE_NCCL="FALSE"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb6ca97b..7c41b365 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,7 @@ option(USE_NCCL "Use NCCL library" ON)
 option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
 option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
 option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
+option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF)
 
 # fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them,
 # so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12,
@@ -571,6 +572,15 @@ if(USE_STATIC_LIBS)
   set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
 endif()
 
+if(DETERMINISTIC)
+  message(WARNING "Option DETERMINISTIC=ON: Trying to make training as deterministic as possible, may result in slow-down")
+  add_definitions(-DDETERMINISTIC=1)
+  list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=1; )
+else()
+  add_definitions(-DDETERMINISTIC=0)
+  list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=0; )
+endif()
+
 # Find MPI
 if(USE_MPI)
   # 2.0 refers to MPI2 standard. OpenMPI is an implementation of that standard regardless of the specific OpenMPI version
@@ -580,7 +590,7 @@ if(USE_MPI)
     include_directories(${MPI_INCLUDE_PATH})
     set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
     if(USE_STATIC_LIBS) # alternatively this could install OpenMPI like NCCL and link against that statically with greater control
-      message(WARNING "MPI implementations are notoriously difficult to link statically, linking ${MPI_LIBRARIES} dynamically despite -DUSE_STATIC_LIBS=on")
+    message(WARNING "MPI implementations are notoriously difficult to link statically, linking ${MPI_LIBRARIES} dynamically despite -DUSE_STATIC_LIBS=on")
     endif(USE_STATIC_LIBS)
     add_definitions(-DMPI_FOUND=1)
   endif(MPI_FOUND)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index bc76f85c..0348ebb4 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -470,7 +470,7 @@ stages:
 
 
 # Marian is built in the same job where the regression tests are run to make sure that executables
-# is compiled and run on a machine with the same CPU architecture, which is required for
+# are compiled and run on a machine with the same CPU architecture, which is required for
 # compilations with FBGEMM.
 - stage: Tests
   jobs:
@@ -530,6 +530,7 @@ stages:
           -DCMAKE_MAKE_PROGRAM="ninja.exe" ^
           -DCMAKE_TOOLCHAIN_FILE="$(VCPKG_DIR)\scripts\buildsystems\vcpkg.cmake" ^
           -DVCPKG_TARGET_TRIPLET="x64-windows-static" ^
+          -DDETERMINISTIC="TRUE" ^
           ^
           -DCOMPILE_CPU="TRUE" ^
           -DCOMPILE_CUDA="FALSE" ^
@@ -634,6 +635,7 @@ stages:
           -DCMAKE_BUILD_TYPE=slim \
           -DCOMPILE_CPU=on \
           -DCOMPILE_CUDA=off \
+          -DDETERMINISTIC=on \
           -DUSE_FBGEMM=on \
           -DUSE_SENTENCEPIECE=on \
           -DUSE_STATIC_LIBS=on
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index ebbe4a89..837bee53 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -897,8 +897,13 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
     cli.add<bool>("--shuffle-in-ram",
         "Keep shuffled corpus in RAM, do not write to temp file");
 
+#if DETERMINISTIC
     cli.add<size_t>("--data-threads",
         "Number of concurrent threads to use during data reading and processing", 1);
+#else
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 8);
+#endif
 
     // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
     cli.add<size_t>("--all-caps-every",
@@ -919,8 +924,13 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
         "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
         true);
   } else {
+#if DETERMINISTIC
     cli.add<size_t>("--data-threads",
         "Number of concurrent threads to use during data reading and processing", 1);
+#else
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 8);
+#endif
   }
   // clang-format on
 }
diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu
index 2103ca9d..9011f284 100644
--- a/src/tensors/gpu/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@@ -1163,7 +1163,7 @@ void PasteRows(Tensor out,
   size_t rowsToCopy = indices->size();
 
   int threads = std::min(MAX_THREADS, (int)cols);
-#if 0   // @TODO: make this configurable with a 'deterministic' flag
+#if DETERMINISTIC
   // If we only use one block, then each core operates on a different column,
   // hence the summation becomes deterministic.
   // However, we only use e.g. 512 cores out of possibly 3000+, so this will be
author	Marcin Junczys-Dowmunt <marcinjd@microsoft.com>	2022-02-08 13:57:20 +0300
committer	GitHub <noreply@github.com>	2022-02-08 13:57:20 +0300
commit	05ba9e4c319db2317319227f5706f634340e0db4 (patch)
tree	9266afe6d10ec1e75ab52d30e496a624d454237a
parent	a365bb5ce99135eab29ffe378e0c6c9fb9bf0c1b (diff)