diff options
author | Marcin Junczys-Dowmunt <marcinjd@microsoft.com> | 2022-02-08 13:57:20 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-02-08 13:57:20 +0300 |
commit | 05ba9e4c319db2317319227f5706f634340e0db4 (patch) | |
tree | 9266afe6d10ec1e75ab52d30e496a624d454237a | |
parent | a365bb5ce99135eab29ffe378e0c6c9fb9bf0c1b (diff) |
add -DDETERMINISTIC=ON/OFF flag (#912)
* Add -DDETERMINISTIC=ON/OFF flag to CMake
* Use -DDETERMINISTIC=on in GitHub/Azure workflows
Co-authored-by: Roman Grundkiewicz <rgrundkiewicz@gmail.com>
-rw-r--r-- | .github/workflows/ubuntu.yml | 1 |
-rw-r--r-- | .github/workflows/windows.yml | 2 |
-rw-r--r-- | CMakeLists.txt | 12 |
-rw-r--r-- | azure-pipelines.yml | 4 |
-rw-r--r-- | src/common/config_parser.cpp | 10 |
-rw-r--r-- | src/tensors/gpu/tensor_operators.cu | 2 |
6 files changed, 28 insertions, 3 deletions
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index a7f233ca..4a0fa674 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -98,6 +98,7 @@ jobs: -DCOMPILE_SERVER=on \ -DCOMPILE_TESTS=${{ matrix.unit_tests }} \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \ + -DDETERMINISTIC=on \ -DUSE_FBGEMM=${{ matrix.cpu }} \ -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=on \ diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index dd10c733..ee85f303 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -81,6 +81,7 @@ jobs: -DCOMPILE_CUDA="${{ matrix.gpu }}" -DCOMPILE_SERVER="FALSE" -DCOMPILE_TESTS="TRUE" + -DDETERMINISTIC="TRUE" -DUSE_FBGEMM="TRUE" -DUSE_MPI="FALSE" -DUSE_NCCL="FALSE" @@ -110,6 +111,7 @@ jobs: -DCOMPILE_CUDA="${{ matrix.gpu }}" -DCOMPILE_SERVER="FALSE" -DCOMPILE_TESTS="TRUE" + -DDETERMINISTIC="TRUE" -DUSE_FBGEMM="TRUE" -DUSE_MPI="FALSE" -DUSE_NCCL="FALSE" diff --git a/CMakeLists.txt b/CMakeLists.txt index eb6ca97b..7c41b365 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(USE_NCCL "Use NCCL library" ON) option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF) +option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF) # fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them, # so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. 
Prior to CMake 3.12, @@ -571,6 +572,15 @@ if(USE_STATIC_LIBS) set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) endif() +if(DETERMINISTIC) + message(WARNING "Option DETERMINISTIC=ON: Trying to make training as deterministic as possible, may result in slow-down") + add_definitions(-DDETERMINISTIC=1) + list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=1; ) +else() + add_definitions(-DDETERMINISTIC=0) + list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=0; ) +endif() + # Find MPI if(USE_MPI) # 2.0 refers to MPI2 standard. OpenMPI is an implementation of that standard regardless of the specific OpenMPI version @@ -580,7 +590,7 @@ if(USE_MPI) include_directories(${MPI_INCLUDE_PATH}) set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES}) if(USE_STATIC_LIBS) # alternatively this could install OpenMPI like NCCL and link against that statically with greater control - message(WARNING "MPI implementations are notoriously difficult to link statically, linking ${MPI_LIBRARIES} dynamically despite -DUSE_STATIC_LIBS=on") + message(WARNING "MPI implementations are notoriously difficult to link statically, linking ${MPI_LIBRARIES} dynamically despite -DUSE_STATIC_LIBS=on") endif(USE_STATIC_LIBS) add_definitions(-DMPI_FOUND=1) endif(MPI_FOUND) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bc76f85c..0348ebb4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -470,7 +470,7 @@ stages: # Marian is built in the same job where the regression tests are run to make sure that executables -# is compiled and run on a machine with the same CPU architecture, which is required for +# are compiled and run on a machine with the same CPU architecture, which is required for # compilations with FBGEMM. 
- stage: Tests jobs: @@ -530,6 +530,7 @@ stages: -DCMAKE_MAKE_PROGRAM="ninja.exe" ^ -DCMAKE_TOOLCHAIN_FILE="$(VCPKG_DIR)\scripts\buildsystems\vcpkg.cmake" ^ -DVCPKG_TARGET_TRIPLET="x64-windows-static" ^ + -DDETERMINISTIC="TRUE" ^ ^ -DCOMPILE_CPU="TRUE" ^ -DCOMPILE_CUDA="FALSE" ^ @@ -634,6 +635,7 @@ stages: -DCMAKE_BUILD_TYPE=slim \ -DCOMPILE_CPU=on \ -DCOMPILE_CUDA=off \ + -DDETERMINISTIC=on \ -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=on diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index ebbe4a89..837bee53 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -897,8 +897,13 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { cli.add<bool>("--shuffle-in-ram", "Keep shuffled corpus in RAM, do not write to temp file"); +#if DETERMINISTIC cli.add<size_t>("--data-threads", "Number of concurrent threads to use during data reading and processing", 1); +#else + cli.add<size_t>("--data-threads", + "Number of concurrent threads to use during data reading and processing", 8); +#endif // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope. cli.add<size_t>("--all-caps-every", @@ -919,8 +924,13 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. 
Disable with --mini-batch-round-up=false", true); } else { +#if DETERMINISTIC cli.add<size_t>("--data-threads", "Number of concurrent threads to use during data reading and processing", 1); +#else + cli.add<size_t>("--data-threads", + "Number of concurrent threads to use during data reading and processing", 8); +#endif } // clang-format on } diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 2103ca9d..9011f284 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1163,7 +1163,7 @@ void PasteRows(Tensor out, size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); -#if 0 // @TODO: make this configurable with a 'deterministic' flag +#if DETERMINISTIC // If we only use one block, then each core operates on a different column, // hence the summation becomes deterministic. // However, we only use e.g. 512 cores out of possibly 3000+, so this will be |