diff options
author | Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-11-11 03:38:37 +0300 |
---|---|---|
committer | Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-11-11 03:38:37 +0300 |
commit | 9dad84ae9b261c621aed60d19d520c7f1aed381c (patch) | |
tree | b807b02fa2df44557f80b64e8bfb4fc94d5c6df0 | |
parent | b90229d8ee298532badbc92e4d0b80c3ded95357 (diff) |
Merged PR 16337: Update sentencepiece to new version
This updates the SentencePiece version in Marian to a much more recent revision. As a result, Marian no longer has a dependency on Protobuf.
-rw-r--r-- | CHANGELOG.md | 4 | ||||
-rw-r--r-- | CMakeLists.txt | 3 | ||||
m--------- | regression-tests | 0 | ||||
-rw-r--r-- | src/3rd_party/CMakeLists.txt | 4 | ||||
-rwxr-xr-x | src/common/config_parser.cpp | 2 | ||||
-rw-r--r-- | src/optimizers/quantizer.cpp | 5 |
6 files changed, 11 insertions, 7 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index eb41b1c6..4182d72b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Quantized training (fixed point or log-based quantization) with --quantize-bits N command ### Fixed +- Segfault of spm_train when compiled with -DUSE_STATIC_LIBS=ON seems to have gone away with update to newer SentencePiece version. - Fix bug causing certain reductions into scalars to be 0 on the GPU backend. Removed unnecessary warp shuffle instructions. - Do not apply dropout in embeddings layers during inference with dropout-src/trg - Print "server is listening on port" message after it is accepting connections @@ -53,6 +54,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Improved handling for receiving SIGTERM during training. By default, SIGTERM triggers 'save (now) and exit'. Prior to this fix, batch pre-fetching did not check for this sigal, potentially delaying exit considerably. It now pays attention to that. Also, the default behaviour of save-and-exit can now be disabled on the command line with --sigterm exit-immediately. ### Changed +- Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece. +- Enabled compilation of SentencePiece by default since no dependency on protobuf anymore. +- Changed default value of --sentencepiece-max-lines from 10000000 to 2000000 since apparently the new version doesn't sample automatically anymore (Not quite clear how that affects quality of the vocabulary). - Change mini-batch-fit search stopping criterion to stop at ideal binary search threshold. - --metric bleu now always detokenizes SacreBLEU-style if a vocabulary knows how to, use bleu-segmented to compute BLEU on word ids. bleu-detok is now a synonym for bleu. 
- Move label-smoothing computation into Cross-entropy node diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f8cd106..3c015058 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ option(USE_FBGEMM "Use FBGEMM" OFF) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) -option(USE_SENTENCEPIECE "Download and compile SentencePiece" OFF) +option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF) @@ -236,6 +236,7 @@ endif() if(USE_ONNX) message(STATUS "Enabling experimental ONNX support") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ONNX") + # TODO: likely required to find protobuf by itself, we should check/fix this. Before it would take advantage of sentencepiece doing that. set(EXT_LIBS ${EXT_LIBS} protobuf) include_directories(${Protobuf_INCLUDE_DIRS}) endif() diff --git a/regression-tests b/regression-tests -Subproject 910d489b7b71f306ab3867d696f86ab25f7a1b4 +Subproject 16914ae94c80f338c678f0461c4e45965149f6a diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index a18ea7b1..67ba43cb 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -66,10 +66,6 @@ if(USE_SENTENCEPIECE) set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.") if(USE_STATIC_LIBS) - message(WARNING "You are compiling SentencePiece binaries with -DUSE_STATIC_LIBS=on. \ - This will cause spm_train to segfault. No need to worry if you do not intend to use that binary. \ - Marian support for SentencePiece will work fine.") - set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE) set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." 
FORCE) else(USE_STATIC_LIBS) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index f7f60eab..f72475f6 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -372,7 +372,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add<size_t>("--sentencepiece-max-lines", "Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. " "When set to 0 all lines are going to be used.", - 10000000); + 2000000); #endif // scheduling options diff --git a/src/optimizers/quantizer.cpp b/src/optimizers/quantizer.cpp index fc175672..76032d7e 100644 --- a/src/optimizers/quantizer.cpp +++ b/src/optimizers/quantizer.cpp @@ -90,6 +90,8 @@ void ModelQuantizer::quantize(Ptr<ExpressionGraph> graph) { allocator->reserveExact(graph->params()->vals()->memory()->size()); allocator->allocate(errorResidual_, {1, numElements}); + errorResidual_->set(0); + allocators_.push_back(allocator); isFirstError_ = true; } @@ -140,7 +142,6 @@ void ModelQuantizer::quantizeImpl(Tensor t) { allocators_.push_back(allocator); } - Tensor q = delta_->subtensor(0, t->size()); // to store the quantized t Tensor tflat = t->subtensor(0, t->size()); // flatten t for reduce float S = 0.0f; // scaling factor S @@ -153,6 +154,8 @@ void ModelQuantizer::quantizeImpl(Tensor t) { // optimize the scaling factor S for(int i = 0; i < optSteps_; i++) { + Tensor q = delta_->subtensor(0, t->size()); // to store the quantized t + // let t be the original tensor, and q be the quantized tensor, and q = S*a where S is the // scaling factor. we want to optimize S to minimize MSE(S*a - t) therefore, S = // sum(a*t)/sum(a*a) see https://www.aclweb.org/anthology/2020.ngt-1.4.pdf for more details. |