Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> 2021-09-28 20:17:12 +0300
committer: Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> 2021-09-28 20:17:12 +0300
commit: d796a3c3b7779993660e672f2a47f5cdd685a174 (patch)
tree: 7d6bf423fbc8f32a6c6e2f65823af1ccf5de5d60
parent: aa58ba8e239d228d539734e6be8266fbb3181044 (diff)
Merged PR 20839: Do not ignore ignoreEOS for spm decoding
With final space this eliminates trailing whitespace caused by appending EOS
-rw-r--r-- src/data/sentencepiece_vocab.cpp | 8
1 files changed, 5 insertions, 3 deletions
diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp
index 090d478b..8f774c2b 100644
--- a/src/data/sentencepiece_vocab.cpp
+++ b/src/data/sentencepiece_vocab.cpp
@@ -236,18 +236,20 @@ public:
return words;
}
- std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
+ std::string decode(const Words& sentence, bool ignoreEOS) const override {
std::string line;
if(keepEncoded_) { // i.e. keep the sentence segmented into subword units
for(const Word& id : sentence)
- line += (*this)[id] + " ";
+ if(!ignoreEOS || id != getEosId())
+ line += (*this)[id] + " ";
line.pop_back(); // trim the trailing whitespace
} else {
// convert vector of Word to vector of int
std::vector<int> spmSentence;
spmSentence.reserve(sentence.size());
for(auto&& word : sentence)
- spmSentence.push_back(word.toWordIndex());
+ if(!ignoreEOS || word != getEosId())
+ spmSentence.push_back(word.toWordIndex());
spm_->Decode(spmSentence, &line);
}
return line;