diff options
author | Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2021-09-28 20:17:12 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2021-09-28 20:17:12 +0300 |
commit | d796a3c3b7779993660e672f2a47f5cdd685a174 (patch) | |
tree | 7d6bf423fbc8f32a6c6e2f65823af1ccf5de5d60 | |
parent | aa58ba8e239d228d539734e6be8266fbb3181044 (diff) |
Merged PR 20839: Do not ignore ignoreEOS for spm decoding
With final space this eliminates trailing whitespace caused by appending EOS
-rw-r--r-- | src/data/sentencepiece_vocab.cpp | 8 |
1 file changed, 5 insertions, 3 deletions
```diff
diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp
index 090d478b..8f774c2b 100644
--- a/src/data/sentencepiece_vocab.cpp
+++ b/src/data/sentencepiece_vocab.cpp
@@ -236,18 +236,20 @@ public:
     return words;
   }
 
-  std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
+  std::string decode(const Words& sentence, bool ignoreEOS) const override {
     std::string line;
     if(keepEncoded_) {  // i.e. keep the sentence segmented into subword units
       for(const Word& id : sentence)
-        line += (*this)[id] + " ";
+        if(!ignoreEOS || id != getEosId())
+          line += (*this)[id] + " ";
       line.pop_back();  // trim the trailing whitespace
     } else {
       // convert vector of Word to vector of int
       std::vector<int> spmSentence;
       spmSentence.reserve(sentence.size());
       for(auto&& word : sentence)
-        spmSentence.push_back(word.toWordIndex());
+        if(!ignoreEOS || word != getEosId())
+          spmSentence.push_back(word.toWordIndex());
       spm_->Decode(spmSentence, &line);
     }
     return line;
```