diff options
author | Rohit Jain <rjai@microsoft.com> | 2021-06-19 19:10:48 +0300 |
---|---|---|
committer | Rohit Jain <rjai@microsoft.com> | 2021-06-19 19:10:48 +0300 |
commit | 9f4de81ef4a5f53803b980715e51714900d24ce3 (patch) | |
tree | fb2ae57a6ba4124cde28b1f1962eb58a8a422db3 | |
parent | 1b08cd2f6aee4f844d795045ff91464ccb4df52a (diff) |
wip bytefallback issue [rjai/casing]
-rw-r--r-- | src/sentencepiece_processor.cc | 37 |
1 files changed, 22 insertions, 15 deletions
diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc index a6d7d32..2a76015 100644 --- a/src/sentencepiece_processor.cc +++ b/src/sentencepiece_processor.cc @@ -566,6 +566,7 @@ util::Status SentencePieceProcessor::Decode( sp->set_end(text->size() + surface.size()); *text += surface; }; + auto ProcessBytePieces = [&](int begin, int end) -> util::Status { if (begin < end) { // Constructs byte sequence. @@ -602,6 +603,7 @@ util::Status SentencePieceProcessor::Decode( } return util::OkStatus(); }; + int byte_start = 0; for (int i = 0; i < spt->pieces_size(); ++i) { const auto &sp = spt->pieces(i); @@ -629,22 +631,27 @@ util::Status SentencePieceProcessor::Decode( // Text is de-normalized, but pieces still need de-normalization. for(int i = 0; i < spt->pieces_size(); i++) { auto *spiece = spt->mutable_pieces(i); - auto curr_surface = spiece->surface(); - - // De-normalize curr_surface using o2n. Missing chars are deleted (ambiguous) - std::string new_surface; - for(int j = text_piece_surface_index; j < text_piece_surface_index + curr_surface.size(); - j++) { - auto norm_index = orig_to_norm.find(j + 1); - if(norm_index != orig_to_norm.end()) - new_surface.push_back(normalized[norm_index->second - 1]); - } - text_piece_surface_index += curr_surface.size(); + if(!IsByte(spiece->id())) { + auto curr_surface = spiece->surface(); + + // De-normalize curr_surface using o2n. 
Missing chars are deleted (ambiguous) + std::string new_surface; + for(int j = text_piece_surface_index; j < text_piece_surface_index + curr_surface.size(); + j++) { + auto norm_index = orig_to_norm.find(j + 1); + if(norm_index != orig_to_norm.end()) + new_surface.push_back(normalized[norm_index->second - 1]); + } + text_piece_surface_index += curr_surface.size(); - spiece->set_surface(new_surface); - spiece->set_begin(normalized_piece_surface_index); - normalized_piece_surface_index += new_surface.size(); - spiece->set_end(normalized_piece_surface_index); + spiece->set_surface(new_surface); + spiece->set_begin(normalized_piece_surface_index); + normalized_piece_surface_index += new_surface.size(); + spiece->set_end(normalized_piece_surface_index); + } else { + normalized_piece_surface_index += spiece->surface().size(); + text_piece_surface_index += spiece->surface().size(); + } } } |