Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Rohit Jain <rjai@microsoft.com> 2021-06-19 19:10:48 +0300
committer: Rohit Jain <rjai@microsoft.com> 2021-06-19 19:10:48 +0300
commit: 9f4de81ef4a5f53803b980715e51714900d24ce3 (patch)
tree: fb2ae57a6ba4124cde28b1f1962eb58a8a422db3
parent: 1b08cd2f6aee4f844d795045ff91464ccb4df52a (diff)
wip bytefallback issue (branch: rjai/casing)
-rw-r--r-- src/sentencepiece_processor.cc 37
1 files changed, 22 insertions, 15 deletions
diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc
index a6d7d32..2a76015 100644
--- a/src/sentencepiece_processor.cc
+++ b/src/sentencepiece_processor.cc
@@ -566,6 +566,7 @@ util::Status SentencePieceProcessor::Decode(
sp->set_end(text->size() + surface.size());
*text += surface;
};
+
auto ProcessBytePieces = [&](int begin, int end) -> util::Status {
if (begin < end) {
// Constructs byte sequence.
@@ -602,6 +603,7 @@ util::Status SentencePieceProcessor::Decode(
}
return util::OkStatus();
};
+
int byte_start = 0;
for (int i = 0; i < spt->pieces_size(); ++i) {
const auto &sp = spt->pieces(i);
@@ -629,22 +631,27 @@ util::Status SentencePieceProcessor::Decode(
// Text is de-normalized, but pieces still need de-normalization.
for(int i = 0; i < spt->pieces_size(); i++) {
auto *spiece = spt->mutable_pieces(i);
- auto curr_surface = spiece->surface();
-
- // De-normalize curr_surface using o2n. Missing chars are deleted (ambiguous)
- std::string new_surface;
- for(int j = text_piece_surface_index; j < text_piece_surface_index + curr_surface.size();
- j++) {
- auto norm_index = orig_to_norm.find(j + 1);
- if(norm_index != orig_to_norm.end())
- new_surface.push_back(normalized[norm_index->second - 1]);
- }
- text_piece_surface_index += curr_surface.size();
+ if(!IsByte(spiece->id())) {
+ auto curr_surface = spiece->surface();
+
+ // De-normalize curr_surface using o2n. Missing chars are deleted (ambiguous)
+ std::string new_surface;
+ for(int j = text_piece_surface_index; j < text_piece_surface_index + curr_surface.size();
+ j++) {
+ auto norm_index = orig_to_norm.find(j + 1);
+ if(norm_index != orig_to_norm.end())
+ new_surface.push_back(normalized[norm_index->second - 1]);
+ }
+ text_piece_surface_index += curr_surface.size();
- spiece->set_surface(new_surface);
- spiece->set_begin(normalized_piece_surface_index);
- normalized_piece_surface_index += new_surface.size();
- spiece->set_end(normalized_piece_surface_index);
+ spiece->set_surface(new_surface);
+ spiece->set_begin(normalized_piece_surface_index);
+ normalized_piece_surface_index += new_surface.size();
+ spiece->set_end(normalized_piece_surface_index);
+ } else {
+ normalized_piece_surface_index += spiece->surface().size();
+ text_piece_surface_index += spiece->surface().size();
+ }
}
}