Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'src/sentencepiece_processor.h')
-rw-r--r-- src/sentencepiece_processor.h 24
1 files changed, 24 insertions, 0 deletions
diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h
index 61da691..ee5cd17 100644
--- a/src/sentencepiece_processor.h
+++ b/src/sentencepiece_processor.h
@@ -158,6 +158,11 @@ class min_string_view {
const char *ptr_ = nullptr;
size_t length_ = 0;
};
+
+// Alias std::string as `bytes` for the serialized_proto interface, because
+// Python's string is a Unicode string. We can then enforce that the return
+// value is a raw byte sequence with SWIG's typemap.
+using bytes = std::string;
} // namespace util
class SentencePieceProcessor {
@@ -357,6 +362,25 @@ class SentencePieceProcessor {
#undef DEFINE_SPP_DIRECT_FUNC_IMPL
+ // These methods are used in the Python interface. Each returns a
+ // serialized proto. In the Python module, we can access the full proto
+ // after deserializing the returned byte sequence.
+ virtual util::bytes EncodeAsSerializedProto(
+ util::min_string_view input) const;
+
+ virtual util::bytes SampleEncodeAsSerializedProto(util::min_string_view input,
+ int nbest_size,
+ float alpha) const;
+
+ virtual util::bytes NBestEncodeAsSerializedProto(util::min_string_view input,
+ int nbest_size) const;
+
+ virtual util::bytes DecodePiecesAsSerializedProto(
+ const std::vector<std::string> &pieces) const;
+
+ virtual util::bytes DecodeIdsAsSerializedProto(
+ const std::vector<int> &ids) const;
+
//////////////////////////////////////////////////////////////
// Vocabulary management methods.
//