github.com/marian-nmt/sentencepiece.git

author    Taku Kudo <taku@google.com>  2018-09-07 11:17:43 +0300
committer Taku Kudo <taku@google.com>  2018-09-07 11:17:43 +0300
commit    b40cca7d0c6266cb106848ecf7b56bda0a7904ad (patch)
tree      8216c57c225e582c869351469a9a10f8ced2d60d /src/sentencepiece_model.proto
parent    03dad83922588a92c04cca8cb770187034cf969b (diff)
Added --use_all_vocab=true flag for WORD/CHAR model
Diffstat (limited to 'src/sentencepiece_model.proto')
-rw-r--r--  src/sentencepiece_model.proto | 22 +++++++++++++---------
1 file changed, 13 insertions(+), 9 deletions(-)
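The new option is an ordinary TrainerSpec field, so it can be passed straight to the trainer like any other flag. A minimal sketch using the Python wrapper (assuming the standard sentencepiece pip package; the corpus path and model prefix are hypothetical):

    import sentencepiece as spm

    # use_all_vocab=true keeps every extracted symbol in the vocabulary;
    # per the comment in the diff below, the flag is only valid for the
    # WORD and CHAR model types.
    spm.SentencePieceTrainer.Train(
        "--input=corpus.txt --model_prefix=m "
        "--model_type=word --use_all_vocab=true")
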
diff --git a/src/sentencepiece_model.proto b/src/sentencepiece_model.proto
index 7bbc320..6af7ce9 100644
--- a/src/sentencepiece_model.proto
+++ b/src/sentencepiece_model.proto
@@ -27,7 +27,7 @@ message TrainerSpec {
   // B) Bilingual: TSV, source sentence <tab> target sentence
   // When bilingual data is passed, shared vocabulary model is built.
   // Note that the input file must be raw corpus, not a preprocessed corpus.
-  // Trainer only loads the first |input_sentence_size| sentences specified
+  // Trainer only loads the first `input_sentence_size` sentences specified
   // with this parameter.
   repeated string input = 1;
 
@@ -62,13 +62,13 @@ message TrainerSpec {
   ///////////////////////////////////////////////////////////////////
   // Training parameters.
   //
-  // Uses characters which cover the corpus with the ratio of |chars_coverage|.
+  // Uses characters which cover the corpus with the ratio of `chars_coverage`.
   // This parameter determines the set of basic Alphabet of sentence piece.
-  // 1.0 - |chars_coverage| characters are treated as UNK.
+  // 1.0 - `chars_coverage` characters are treated as UNK.
   optional float character_coverage = 10 [ default = 0.9995 ];
 
-  // Maximum size of sentences the trainer loads from |input| parameter.
-  // Trainer simply loads the |input| files in sequence.
+  // Maximum size of sentences the trainer loads from `input` parameter.
+  // Trainer simply loads the `input` files in sequence.
   // It is better to shuffle the input corpus randomly.
   optional int32 input_sentence_size = 11 [ default = 10000000 ];
 
@@ -82,11 +82,11 @@ message TrainerSpec {
   optional int32 training_sentence_size = 13 [ default = 10000000 ];
 
   // The size of seed sentencepieces.
-  // |seed_sentencepiece_size| must be larger than |vocab_size|.
+  // `seed_sentencepiece_size` must be larger than `vocab_size`.
   optional int32 seed_sentencepiece_size = 14 [ default = 1000000 ];
 
   // In every EM sub-iterations, keeps top
-  // |shrinking_factor| * |current sentencepieces size| with respect to
+  // `shrinking_factor` * `current sentencepieces size` with respect to
   // the loss of the sentence piece. This value should be smaller than 1.0.
   optional float shrinking_factor = 15 [ default = 0.75 ];
 
@@ -103,7 +103,7 @@ message TrainerSpec {
   optional int32 max_sentencepiece_length = 20 [ default = 16 ];
 
   // Uses Unicode script to split sentence pieces.
-  // When |split_by_unicode_script| is true, we do not allow sentence piece to
+  // When `split_by_unicode_script` is true, we do not allow sentence piece to
   // include multiple Unicode scripts, e.g. "F1" is not a valid piece.
   // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
   // as one script type, since Japanese word can consist of multiple scripts.
@@ -112,7 +112,7 @@ message TrainerSpec {
   optional bool split_by_unicode_script = 21 [ default = true ];
 
   // Use a white space to split sentence pieces.
-  // When |split_by_whitespace| is false, we may have the piece containing
+  // When `split_by_whitespace` is false, we may have the piece containing
   // a white space in the middle. e.g., "in_the".
   optional bool split_by_whitespace = 22 [ default = true ];
 
@@ -142,6 +142,10 @@ message TrainerSpec {
   // always assumes hard_vocab_limit = false.
   optional bool hard_vocab_limit = 33 [ default = true ];
 
+  // use all symbols for vocab extraction. This flag is valid
+  // if model type is either CHAR or WORD
+  optional bool use_all_vocab = 34 [ default = false ];
+
   ///////////////////////////////////////////////////////////////////
   // Reserved special meta tokens.
   // * -1 is not used.
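
For reference, a trained model file embeds the TrainerSpec it was built with (ModelProto.trainer_spec), so the new field can be read back to verify how a model was trained. A sketch, assuming sentencepiece_model_pb2 has been generated from this proto with protoc (the model file name is hypothetical):

    import sentencepiece_model_pb2 as model_pb2

    # ModelProto stores the TrainerSpec used at training time.
    m = model_pb2.ModelProto()
    with open("m.model", "rb") as f:
        m.ParseFromString(f.read())
    print(m.trainer_spec.model_type)     # ModelType enum value
    print(m.trainer_spec.use_all_vocab)  # False unless trained with the new flag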