github.com/marian-nmt/sentencepiece.git

author    Taku Kudo <taku@google.com>  2018-09-07 11:17:43 +0300
committer Taku Kudo <taku@google.com>  2018-09-07 11:17:43 +0300
commit    b40cca7d0c6266cb106848ecf7b56bda0a7904ad (patch)
tree      8216c57c225e582c869351469a9a10f8ced2d60d /src/sentencepiece_model.proto
parent    03dad83922588a92c04cca8cb770187034cf969b (diff)
Added --use_all_vocab=true flag for WORD/CHAR model
Diffstat (limited to 'src/sentencepiece_model.proto')
-rw-r--r--  src/sentencepiece_model.proto | 22 +++++++++++++---------
1 file changed, 13 insertions(+), 9 deletions(-)
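The new option is an ordinary TrainerSpec field, so it can be passed straight to the trainer like any other flag. A minimal sketch using the Python wrapper (assuming the standard sentencepiece pip package; the corpus path and model prefix are hypothetical):

    import sentencepiece as spm

    # use_all_vocab=true keeps every extracted symbol in the vocabulary;
    # per the comment in the diff below, the flag is only valid for the
    # WORD and CHAR model types.
    spm.SentencePieceTrainer.Train(
        "--input=corpus.txt --model_prefix=m "
        "--model_type=word --use_all_vocab=true")
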
diff --git a/src/sentencepiece_model.proto b/src/sentencepiece_model.proto
index 7bbc320..6af7ce9 100644
--- a/src/sentencepiece_model.proto
+++ b/src/sentencepiece_model.proto
@@ -27,7 +27,7 @@ message TrainerSpec {
   // B) Bilingual: TSV, source sentence <tab> target sentence
   // When bilingual data is passed, shared vocabulary model is built.
   // Note that the input file must be raw corpus, not a preprocessed corpus.
-  // Trainer only loads the first |input_sentence_size| sentences specified
+  // Trainer only loads the first `input_sentence_size` sentences specified
   // with this parameter.
   repeated string input = 1;
 
@@ -62,13 +62,13 @@ message TrainerSpec {
   ///////////////////////////////////////////////////////////////////
   // Training parameters.
   //
-  // Uses characters which cover the corpus with the ratio of |chars_coverage|.
+  // Uses characters which cover the corpus with the ratio of `chars_coverage`.
   // This parameter determines the set of basic Alphabet of sentence piece.
-  // 1.0 - |chars_coverage| characters are treated as UNK.
+  // 1.0 - `chars_coverage` characters are treated as UNK.
   optional float character_coverage = 10 [ default = 0.9995 ];
 
-  // Maximum size of sentences the trainer loads from |input| parameter.
-  // Trainer simply loads the |input| files in sequence.
+  // Maximum size of sentences the trainer loads from `input` parameter.
+  // Trainer simply loads the `input` files in sequence.
   // It is better to shuffle the input corpus randomly.
   optional int32 input_sentence_size = 11 [ default = 10000000 ];
 
@@ -82,11 +82,11 @@ message TrainerSpec {
   optional int32 training_sentence_size = 13 [ default = 10000000 ];
 
   // The size of seed sentencepieces.
-  // |seed_sentencepiece_size| must be larger than |vocab_size|.
+  // `seed_sentencepiece_size` must be larger than `vocab_size`.
   optional int32 seed_sentencepiece_size = 14 [ default = 1000000 ];
 
   // In every EM sub-iterations, keeps top
-  // |shrinking_factor| * |current sentencepieces size| with respect to
+  // `shrinking_factor` * `current sentencepieces size` with respect to
   // the loss of the sentence piece. This value should be smaller than 1.0.
   optional float shrinking_factor = 15 [ default = 0.75 ];
 
@@ -103,7 +103,7 @@ message TrainerSpec {
   optional int32 max_sentencepiece_length = 20 [ default = 16 ];
 
   // Uses Unicode script to split sentence pieces.
-  // When |split_by_unicode_script| is true, we do not allow sentence piece to
+  // When `split_by_unicode_script` is true, we do not allow sentence piece to
   // include multiple Unicode scripts, e.g. "F1" is not a valid piece.
   // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
   // as one script type, since Japanese word can consist of multiple scripts.
@@ -112,7 +112,7 @@ message TrainerSpec {
   optional bool split_by_unicode_script = 21 [ default = true ];
 
   // Use a white space to split sentence pieces.
-  // When |split_by_whitespace| is false, we may have the piece containing
+  // When `split_by_whitespace` is false, we may have the piece containing
   // a white space in the middle. e.g., "in_the".
   optional bool split_by_whitespace = 22 [ default = true ];
 
@@ -142,6 +142,10 @@ message TrainerSpec {
   // always assumes hard_vocab_limit = false.
   optional bool hard_vocab_limit = 33 [ default = true ];
 
+  // use all symbols for vocab extraction. This flag is valid
+  // if model type is either CHAR or WORD
+  optional bool use_all_vocab = 34 [ default = false ];
+
   ///////////////////////////////////////////////////////////////////
   // Reserved special meta tokens.
   // * -1 is not used.
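
For reference, a trained model file embeds the TrainerSpec it was built with (ModelProto.trainer_spec), so the new field can be read back to verify how a model was trained. A sketch, assuming sentencepiece_model_pb2 has been generated from this proto with protoc (the model file name is hypothetical):

    import sentencepiece_model_pb2 as model_pb2

    # ModelProto stores the TrainerSpec used at training time.
    m = model_pb2.ModelProto()
    with open("m.model", "rb") as f:
        m.ParseFromString(f.read())
    print(m.trainer_spec.model_type)     # ModelType enum value
    print(m.trainer_spec.use_all_vocab)  # False unless trained with the new flag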