diff options
author | Taku Kudo <taku@google.com> | 2018-09-07 11:17:43 +0300 |
---|---|---|
committer | Taku Kudo <taku@google.com> | 2018-09-07 11:17:43 +0300 |
commit | b40cca7d0c6266cb106848ecf7b56bda0a7904ad (patch) | |
tree | 8216c57c225e582c869351469a9a10f8ced2d60d /src/sentencepiece_model.proto | |
parent | 03dad83922588a92c04cca8cb770187034cf969b (diff) |
Added --use_all_vocab=true flag for WORD/CHAR model
Diffstat (limited to 'src/sentencepiece_model.proto')
-rw-r--r-- | src/sentencepiece_model.proto | 22 |
1 file changed, 13 insertions, 9 deletions
diff --git a/src/sentencepiece_model.proto b/src/sentencepiece_model.proto index 7bbc320..6af7ce9 100644 --- a/src/sentencepiece_model.proto +++ b/src/sentencepiece_model.proto @@ -27,7 +27,7 @@ message TrainerSpec { // B) Bilingual: TSV, source sentence <tab> target sentence // When bilingual data is passed, shared vocabulary model is built. // Note that the input file must be raw corpus, not a preprocessed corpus. - // Trainer only loads the first |input_sentence_size| sentences specified + // Trainer only loads the first `input_sentence_size` sentences specified // with this parameter. repeated string input = 1; @@ -62,13 +62,13 @@ message TrainerSpec { /////////////////////////////////////////////////////////////////// // Training parameters. // - // Uses characters which cover the corpus with the ratio of |chars_coverage|. + // Uses characters which cover the corpus with the ratio of `chars_coverage`. // This parameter determines the set of basic Alphabet of sentence piece. - // 1.0 - |chars_coverage| characters are treated as UNK. + // 1.0 - `chars_coverage` characters are treated as UNK. optional float character_coverage = 10 [ default = 0.9995 ]; - // Maximum size of sentences the trainer loads from |input| parameter. - // Trainer simply loads the |input| files in sequence. + // Maximum size of sentences the trainer loads from `input` parameter. + // Trainer simply loads the `input` files in sequence. // It is better to shuffle the input corpus randomly. optional int32 input_sentence_size = 11 [ default = 10000000 ]; @@ -82,11 +82,11 @@ message TrainerSpec { optional int32 training_sentence_size = 13 [ default = 10000000 ]; // The size of seed sentencepieces. - // |seed_sentencepiece_size| must be larger than |vocab_size|. + // `seed_sentencepiece_size` must be larger than `vocab_size`. 
optional int32 seed_sentencepiece_size = 14 [ default = 1000000 ]; // In every EM sub-iterations, keeps top - // |shrinking_factor| * |current sentencepieces size| with respect to + // `shrinking_factor` * `current sentencepieces size` with respect to // the loss of the sentence piece. This value should be smaller than 1.0. optional float shrinking_factor = 15 [ default = 0.75 ]; @@ -103,7 +103,7 @@ message TrainerSpec { optional int32 max_sentencepiece_length = 20 [ default = 16 ]; // Uses Unicode script to split sentence pieces. - // When |split_by_unicode_script| is true, we do not allow sentence piece to + // When `split_by_unicode_script` is true, we do not allow sentence piece to // include multiple Unicode scripts, e.g. "F1" is not a valid piece. // Exception: CJ characters (Hiragana/Katakana/Han) are all handled // as one script type, since Japanese word can consist of multiple scripts. @@ -112,7 +112,7 @@ message TrainerSpec { optional bool split_by_unicode_script = 21 [ default = true ]; // Use a white space to split sentence pieces. - // When |split_by_whitespace| is false, we may have the piece containing + // When `split_by_whitespace` is false, we may have the piece containing // a white space in the middle. e.g., "in_the". optional bool split_by_whitespace = 22 [ default = true ]; @@ -142,6 +142,10 @@ message TrainerSpec { // always assumes hard_vocab_limit = false. optional bool hard_vocab_limit = 33 [ default = true ]; + // use all symbols for vocab extraction. This flag is valid + // if model type is either CHAR or WORD + optional bool use_all_vocab = 34 [ default = false ]; + /////////////////////////////////////////////////////////////////// // Reserved special meta tokens. // * -1 is not used. |