seed: 42
share_vocab: true
save_data: data/wikitext-103-raw/run/example
## Where the vocab(s) will be written
src_vocab: data/wikitext-103-raw/run/example.vocab.src
src_vocab_size: 60000
tgt_vocab_size: 60000
src_subword_type: bpe
src_subword_model: data/wikitext-103-raw/subwords.bpe
src_onmttok_kwargs: '{"mode": "aggressive", "joiner_annotate": True, "preserve_placeholders": True, "case_markup": True, "soft_case_regions": True, "preserve_segmented_tokens": True}'
transforms: [onmt_tokenize, filtertoolong]
src_seq_length: 512
tgt_seq_length: 512

# Overwrite existing files in the folder if any
overwrite: True

# Corpus opts:
data:
    corpus_1:
        path_src: data/wikitext-103-raw/wiki.train.raw
    valid:
        path_src: data/wikitext-103-raw/wiki.valid.raw

# Vocabulary files that were just created
src_vocab: data/wikitext-103-raw/run/example.vocab.src

# Train on a single GPU
world_size: 1
gpu_ranks: [0]

# Where to save the checkpoints
save_model: data/wikitext-103-raw/run/model-lm
save_checkpoint_steps: 50000
train_steps: 1000000
valid_steps: 500
report_every: 100
tensorboard: true
tensorboard_log_dir: data/wikitext-103-raw/run/tensorboard

# Model
model_task: lm
encoder_type: transformer_lm
decoder_type: transformer_lm
position_encoding: true
dec_layers: 6
heads: 8
hidden_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]

# Optimization: the Noam schedule ("Attention Is All You Need") scales the base
# learning_rate by hidden_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
batch_size: 2048
batch_type: tokens
model_dtype: "fp32"
optim: "adam"
learning_rate: 2
warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"
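
# Usage sketch, assuming this file is saved as wiki_103.yaml and run from the
# directory containing data/ (the file name is illustrative, not prescribed).
# First build the shared vocab over the full corpus (-n_sample -1), then train:
#
#   onmt_build_vocab -config wiki_103.yaml -n_sample -1
#   onmt_train -config wiki_103.yaml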
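
# Checkpoints are written to save_model plus a step suffix (e.g.
# model-lm_step_1000000.pt). A hedged sketch of generating continuations with
# the trained LM via onmt_translate; the input/output paths are hypothetical:
#
#   onmt_translate -model data/wikitext-103-raw/run/model-lm_step_1000000.pt \
#       -src data/wikitext-103-raw/lm_input.txt \
#       -output data/wikitext-103-raw/lm_pred.txt \
#       -verbose -n_best 3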