From fe613fd82c33277962ccebd6429a33a4cce3602e Mon Sep 17 00:00:00 2001
From: Graeme Nail
Date: Tue, 22 Feb 2022 18:44:10 +0000
Subject: Add transformer basic example

---
 transformer-basic/.gitignore                 |  15 ++
 transformer-basic/README.md                  | 389 +++++++++++++++++++++++++++
 transformer-basic/data/README.md             |  22 ++
 transformer-basic/requirements.txt           |   2 +
 transformer-basic/run-me.sh                  |  76 ++++++
 transformer-basic/scripts/comet-score.sh     |  34 +++
 transformer-basic/scripts/download-files.sh  |  39 +++
 transformer-basic/scripts/preprocess-data.sh |  36 +++
 transformer-basic/transformer-model.yml      |  46 ++++
 9 files changed, 659 insertions(+)
 create mode 100644 transformer-basic/.gitignore
 create mode 100644 transformer-basic/README.md
 create mode 100644 transformer-basic/data/README.md
 create mode 100644 transformer-basic/requirements.txt
 create mode 100755 transformer-basic/run-me.sh
 create mode 100755 transformer-basic/scripts/comet-score.sh
 create mode 100755 transformer-basic/scripts/download-files.sh
 create mode 100755 transformer-basic/scripts/preprocess-data.sh
 create mode 100644 transformer-basic/transformer-model.yml

diff --git a/transformer-basic/.gitignore b/transformer-basic/.gitignore
new file mode 100644
index 0000000..2229207
--- /dev/null
+++ b/transformer-basic/.gitignore
@@ -0,0 +1,15 @@
+.venv
+*.log
+
+# Data
+data/*.de
+data/*.en
+data/*.gz
+data/*.tgz
+data/*tsv
+data/commoncrawl*
+data/corpus.*
+
+# Model
+model*/
+evaluation
diff --git a/transformer-basic/README.md b/transformer-basic/README.md
new file mode 100644
index 0000000..56c398d
--- /dev/null
+++ b/transformer-basic/README.md
@@ -0,0 +1,389 @@
+# Basic Transformer
+
+In this example we will use Marian to create an English-German translation
+system. We'll follow a very simple pipeline: data acquisition, some basic
+corpus cleaning, generation of a vocabulary with [SentencePiece], training of a
+transformer model, and evaluation with [sacreBLEU] and (optionally) [Comet].
+
+We'll be using a subset of data from the WMT21 [news task] to train our model.
+For the validation and test sets, we'll use the test sets from WMT19 and WMT20,
+respectively.
+
+Let's get started by installing our dependencies!
+
+
+## Install requirements
+If you haven't installed the common tools for `marian-examples`, you can do so
+by going to the `tools/` folder in the root of the repository and running `make`.
+```shell
+cd ../tools
+make all
+cd -
+```
+In this example, we'll be using some
+[scripts](https://github.com/marian-nmt/moses-scripts) from [Moses].
+
+We'll also use [sacreBLEU] and [Comet] from Python pip. To install these in a
+virtual environment, execute:
+```shell
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+You can skip the first two of these commands if you don't want to use a virtual environment.
+
+Next we'll install Marian!
+
+
+## Getting Marian
+The development version of Marian can be obtained with
+```shell
+git clone https://github.com/marian-nmt/marian-dev
+cd marian-dev
+```
+
+### Compile
+To compile Marian we need to ensure we have the required packages. The list of requirements can be found in the [documentation][install_marian]. Since we're using SentencePiece, we also need to make sure we satisfy its [requirements][install_sentencepiece] too.
+
+Then we can compile with
+```shell
+mkdir build
+cd build
+cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SENTENCEPIECE=ON
+cmake --build .
+```
+
+To speed up compilation we can use `cmake --build . -j 8` to run 8 tasks simultaneously.
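+
+If you prefer not to hard-code the job count, one option is to let the build use
+however many cores the machine reports (this assumes `nproc` is available, as it
+is on most Linux systems):
+```shell
+# Build with one parallel job per available core
+cmake --build . -j "$(nproc)"
+```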
+
+If it succeeded, running
+```shell
+./marian --version
+```
+will return the version you've compiled. To verify that SentencePiece support was enabled, running
+```shell
+./marian --help |& grep sentencepiece
+```
+will display the SentencePiece-specific options:
+```
+--sentencepiece-alphas VECTOR           ... Sampling factors for SentencePiece vocabulary; i-th factor corresponds to i-th vocabulary
+--sentencepiece-options TEXT            Pass-through command-line options to SentencePiece trainer
+--sentencepiece-max-lines UINT=2000000
+                                        Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. When set to 0 all lines are going to be used.
+```
+
+## Running the Example
+The entire example can be run end-to-end by executing
+```shell
+./run-me.sh
+```
+This will acquire the data and then apply cleaning. It uses the resulting corpus to
+train a transformer model, which is evaluated via sacreBLEU.
+
+By default, `run-me.sh` will run on a single GPU (`device 0`). To use a
+different set of GPUs, pass their IDs as arguments, e.g. to train using 4 GPUs:
+```shell
+./run-me.sh 0 1 2 3
+```
+
+You can run the commands from `run-me.sh` manually yourself. We'll walk through
+the different commands in the sections below. These commands assume that Marian
+is compiled and accessible at `../../build/marian`. The `data/`, `scripts/` and
+`model/` directories are located at the same level as this README file.
+
+## Acquire data
+We'll acquire a subset of the data from the WMT21 [news task].
+
+In particular we'll make use of the following English-German parallel corpora:
+
+| Dataset              |     Sentences |
+|----------------------|--------------:|
+| Europarl v10         |     1,828,521 |
+| News Commentary v16  |       398,981 |
+| Common Crawl corpus  |     2,399,123 |
+| **Total**            | **4,626,625** |
+
+### Download
+We'll store our data inside the `data/` directory. First let's change directory
+to that location:
+```shell
+cd data
+```
+
+To download the datasets above, we can use the following commands:
+```shell
+# Get en-de for training WMT21
+wget -nc https://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz 2> /dev/null
+wget -nc https://data.statmt.org/news-commentary/v16/training/news-commentary-v16.de-en.tsv.gz 2> /dev/null
+wget -nc https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz 2> /dev/null
+```
+This may take a little time to download the data from the server.
+
+The dev set and test set can be obtained directly from sacreBLEU via the command line. We echo the source and reference texts to files.
+```
+# Dev Sets
+sacrebleu -t wmt19 -l en-de --echo src > valid.en
+sacrebleu -t wmt19 -l en-de --echo ref > valid.de
+
+# Test Sets
+sacrebleu -t wmt20 -l en-de --echo src > test.en
+sacrebleu -t wmt20 -l en-de --echo ref > test.de
+```
+This is relatively fast as these are typically only 1000-2000 lines.
+
+
+### Combine
+Now we want to combine our data sources into a single corpus. We start by
+decompressing the Europarl and News Commentary TSV files.
+```shell
+for compressed in europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv; do
+    if [ ! -e $compressed ]; then
+        gzip --keep -q -d $compressed.gz
+    fi
+done
+```
+This leaves two TSV files:
+ - `europarl-v10.de-en.tsv`
+ - `news-commentary-v16.de-en.tsv`
+
+where the first field contains German text, and the second field contains
+English text.
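+
+If you want to double-check the field order before combining anything, a quick
+look at the first few rows is enough:
+```shell
+# First field should be German, second field English
+head -n 2 europarl-v10.de-en.tsv | cut -f 1
+head -n 2 europarl-v10.de-en.tsv | cut -f 2
+```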
+
+We can untar the Common Crawl archive.
+```shell
+tar xf training-parallel-commoncrawl.tgz
+```
+This contains a collection of parallel text files across multiple languages, but
+we're only interested in those covering `en-de`:
+ - `commoncrawl.de-en.de`
+ - `commoncrawl.de-en.en`
+
+From these we can construct a parallel corpus. We concatenate the two TSV files,
+extract the first field to populate the combined German corpus, and then the
+second field to populate the combined English corpus. We then append the
+Common Crawl data to the relevant files.
+```shell
+# Corpus
+if [ ! -e corpus.de ] || [ ! -e corpus.en ]; then
+    # TSVs
+    cat europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv | cut -f 1 > corpus.de
+    cat europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv | cut -f 2 > corpus.en
+
+    # Plain text
+    cat commoncrawl.de-en.de >> corpus.de
+    cat commoncrawl.de-en.en >> corpus.en
+fi
+```
+
+## Prepare data
+With our combined corpus we now apply some basic pre-processing.
+
+Firstly, we remove any non-printing characters using a script from [Moses].
+```shell
+for lang in en de; do
+    # Remove non-printing characters
+    cat corpus.$lang \
+        | perl $MOSES_SCRIPTS/tokenizer/remove-non-printing-char.perl \
+        > .corpus.norm.$lang
+done
+```
+This modifies the content separately for each language, but **does not** adjust
+the ordering. The parallel sentence pairs are associated by line, so it is
+crucial that any pre-processing preserves that.
+
+Then we constrain the sentences to be between 1 and 100 words with
+```shell
+# Constrain length to between 1 and 100 words
+perl $MOSES_SCRIPTS/training/clean-corpus-n.perl .corpus.norm en de .corpus.trim 1 100
+```
+This removes sentence pairs where either side does not meet the length
+requirements.
+
+To remove any duplicates we build a TSV file, sort it and retain only unique
+lines.
+```shell
+# Deduplicate
+paste <(cat .corpus.trim.en) <(cat .corpus.trim.de) \
+    | LC_ALL=C sort -S 50% | uniq \
+    > .corpus.uniq.ende.tsv
+```
+
+The clean corpus is then obtained by splitting our TSV file back into parallel
+text files.
+```shell
+cat .corpus.uniq.ende.tsv | cut -f 1 > corpus.clean.en
+cat .corpus.uniq.ende.tsv | cut -f 2 > corpus.clean.de
+```
+
+The cleaned corpus has 4,552,319 parallel sentences, having discarded around
+1.6% of the total sentences.
+
+## Training
+To train a transformer model, we make use of Marian's presets. The `--task
+transformer-base` preset gives a good baseline set of hyperparameters for a
+transformer model.
+
+We'll put our configuration inside a YAML file `transformer-model.yml`. We can
+output the configuration for this preset using the `--dump-config expand`
+option:
+```shell
+$MARIAN/marian --task transformer-base --dump-config expand > transformer-model.yml
+```
+We have shortened `../../build/marian` to `$MARIAN/marian` for brevity.
+
+You can inspect this file to see exactly which options have been set.
+
+We'll modify this file by adding options that make training output a little
+more verbose.
+```
+disp-freq: 1000
+disp-first: 10
+save-freq: 2ku
+```
+
+We also add a line that will halt training after 10 consecutive validations
+without improvement on the validation set.
+```
+early-stopping: 10
+```
+
+We will also validate with additional metrics, keep the best model per metric
+and validate more often. This is achieved via:
+```
+keep-best: true
+valid-freq: 2ku
+valid-metrics:
+  - ce-mean-words
+  - bleu
+  - perplexity
+```
+Note that the early-stopping criterion applies to the first metric listed,
+`ce-mean-words`.
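+
+A quick way to confirm that these edits ended up in the config is to grep for
+them (the preset dump may already contain some of these keys with default
+values, so make sure your edited values are the ones that remain):
+```shell
+# Check the training/validation options we just set
+grep -E "disp-freq|disp-first|save-freq|early-stopping|keep-best|valid-freq" transformer-model.yml
+```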
+
+### SentencePiece (Optional)
+To generate a SentencePiece vocabulary model you can run the `spm_train` command
+built alongside Marian. An example invocation would look something like:
+```shell
+$MARIAN/spm_train \
+    --accept_language en,de \
+    --input data/corpus.clean.en,data/corpus.clean.de \
+    --model_prefix model/vocab.ende \
+    --vocab_size 32000
+mv model/vocab.ende.{model,spm}
+```
+As a last step, we rename `.model` to `.spm` (SentencePiece Model) so that
+Marian recognises it as a SentencePiece model. This step is listed as optional
+because, in the absence of a vocabulary file, Marian will build one itself.
+
+This produces a combined vocabulary of 32000 tokens.
+
+### Training Command
+To begin training, we call the `marian` command with the following arguments:
+```shell
+$MARIAN/marian -c transformer-model.yml \
+    -d 0 1 2 3 --workspace 9000 \
+    --seed 1111 \
+    --after 10e \
+    --model model/model.npz \
+    --train-sets data/corpus.clean.{en,de} \
+    --vocabs model/vocab.ende.spm model/vocab.ende.spm \
+    --dim-vocabs 32000 32000 \
+    --valid-sets data/valid.{en,de} \
+    --log model/train.log --valid-log model/valid.log
+```
+The flag `-d` sets the devices to run on, which you'll have to update for your
+setup. Additionally, the workspace (`--workspace` / `-w`) depends on how much
+memory your GPUs have. The example was tested on a pair of NVIDIA RTX 2080 GPUs
+with 11GB using a workspace of 9000 MiB. You should reduce this if you have less
+available memory. For reproducibility, the seed is set to `1111`. As a
+reference, this took around 8 hours.
+
+The model will be stored at `model/model.npz`. The training and validation sets
+are specified, as well as the vocabulary files and their dimension. Logs for the
+training and validation output are also retained. Finally, for this example we
+only train for a maximum of 10 epochs.
+
+The `save-freq` of `2ku` (2000 updates) we specified will result in the model
+state being saved at regular intervals of 2000 updates:
+ - `model/model.iter2000.npz`
+ - `model/model.iter4000.npz`
+ - ...
+
+The current model is always `model/model.npz`. Additionally, the `keep-best`
+option produces an additional model file for every validator:
+ - `model/model.npz.best-bleu.npz`
+ - `model/model.npz.best-ce-mean-words.npz`
+ - `model/model.npz.best-perplexity.npz`
+
+The training progress is tracked in `model/model.npz.progress.yml` with the full
+model configuration at `model/model.npz.yml`. In addition, Marian automatically
+generates a decoding config for each of these models:
+ - `model/model.npz.decoder.yml`
+ - `model/model.npz.best-*.npz.decoder.yml`
+
+These conveniently refer to the model and vocabulary files. They also include
+default settings for beam search and normalization, which can be overwritten on
+the command line.
+
+## Translation
+To translate we use the `marian-decoder` command:
+```shell
+cat data/test.en \
+    | $MARIAN/marian-decoder \
+        -c model/model.npz.best-bleu.npz.decoder.yml \
+        -d 0 1 2 3 \
+    | tee evaluation/testset_output.txt \
+    | sacrebleu data/test.de --metrics bleu chrf -b -w 3 -f text
+```
+where we're using the model that produced the best BLEU score on the validation
+set. This snippet passes the source text to Marian over a pipe to `stdin`, and
+the translations are output over `stdout`. We capture this output to a file with
+`tee`, and pass it on to sacreBLEU for evaluation. We provide sacreBLEU with our
+reference text, and ask it to compute both BLEU and chrF. The remaining
+sacreBLEU options return only the score, with 3 decimal places of precision, in
+text format.
+
+You can experiment with changing `--beam-size` and `--normalize` to see how they
+change the scores.
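+
+For example, a decoding run that overrides the settings from the decoder config
+might look like this (the values here are purely illustrative):
+```shell
+# Decode on one GPU with an explicit beam size and length-normalization exponent
+cat data/test.en \
+    | $MARIAN/marian-decoder \
+        -c model/model.npz.best-bleu.npz.decoder.yml \
+        -d 0 \
+        --beam-size 12 --normalize 0.6 \
+    | sacrebleu data/test.de --metrics bleu chrf -b -w 3 -f text
+```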
+
+Additionally, if you want to compute the Comet score, there's a helper script:
+```
+./scripts/comet-score.sh hyp.txt src.txt ref.txt
+```
+This returns the Comet score for `hyp.txt`, the translation output, given
+`src.txt`, the source input, and `ref.txt`, the reference translation.
+
+### Results
+Here we tabulate the BLEU, chrF2 and Comet scores for our model. For each of
+the metrics, a larger score is better. You should achieve similar results with
+your own run!
+
+These are the results from decoding with the best-BLEU model:
+
+| Test   | BLEU   | chrF2  | Comet  |
+|--------|--------|--------|--------|
+| WMT20  | 24.573 | 52.368 | 0.1795 |
+| WMT19^ | 37.185 | 62.628 | 0.3312 |
+| WMT18  | 40.140 | 65.281 | 0.5363 |
+| WMT17  | 26.832 | 56.096 | 0.4061 |
+| WMT16  | 33.245 | 60.534 | 0.4552 |
+
+**^** Note that WMT19 was used as the validation set!
+
+## Going Further
+If you want to improve on these results, you can continue training for longer,
+or incorporate other datasets from the WMT21 task. Take a look at the other
+examples and think about implementing some data augmentation through
+back-translation.
+
+Good luck!
+
+
+[sacrebleu]: https://github.com/mjpost/sacrebleu
+[comet]: https://github.com/Unbabel/COMET
+[moses]: https://github.com/moses-smt/mosesdecoder
+
+[news task]: https://www.statmt.org/wmt21/translation-task.html
+
+[sentencepiece]: https://github.com/google/sentencepiece
+[install_marian]: https://marian-nmt.github.io/docs/#installation
+[install_sentencepiece]: https://marian-nmt.github.io/docs/#sentencepiece
diff --git a/transformer-basic/data/README.md b/transformer-basic/data/README.md
new file mode 100644
index 0000000..5a6c375
--- /dev/null
+++ b/transformer-basic/data/README.md
@@ -0,0 +1,22 @@
+# en-de data
+
+## Training
+The training data is a subset of data from the [WMT21] news task.
+
+| Dataset              |     Sentences |
+|----------------------|--------------:|
+| Europarl v10         |     1,828,521 |
+| News Commentary v16  |       398,981 |
+| Common Crawl corpus  |     2,399,123 |
+| **Total**            | **4,626,625** |
+
+## Validation
+The validation set uses the [WMT19] news task test set via [sacrebleu].
+
+## Testing
+Evaluation of the model uses the [WMT20] news task test set via [sacrebleu].
+
+
+[wmt19]: https://www.statmt.org/wmt19/translation-task.html
+[wmt20]: https://www.statmt.org/wmt20/translation-task.html
+[wmt21]: https://www.statmt.org/wmt21/translation-task.html
+[sacrebleu]: https://github.com/mjpost/sacrebleu
diff --git a/transformer-basic/requirements.txt b/transformer-basic/requirements.txt
new file mode 100644
index 0000000..738927a
--- /dev/null
+++ b/transformer-basic/requirements.txt
@@ -0,0 +1,2 @@
+sacrebleu>=2.0.0
+unbabel-comet>=1.0.1
diff --git a/transformer-basic/run-me.sh b/transformer-basic/run-me.sh
new file mode 100755
index 0000000..56390b6
--- /dev/null
+++ b/transformer-basic/run-me.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+MARIAN=../../build
+if [ ! -e $MARIAN/marian ]; then
+    echo "Marian is not found at '$MARIAN'. Please compile it first!"
+    exit 1;
+fi
+
+SRC="en"
+TRG="de"
+
+# Set which GPUs to use for compute
+compute="-d 0"
+if [ $# -ne 0 ]; then
+    compute="-d $@"
+fi
+
+# Setup
+mkdir -p data model evaluation
+
+# Get Data
+./scripts/download-files.sh
+
+# Preprocessing
+./scripts/preprocess-data.sh
+
+
+# Prepare vocab (optional)
+# $MARIAN/spm_train \
+#     --accept_language $SRC,$TRG \
+#     --input data/corpus.clean.$SRC,data/corpus.clean.$TRG \
+#     --model_prefix model/vocab.$SRC$TRG \
+#     --vocab_size 32000
+# mv model/vocab.$SRC$TRG.{model,spm}
+
+# Train
+$MARIAN/marian -c transformer-model.yml \
+    ${compute} --workspace 9000 \
+    --seed 1111 \
+    --after 10e \
+    --model model/model.npz \
+    --train-sets data/corpus.clean.{$SRC,$TRG} \
+    --vocabs model/vocab.$SRC$TRG.spm model/vocab.$SRC$TRG.spm \
+    --dim-vocabs 32000 32000 \
+    --valid-sets data/valid.{$SRC,$TRG} \
+    --log model/train.log --valid-log model/valid.log
+
+# Decoding
+SB_OPTS="--metrics bleu chrf -b -w 3 -f text" # options for sacrebleu
+mkdir -p evaluation
+echo "Evaluating test set"
+cat data/test.$SRC \
+    | $MARIAN/marian-decoder \
+        -c model/model.npz.best-bleu.npz.decoder.yml \
+        ${compute} \
+        --log evaluation/testset_decoding.log \
+    | tee evaluation/testset_output.txt \
+    | sacrebleu data/test.$TRG ${SB_OPTS}
+
+# Run comet-score
+./scripts/comet-score.sh evaluation/testset_output.txt data/test.$SRC data/test.$TRG
+
+# Run comparison of WMT tests
+for test in wmt{16,17,18,19,20}; do
+    echo "Evaluating ${test} test set"
+    sacrebleu -t $test -l $SRC-$TRG --echo src \
+        | $MARIAN/marian-decoder \
+            -c model/model.npz.best-bleu.npz.decoder.yml \
+            ${compute} \
+            --log evaluation/${test}_decoding.log \
+            --quiet --quiet-translation \
+        | tee evaluation/${test}_output.txt \
+        | sacrebleu -t $test -l $SRC-$TRG ${SB_OPTS}
+    ./scripts/comet-score.sh evaluation/${test}_output.txt <(sacrebleu -t $test -l $SRC-$TRG --echo src) <(sacrebleu -t $test -l $SRC-$TRG --echo ref)
+done
diff --git a/transformer-basic/scripts/comet-score.sh b/transformer-basic/scripts/comet-score.sh
new file mode 100755
index 0000000..64f5149
--- /dev/null
+++ b/transformer-basic/scripts/comet-score.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Compute Comet score
+# Perform on CPU to avoid competing for GPU memory
+
+# Usage:
+#   1) Score against default validation set
+#      ./scripts/comet-score.sh hypothesis.txt
+#   2) Score against a different source/reference
+#      ./scripts/comet-score.sh hypothesis.txt source.txt reference.txt
+
+if [[ "$#" -eq 1 ]]; then
+    src="data/valid.en"
+    ref="data/valid.de"
+elif [[ "$#" -eq 3 ]]; then
+    src=$2
+    ref=$3
+else
+    echo "Usage: $0 hypothesis.txt [source.txt reference.txt]"
+    exit 1
+fi
+
+trg=$1
+
+comet-score \
+    --gpus 0 \
+    -s ${src} \
+    -t ${trg} \
+    -r ${ref} \
+    --model wmt20-comet-da \
+    2> ./scripts/.comet.stderr.log \
+    | tail -1 \
+    | grep -oP "([+-]?\d+\.\d+)"
diff --git a/transformer-basic/scripts/download-files.sh b/transformer-basic/scripts/download-files.sh
new file mode 100755
index 0000000..366fc02
--- /dev/null
+++ b/transformer-basic/scripts/download-files.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cd data
+echo "Downloading data"
+# Get en-de for training WMT21
+wget -nc https://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz 2> /dev/null
+wget -nc https://data.statmt.org/news-commentary/v16/training/news-commentary-v16.de-en.tsv.gz 2> /dev/null
+wget -nc https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz 2> /dev/null
+
+# Dev Sets
+sacrebleu -t wmt19 -l en-de --echo src > valid.en
+sacrebleu -t wmt19 -l en-de --echo ref > valid.de
+
+# Test Sets
+sacrebleu -t wmt20 -l en-de --echo src > test.en
+sacrebleu -t wmt20 -l en-de --echo ref > test.de
+
+# Uncompress
+for compressed in europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv; do
+    if [ ! -e $compressed ]; then
+        gzip --keep -q -d $compressed.gz
+    fi
+done
+
+tar xf training-parallel-commoncrawl.tgz
+
+# Corpus
+if [ ! -e corpus.de ] || [ ! -e corpus.en ]; then
+    # TSVs
+    cat europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv | cut -f 1 > corpus.de
+    cat europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv | cut -f 2 > corpus.en
+
+    # Plain text
+    cat commoncrawl.de-en.de >> corpus.de
+    cat commoncrawl.de-en.en >> corpus.en
+fi
+
+echo "Corpus prepared"
diff --git a/transformer-basic/scripts/preprocess-data.sh b/transformer-basic/scripts/preprocess-data.sh
new file mode 100755
index 0000000..e6e8c6a
--- /dev/null
+++ b/transformer-basic/scripts/preprocess-data.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+MOSES_SCRIPTS="$PWD/../tools/moses-scripts/scripts"
+
+SRC="en"
+TRG="de"
+
+cd data
+if [ -e corpus.clean.$SRC ] && [ -e corpus.clean.$TRG ]; then
+    echo "No action needed"
+    exit 0
+fi
+
+
+for lang in $SRC $TRG; do
+    # Remove non-printing characters
+    cat corpus.$lang \
+        | perl $MOSES_SCRIPTS/tokenizer/remove-non-printing-char.perl \
+        > .corpus.norm.$lang
+    # | perl $MOSES_SCRIPTS/tokenizer/normalize-punctuation.perl -l $lang \ # could optionally norm quotes
+done
+
+# Constrain length to between 1 and 100 words
+perl $MOSES_SCRIPTS/training/clean-corpus-n.perl .corpus.norm $SRC $TRG .corpus.trim 1 100
+
+# Deduplicate
+paste <(cat .corpus.trim.$SRC) <(cat .corpus.trim.$TRG) \
+    | LC_ALL=C sort -S 50% | uniq \
+    > .corpus.uniq.$SRC$TRG.tsv
+
+cat .corpus.uniq.$SRC$TRG.tsv | cut -f 1 > corpus.clean.$SRC
+cat .corpus.uniq.$SRC$TRG.tsv | cut -f 2 > corpus.clean.$TRG
+
+# Clean up
+rm .corpus.*
diff --git a/transformer-basic/transformer-model.yml b/transformer-basic/transformer-model.yml
new file mode 100644
index 0000000..fb45979
--- /dev/null
+++ b/transformer-basic/transformer-model.yml
@@ -0,0 +1,46 @@
+# Model options
+type: transformer
+dim-emb: 512
+enc-depth: 6
+dec-depth: 6
+tied-embeddings-all: true
+transformer-heads: 8
+transformer-dim-ffn: 2048
+transformer-ffn-activation: relu
+transformer-preprocess: ""
+transformer-postprocess: dan
+transformer-dropout: 0.1
+
+# Training options
+cost-type: ce-mean-words
+max-length: 100
+mini-batch: 1000
+mini-batch-fit: true
+maxi-batch: 1000
+optimizer-params:
+  - 0.9
+  - 0.98
+  - 1e-09
+sync-sgd: true
+learn-rate: 0.0003
+lr-decay-inv-sqrt:
+  - 16000
+lr-warmup: 16000
+label-smoothing: 0.1
+clip-norm: 0
+exponential-smoothing: 0.0001
+disp-freq: 1000
+disp-first: 10
+save-freq: 2ku
+early-stopping: 10
+
+# Validation set options
+keep-best: true
+beam-size: 8
+normalize: 1
+valid-freq: 2ku
+valid-metrics:
+  - ce-mean-words
+  - bleu
+  - perplexity
+valid-mini-batch: 16
--
cgit v1.2.3