author     Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-03-10 03:08:12 +0300
committer  Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-03-10 03:08:12 +0300
commit     5aee613746bb86b38ea342816e9afe7ab45caa6f (patch)
tree       18753e928bac1e449f86e034488e4552e44fc5b5
parent     9477cc56b709324b5fa7d1cd66086eb472e269cc (diff)
Diffstat:
-rw-r--r--   wmt2017-transformer/.gitignore                       |   3
-rw-r--r--   wmt2017-transformer/README.md                        |  67
-rwxr-xr-x   wmt2017-transformer/run-me.sh                        | 203
-rwxr-xr-x   wmt2017-transformer/scripts/download-files-mono.sh   |  14
-rwxr-xr-x   wmt2017-transformer/scripts/download-files.sh        |  23
-rwxr-xr-x   wmt2017-transformer/scripts/preprocess-data-mono.sh  |  23
-rwxr-xr-x   wmt2017-transformer/scripts/preprocess-data.sh       |  58
-rw-r--r--   wmt2017-transformer/scripts/rescore.py               |  25
-rwxr-xr-x   wmt2017-transformer/scripts/validate.en.sh           |   8
-rwxr-xr-x   wmt2017-transformer/scripts/validate.sh              |  10
-rw-r--r--   wmt2017-uedin/.gitignore                             |   4
-rw-r--r--   wmt2017-uedin/README.md                              |  67
-rwxr-xr-x   wmt2017-uedin/run-me.sh                              | 205
-rwxr-xr-x   wmt2017-uedin/scripts/bla.s                          |  59
-rwxr-xr-x   wmt2017-uedin/scripts/download-files-mono.sh         |  14
-rwxr-xr-x   wmt2017-uedin/scripts/download-files.sh              |  23
-rwxr-xr-x   wmt2017-uedin/scripts/preprocess-data-mono.sh        |  23
-rwxr-xr-x   wmt2017-uedin/scripts/preprocess-data.sh             |  64
-rwxr-xr-x   wmt2017-uedin/scripts/validate.en.sh                 |   8
-rwxr-xr-x   wmt2017-uedin/scripts/validate.sh                    |   8
20 files changed, 909 insertions, 0 deletions
diff --git a/wmt2017-transformer/.gitignore b/wmt2017-transformer/.gitignore
new file mode 100644
index 0000000..6053e6f
--- /dev/null
+++ b/wmt2017-transformer/.gitignore
@@ -0,0 +1,3 @@
+data
+model
+model.back
diff --git a/wmt2017-transformer/README.md b/wmt2017-transformer/README.md
new file mode 100644
index 0000000..1b793ce
--- /dev/null
+++ b/wmt2017-transformer/README.md
@@ -0,0 +1,67 @@
+# Example: Training a transformer model
+
+Files and scripts in this folder show how to train a Google-style transformer
+model ([Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)) on WMT-17
+English-German data.
+The problem set has been adapted from the original
+[tensor2tensor](https://github.com/tensorflow/tensor2tensor) repository by
+Google. A common vocabulary of 36,000 BPE subword units is shared between
+both languages. The full `run-me.sh` pipeline additionally trains a backward
+model and adds back-translated monolingual news data to the training corpus.
+
+Assuming four GPUs are available (here 0 1 2 3), execute the command below
+to run the complete example:
+
+```
+./run-me.sh 0 1 2 3
+```
+
+This starts a training run with `marian` using the following command:
+
+```
+../build/marian \
+    --model model/model.npz --type transformer \
+    --train-sets data/corpus.bpe.en data/corpus.bpe.de \
+    --max-length 100 \
+    --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+    --mini-batch-fit -w 7000 --maxi-batch 1000 \
+    --early-stopping 10 \
+    --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
+    --valid-metrics cross-entropy perplexity translation \
+    --valid-sets data/valid.bpe.en data/valid.bpe.de \
+    --valid-script-path ./scripts/validate.sh \
+    --valid-translation-output data/valid.bpe.en.output --quiet-translation \
+    --valid-mini-batch 64 \
+    --beam-size 6 --normalize 0.6 \
+    --log model/train.log --valid-log model/valid.log \
+    --enc-depth 6 --dec-depth 6 \
+    --transformer-heads 8 \
+    --transformer-postprocess-emb d \
+    --transformer-postprocess dan \
+    --transformer-dropout 0.1 --label-smoothing 0.1 \
+    --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
+    --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
+    --tied-embeddings-all \
+    --devices $GPUS --sync-sgd --seed 1111 \
+    --exponential-smoothing
+```
+
+This reproduces a system roughly equivalent to the basic 6-layer transformer
+described in the original paper.
+
+The training setup includes:
+* Fitting mini-batch sizes to 7GB of GPU memory with synchronous SGD (Adam),
+  which results in large mini-batches
+* Validation on an external data set using cross-entropy, perplexity and BLEU
+* A 6-layer (or rather 6-block) encoder and a 6-layer decoder
+* Tied embeddings for source, target and output layer
+* Label smoothing
+* Learning-rate warm-up and cool-down
+* Multi-GPU training
+
+The evaluation is performed on WMT test sets from 2014, 2015 and 2017 using
+[sacreBLEU](https://github.com/mjpost/sacreBLEU), which provides hassle-free
+computation of shareable, comparable, and reproducible BLEU scores. The
+WMT-2016 test set is used as a validation set.
+
+See the basic training example (`marian/examples/training-basics/`) for more
+details.
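As a sanity check after training, a single checkpoint can be used for translation with `marian-decoder`. The following is a minimal sketch, not part of the example scripts: it assumes the input file is already tokenized, truecased and BPE-encoded, and reuses the decoder settings from the command above (beam 6, length normalization 0.6):

```
# translate a BPE-encoded test set with a single model (sketch)
cat data/test2016.bpe.en \
    | ../../build/marian-decoder \
        -c model/model.npz.decoder.yml \
        -d 0 1 2 3 -b 6 --normalize 0.6 \
    | sed 's/\@\@ //g' \
    | ../tools/moses-scripts/scripts/recaser/detruecase.perl \
    | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de \
    > data/test2016.de.output
```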
diff --git a/wmt2017-transformer/run-me.sh b/wmt2017-transformer/run-me.sh
new file mode 100755
index 0000000..6ba4f5b
--- /dev/null
+++ b/wmt2017-transformer/run-me.sh
@@ -0,0 +1,203 @@
+#!/bin/bash -v
+
+MARIAN=../..
+# set chosen gpus
+GPUS=0
+if [ $# -ne 0 ]
+then
+    GPUS=$@
+fi
+echo Using GPUs: $GPUS
+
+if [ ! $WORKSPACE ]
+then
+    WORKSPACE=9500
+fi
+
+N=4
+B=12
+
+if [ ! -e $MARIAN/build/marian ]
+then
+    echo "marian is not installed in ../../build, you need to compile the toolkit first"
+    exit 1
+fi
+
+if [ ! -e ../tools/moses-scripts ] || [ ! -e ../tools/subword-nmt ] || [ ! -e ../tools/sacreBLEU ]
+then
+    echo "missing tools in ../tools, you need to download them first"
+    exit 1
+fi
+
+if [ ! -e "data/corpus.en" ]
+then
+    ./scripts/download-files.sh
+fi
+
+mkdir -p model
+
+# preprocess data
+if [ ! -e "data/corpus.bpe.en" ]
+then
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/valid.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo ref > data/valid.de
+
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt14 -l en-de --echo src > data/test2014.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt15 -l en-de --echo src > data/test2015.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/test2016.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt17 -l en-de --echo src > data/test2017.en
+
+    ./scripts/preprocess-data.sh
+fi
+
+if [ ! -e "data/news.2016.de" ]
+then
+    ./scripts/download-files-mono.sh
+fi
+
+if [ ! -e "data/news.2016.bpe.de" ]
+then
+    ./scripts/preprocess-data-mono.sh
+fi
+
+# create common vocabulary
+if [ ! -e "model/vocab.ende.yml" ]
+then
+    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN/build/marian-vocab --max-size 36000 > model/vocab.ende.yml
+fi
+
+# train backward (de-en) model used for back-translation
+mkdir -p model.back
+if [ ! -e "model.back/model.npz.best-translation.npz" ]
+then
+    $MARIAN/build/marian \
+        --model model.back/model.npz --type s2s \
+        --train-sets data/corpus.bpe.de data/corpus.bpe.en \
+        --max-length 100 \
+        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --mini-batch-fit -w 3500 --maxi-batch 1000 \
+        --valid-freq 10000 --save-freq 10000 --disp-freq 100 \
+        --valid-metrics cross-entropy translation \
+        --valid-script-path ./scripts/validate.en.sh \
+        --valid-translation-output data/valid.bpe.de.output --quiet-translation \
+        --valid-sets data/valid.bpe.de data/valid.bpe.en \
+        --valid-mini-batch 64 --beam-size 12 --normalize=1 \
+        --overwrite --keep-best \
+        --early-stopping 5 --after-epochs 10 \
+        --log model.back/train.log --valid-log model.back/valid.log \
+        --tied-embeddings-all --layer-normalization \
+        --devices $GPUS --seed 1111 \
+        --exponential-smoothing
+fi
+
+if [ ! -e "data/news.2016.bpe.en" ]
+then
+    $MARIAN/build/marian-decoder \
+        -c model.back/model.npz.best-translation.npz.decoder.yml \
+        -i data/news.2016.bpe.de \
+        -b 6 --normalize=1 -w 2500 -d $GPUS \
+        --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \
+        --max-length 200 --max-length-crop \
+        > data/news.2016.bpe.en
+fi
+
+if [ ! -e "data/all.bpe.en" ]
+then
+    cat data/corpus.bpe.en data/corpus.bpe.en data/news.2016.bpe.en > data/all.bpe.en
+    cat data/corpus.bpe.de data/corpus.bpe.de data/news.2016.bpe.de > data/all.bpe.de
+fi
-e "model/ens$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens$i/model.npz --type transformer \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens$i/train.log --valid-log model/ens$i/valid.log \ + --enc-depth 6 --dec-depth 6 \ + --tied-embeddings-all \ + --transformer-dropout 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i \ + --exponential-smoothing + fi +done + +for i in `seq 1 $N` +do + mkdir -p model/ens-rtl$i + # train model + if [ ! -e "model/ens-rtl$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens-rtl$i/model.npz --type transformer \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \ + --enc-depth 6 --dec-depth 6 \ + --tied-embeddings-all \ + --transformer-dropout 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i$i \ + --exponential-smoothing --right-left + fi +done + + +# translate test sets +for prefix in valid test2014 test2015 test2017 +do + cat data/$prefix.bpe.en \ + | $MARIAN/build/marian-decoder -c model/ens1/model.npz.best-translation.npz.decoder.yml \ + -m model/ens?/model.npz.best-translation.npz -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \ + > data/$prefix.bpe.en.output.nbest.0 + + for i in $(seq 1 $N) + do + $MARIAN/build/marian-scorer -m model/ens-rtl$i/model.npz.best-cross-entropy.npz \ + -v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \ + -t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i + done + + cat data/$prefix.bpe.en.output.nbest.$N \ + | python scripts/rescore.py \ + | perl -pe 's/@@ //g' \ + | ../tools/moses-scripts/scripts/recaser/detruecase.perl \ + | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl > data/$prefix.en.output +done + +# calculate bleu scores on test sets +LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l 
diff --git a/wmt2017-transformer/scripts/download-files-mono.sh b/wmt2017-transformer/scripts/download-files-mono.sh
new file mode 100755
index 0000000..ba2c39a
--- /dev/null
+++ b/wmt2017-transformer/scripts/download-files-mono.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get German monolingual news data for WMT17
+wget -nc http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz
+
+# sample 10M sentences with fewer than 100 tokens
+zcat news.2016.de.shuffled.gz | shuf -n 11000000 | perl -ne 'print if(split(/\s/, $_) < 100)' | head -n 10000000 > news.2016.de
+
+# clean
+rm news.2016.de.shuffled.gz
+
+cd ..
diff --git a/wmt2017-transformer/scripts/download-files.sh b/wmt2017-transformer/scripts/download-files.sh
new file mode 100755
index 0000000..99f880a
--- /dev/null
+++ b/wmt2017-transformer/scripts/download-files.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get En-De training data for WMT17
+wget -nc http://www.statmt.org/europarl/v7/de-en.tgz
+wget -nc http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
+wget -nc http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
+
+# extract data
+tar -xf de-en.tgz
+tar -xf training-parallel-commoncrawl.tgz
+tar -xf training-parallel-nc-v12.tgz
+
+# create corpus files
+cat europarl-v7.de-en.de commoncrawl.de-en.de training/news-commentary-v12.de-en.de > corpus.de
+cat europarl-v7.de-en.en commoncrawl.de-en.en training/news-commentary-v12.de-en.en > corpus.en
+
+# clean
+rm -r europarl-* commoncrawl.* training/ *.tgz
+
+cd ..
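The monolingual script above deliberately oversamples (11M lines) before the length filter so that the final `head` can still deliver 10M sentences of fewer than 100 tokens. A quick check after it finishes; the expected count follows from the `head -n` limit:

```
wc -l data/news.2016.de   # expected: 10000000
```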
diff --git a/wmt2017-transformer/scripts/preprocess-data-mono.sh b/wmt2017-transformer/scripts/preprocess-data-mono.sh
new file mode 100755
index 0000000..b327e9f
--- /dev/null
+++ b/wmt2017-transformer/scripts/preprocess-data-mono.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+# suffixes of source and target language files
+SRC=en
+TRG=de
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+
+prefix=news.2016
+
+cat data/$prefix.$TRG \
+    | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+    | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+
+$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+
+$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
diff --git a/wmt2017-transformer/scripts/preprocess-data.sh b/wmt2017-transformer/scripts/preprocess-data.sh
new file mode 100755
index 0000000..3a968a5
--- /dev/null
+++ b/wmt2017-transformer/scripts/preprocess-data.sh
@@ -0,0 +1,58 @@
+#!/bin/bash -v
+
+# suffix of source language files
+SRC=en
+
+# suffix of target language files
+TRG=de
+
+# number of merge operations
+bpe_operations=32000
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    cat data/$prefix.$SRC \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
+
+    test -f data/$prefix.$TRG || continue
+
+    cat data/$prefix.$TRG \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+done
+
+# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
+mv data/corpus.tok.$SRC data/corpus.tok.uncleaned.$SRC
+mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
+$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
+
+# train truecaser
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
+
+# apply truecaser (cleaned training corpus)
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    test -f data/$prefix.tok.$TRG || continue
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+done
+
+# train BPE
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+
+# apply BPE
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    test -f data/$prefix.tc.$TRG || continue
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+done
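At translation time, any new input has to pass through the same Moses and subword-nmt pipeline before it is fed to the model. A minimal sketch for one English sentence, reusing the truecasing model and BPE codes produced by the script above (`model/tc.en` and `model/ende.bpe`):

```
echo "A new English sentence to translate." \
    | ../tools/moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en \
    | ../tools/moses-scripts/scripts/tokenizer/tokenizer.perl -a -l en \
    | ../tools/moses-scripts/scripts/recaser/truecase.perl -model model/tc.en \
    | ../tools/subword-nmt/apply_bpe.py -c model/ende.bpe
```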
diff --git a/wmt2017-transformer/scripts/rescore.py b/wmt2017-transformer/scripts/rescore.py
new file mode 100644
index 0000000..f5ecfa9
--- /dev/null
+++ b/wmt2017-transformer/scripts/rescore.py
@@ -0,0 +1,25 @@
+import sys
+
+lastNum = 0
+bestScore = -99999
+bestLine = ""
+for line in sys.stdin:
+    line = line.rstrip("\n")
+    fields = line.split(" ||| ")
+    score = sum(float(s) for s in fields[2].split(" ") if s[-1] != "=")
+    length = float(len(fields[1].split(" ")) + 1)
+
+    score = score / length
+
+    num = int(fields[0])
+    if num > lastNum:
+        print(bestLine)
+        bestScore = -99999
+        bestLine = fields[1]
+        lastNum = num
+
+    if score > bestScore:
+        bestScore = score
+        bestLine = fields[1]
+
+print(bestLine)
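`rescore.py` reads a Moses-style n-best list (`id ||| hypothesis ||| features ||| total`) from stdin, sums all feature values on each line while skipping the feature-name tokens (those ending in `=`), divides by the hypothesis length plus one for the end-of-sentence token, and prints the best hypothesis for each sentence id. Standalone usage, with the file name taken from the last rescoring step in `run-me.sh`:

```
python scripts/rescore.py < data/valid.bpe.en.output.nbest.4 > data/valid.bpe.en.best
```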
diff --git a/wmt2017-transformer/scripts/validate.en.sh b/wmt2017-transformer/scripts/validate.en.sh
new file mode 100755
index 0000000..4400d63
--- /dev/null
+++ b/wmt2017-transformer/scripts/validate.en.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+cat $1 \
+    | sed 's/\@\@ //g' \
+    | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \
+    | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l en 2>/dev/null \
+    | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.en \
+    | sed -r 's/BLEU = ([0-9.]+),.*/\1/'
diff --git a/wmt2017-transformer/scripts/validate.sh b/wmt2017-transformer/scripts/validate.sh
new file mode 100755
index 0000000..a2b4945
--- /dev/null
+++ b/wmt2017-transformer/scripts/validate.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+export LC_ALL=C.UTF-8
+
+cat $1 \
+    | sed 's/\@\@ //g' \
+    | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \
+    | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de 2>/dev/null \
+    | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.de \
+    | sed -r 's/BLEU = ([0-9.]+),.*/\1/'
diff --git a/wmt2017-uedin/.gitignore b/wmt2017-uedin/.gitignore
new file mode 100644
index 0000000..c201e72
--- /dev/null
+++ b/wmt2017-uedin/.gitignore
@@ -0,0 +1,4 @@
+data
+model
+model.back
+
diff --git a/wmt2017-uedin/README.md b/wmt2017-uedin/README.md
new file mode 100644
index 0000000..1b793ce
--- /dev/null
+++ b/wmt2017-uedin/README.md
@@ -0,0 +1,67 @@
+# Example: Reproducing Edinburgh's WMT17 system
+
+Files and scripts in this folder show how to train a deep RNN model similar
+to the University of Edinburgh's submission to the WMT-17 shared task on
+English-German news translation. A common vocabulary of 36,000 BPE subword
+units is shared between both languages, and back-translated monolingual news
+data is added to the parallel training data.
+
+Assuming four GPUs are available (here 0 1 2 3), execute the command below
+to run the complete example:
+
+```
+./run-me.sh 0 1 2 3
+```
+
+Among other steps, this trains the left-to-right ensemble members with
+`marian` using the following command (shown here for the first member):
+
+```
+../../build/marian \
+    --model model/ens1/model.npz --type s2s \
+    --train-sets data/all.bpe.en data/all.bpe.de \
+    --max-length 100 \
+    --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+    --mini-batch-fit -w 9500 --mini-batch 1000 --maxi-batch 1000 \
+    --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
+    --valid-metrics cross-entropy translation \
+    --valid-sets data/valid.bpe.en data/valid.bpe.de \
+    --valid-script-path ./scripts/validate.sh \
+    --valid-translation-output data/valid.bpe.en.output --quiet-translation \
+    --beam-size 12 --normalize=1 \
+    --valid-mini-batch 64 \
+    --overwrite --keep-best \
+    --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \
+    --log model/ens1/train.log --valid-log model/ens1/valid.log \
+    --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \
+    --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \
+    --tied-embeddings-all --layer-normalization \
+    --dropout-rnn 0.1 --label-smoothing 0.1 \
+    --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
+    --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
+    --devices $GPUS --sync-sgd --seed 1111 \
+    --exponential-smoothing
+```
+
+The training setup includes:
+* Fitting mini-batch sizes to available GPU memory with synchronous SGD (Adam)
+* Back-translation of monolingual news data with a backward de-en model
+* A deep-transition bidirectional encoder (cell depth 4) and a deep-transition
+  decoder (base cell depth 8)
+* Tied embeddings for source, target and output layer, with layer normalization
+* Label smoothing
+* Learning-rate warm-up and cool-down
+* Multi-GPU training
+* Ensembles of four left-to-right and four right-to-left models combined via
+  n-best rescoring
+
+The evaluation is performed on WMT test sets from 2014, 2015 and 2017 using
+[sacreBLEU](https://github.com/mjpost/sacreBLEU), which provides hassle-free
+computation of shareable, comparable, and reproducible BLEU scores. The
+WMT-2016 test set is used as a validation set.
+
+See the basic training example (`marian/examples/training-basics/`) for more
+details.
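For reference, decoding with all four left-to-right models as an ensemble follows the same pattern as the test-set translation loop in `run-me.sh` below; passing several `-m` models makes `marian-decoder` ensemble them, and the `ens?` glob expands to `ens1` through `ens4`:

```
cat data/test2017.bpe.en \
    | ../../build/marian-decoder \
        -c model/ens1/model.npz.best-translation.npz.decoder.yml \
        -m model/ens?/model.npz.best-translation.npz \
        -d 0 1 2 3 --beam-size 12 --normalize=1 \
    > data/test2017.bpe.en.output
```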
-e "data/corpus.bpe.en" ] +then + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/valid.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo ref > data/valid.de + + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt14 -l en-de --echo src > data/test2014.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt15 -l en-de --echo src > data/test2015.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/test2016.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt17 -l en-de --echo src > data/test2017.en + + ./scripts/preprocess-data.sh +fi + +if [ ! -e "data/news.2016.de" ] +then + ./scripts/download-files-mono.sh +fi + +if [ ! -e "data/news.2016.bpe.de" ] +then + ./scripts/preprocess-data-mono.sh +fi + +# create common vocabulary +if [ ! -e "model/vocab.ende.yml" ] +then + cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN/build/marian-vocab --max-size 36000 > model/vocab.ende.yml +fi + +# train model +mkdir -p model.back +if [ ! -e "model.back/model.npz.best-translation.npz" ] +then + $MARIAN/build/marian \ + --model model.back/model.npz --type s2s \ + --train-sets data/corpus.bpe.de data/corpus.bpe.en \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w 3500 --maxi-batch 1000 \ + --valid-freq 10000 --save-freq 10000 --disp-freq 100 \ + --valid-metrics cross-entropy translation \ + --valid-script-path ./scripts/validate.en.sh \ + --valid-translation-output data/valid.bpe.de.output --quiet-translation \ + --valid-sets data/valid.bpe.de data/valid.bpe.en \ + --valid-mini-batch 64 --beam-size 12 --normalize=1 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 10 \ + --log model.back/train.log --valid-log model.back/valid.log \ + --tied-embeddings-all --layer-normalization \ + --devices $GPUS --seed 1111 \ + --exponential-smoothing +fi + +if [ ! -e "data/news.2016.bpe.en" ] +then + $MARIAN/build/marian-decoder \ + -c model.back/model.npz.best-translation.npz.decoder.yml \ + -i data/news.2016.bpe.de \ + -b 6 --normalize=1 -w 2500 -d $GPUS \ + --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \ + --max-length 200 --max-length-crop \ + > data/news.2016.bpe.en +fi + +if [ ! -e "data/all.bpe.en" ] +then + cat data/corpus.bpe.en data/corpus.bpe.en data/news.2016.bpe.en > data/all.bpe.en + cat data/corpus.bpe.de data/corpus.bpe.de data/news.2016.bpe.de > data/all.bpe.de +fi + +for i in `seq 1 $N` +do + mkdir -p model/ens$i + # train model + if [ ! 
-e "model/ens$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens$i/model.npz --type s2s \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens$i/train.log --valid-log model/ens$i/valid.log \ + --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \ + --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \ + --tied-embeddings-all --layer-normalization \ + --dropout-rnn 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i \ + --exponential-smoothing + fi +done + +for i in `seq 1 $N` +do + mkdir -p model/ens-rtl$i + # train model + if [ ! -e "model/ens-rtl$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens-rtl$i/model.npz --type s2s \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \ + --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \ + --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \ + --tied-embeddings-all --layer-normalization \ + --transformer-dropout 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i$i \ + --exponential-smoothing --right-left + fi +done + + +# translate test sets +for prefix in valid test2014 test2015 test2017 +do + cat data/$prefix.bpe.en \ + | $MARIAN/build/marian-decoder -c model/ens1/model.npz.best-translation.npz.decoder.yml \ + -m model/ens?/model.npz.best-translation.npz -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \ + > data/$prefix.bpe.en.output.nbest.0 + + for i in $(seq 1 $N) + do + $MARIAN/build/marian-scorer -m model/ens-rtl$i/model.npz.best-cross-entropy.npz \ + -v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \ + -t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i + done + + cat data/$prefix.bpe.en.output.nbest.$N \ + | python scripts/rescore.py \ + | perl -pe 's/@@ //g' \ + | 
+        | ../tools/moses-scripts/scripts/recaser/detruecase.perl \
+        | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl > data/$prefix.en.output
+done
+
+# calculate bleu scores on test sets
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de < data/valid.en.output
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt14 -l en-de < data/test2014.en.output
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt15 -l en-de < data/test2015.en.output
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt17 -l en-de < data/test2017.en.output
diff --git a/wmt2017-uedin/scripts/bla.s b/wmt2017-uedin/scripts/bla.s
new file mode 100755
index 0000000..ae07a92
--- /dev/null
+++ b/wmt2017-uedin/scripts/bla.s
@@ -0,0 +1,59 @@
+#!/bin/bash -v
+
+# suffix of source language files
+SRC=en
+
+# suffix of target language files
+TRG=de
+
+# number of merge operations
+bpe_operations=32000
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+for prefix in valid test2014 test2015 test2016 test2017
+do
+    cat data/$prefix.$SRC \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
+
+    test -f data/$prefix.$TRG || continue
+
+    cat data/$prefix.$TRG \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+done
+
+if [ ! -e "model/tc.$TRG" ]
+then
+    # train truecaser
+    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
+    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
+fi
+
+# apply truecaser (cleaned training corpus)
+for prefix in valid test2014 test2015 test2016 test2017
+do
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    test -f data/$prefix.tok.$TRG || continue
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+done
+
+if [ ! -e "model/$SRC$TRG.bpe" ]
+then
+    # train BPE
+    cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+fi
+
+# apply BPE
+for prefix in valid test2014 test2015 test2016 test2017
+do
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    test -f data/$prefix.tc.$TRG || continue
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+done
diff --git a/wmt2017-uedin/scripts/download-files-mono.sh b/wmt2017-uedin/scripts/download-files-mono.sh
new file mode 100755
index 0000000..ba2c39a
--- /dev/null
+++ b/wmt2017-uedin/scripts/download-files-mono.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get German monolingual news data for WMT17
+wget -nc http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz
+
+# sample 10M sentences with fewer than 100 tokens
+zcat news.2016.de.shuffled.gz | shuf -n 11000000 | perl -ne 'print if(split(/\s/, $_) < 100)' | head -n 10000000 > news.2016.de
+
+# clean
+rm news.2016.de.shuffled.gz
+
+cd ..
diff --git a/wmt2017-uedin/scripts/download-files.sh b/wmt2017-uedin/scripts/download-files.sh
new file mode 100755
index 0000000..99f880a
--- /dev/null
+++ b/wmt2017-uedin/scripts/download-files.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get En-De training data for WMT17
+wget -nc http://www.statmt.org/europarl/v7/de-en.tgz
+wget -nc http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
+wget -nc http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
+
+# extract data
+tar -xf de-en.tgz
+tar -xf training-parallel-commoncrawl.tgz
+tar -xf training-parallel-nc-v12.tgz
+
+# create corpus files
+cat europarl-v7.de-en.de commoncrawl.de-en.de training/news-commentary-v12.de-en.de > corpus.de
+cat europarl-v7.de-en.en commoncrawl.de-en.en training/news-commentary-v12.de-en.en > corpus.en
+
+# clean
+rm -r europarl-* commoncrawl.* training/ *.tgz
+
+cd ..
diff --git a/wmt2017-uedin/scripts/preprocess-data-mono.sh b/wmt2017-uedin/scripts/preprocess-data-mono.sh
new file mode 100755
index 0000000..b327e9f
--- /dev/null
+++ b/wmt2017-uedin/scripts/preprocess-data-mono.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+# suffixes of source and target language files
+SRC=en
+TRG=de
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+
+prefix=news.2016
+
+cat data/$prefix.$TRG \
+    | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+    | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+
+$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+
+$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
diff --git a/wmt2017-uedin/scripts/preprocess-data.sh b/wmt2017-uedin/scripts/preprocess-data.sh
new file mode 100755
index 0000000..f02f597
--- /dev/null
+++ b/wmt2017-uedin/scripts/preprocess-data.sh
@@ -0,0 +1,64 @@
+#!/bin/bash -v
+
+# suffix of source language files
+SRC=en
+
+# suffix of target language files
+TRG=de
+
+# number of merge operations
+bpe_operations=32000
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    cat data/$prefix.$SRC \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
+
+    test -f data/$prefix.$TRG || continue
+
+    cat data/$prefix.$TRG \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+done
+
+# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
+mv data/corpus.tok.$SRC data/corpus.tok.uncleaned.$SRC
+mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
+$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
+
-e "model/tc.$TRG" ] +then + # train truecaser + $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC + $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG +fi + +# apply truecaser (cleaned training corpus) +for prefix in corpus valid test2014 test2015 test2016 test2017 +do + $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC + test -f data/$prefix.tok.$TRG || continue + $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG +done + +if [ ! -e "model/$SRC$TRG.bpe" ] +then + # train BPE + cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe +fi + +# apply BPE +for prefix in corpus valid test2014 test2015 test2016 test2017 +do + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC + test -f data/$prefix.tc.$TRG || continue + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG +done diff --git a/wmt2017-uedin/scripts/validate.en.sh b/wmt2017-uedin/scripts/validate.en.sh new file mode 100755 index 0000000..4400d63 --- /dev/null +++ b/wmt2017-uedin/scripts/validate.en.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cat $1 \ + | sed 's/\@\@ //g' \ + | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \ + | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l en 2>/dev/null \ + | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.en \ + | sed -r 's/BLEU = ([0-9.]+),.*/\1/' diff --git a/wmt2017-uedin/scripts/validate.sh b/wmt2017-uedin/scripts/validate.sh new file mode 100755 index 0000000..ee93bc6 --- /dev/null +++ b/wmt2017-uedin/scripts/validate.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cat $1 \ + | sed 's/\@\@ //g' \ + | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \ + | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de 2>/dev/null \ + | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.de \ + | sed -r 's/BLEU = ([0-9.]+),.*/\1/' |