author     Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-03-10 03:08:12 +0300
committer  Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-03-10 03:08:12 +0300
commit     5aee613746bb86b38ea342816e9afe7ab45caa6f (patch)
tree       18753e928bac1e449f86e034488e4552e44fc5b5
parent     9477cc56b709324b5fa7d1cd66086eb472e269cc (diff)
Diffstat:
-rw-r--r--   wmt2017-transformer/.gitignore                       |   3
-rw-r--r--   wmt2017-transformer/README.md                        |  67
-rwxr-xr-x   wmt2017-transformer/run-me.sh                        | 203
-rwxr-xr-x   wmt2017-transformer/scripts/download-files-mono.sh   |  14
-rwxr-xr-x   wmt2017-transformer/scripts/download-files.sh        |  23
-rwxr-xr-x   wmt2017-transformer/scripts/preprocess-data-mono.sh  |  23
-rwxr-xr-x   wmt2017-transformer/scripts/preprocess-data.sh       |  58
-rw-r--r--   wmt2017-transformer/scripts/rescore.py               |  25
-rwxr-xr-x   wmt2017-transformer/scripts/validate.en.sh           |   8
-rwxr-xr-x   wmt2017-transformer/scripts/validate.sh              |  10
-rw-r--r--   wmt2017-uedin/.gitignore                             |   4
-rw-r--r--   wmt2017-uedin/README.md                              |  67
-rwxr-xr-x   wmt2017-uedin/run-me.sh                              | 205
-rwxr-xr-x   wmt2017-uedin/scripts/bla.s                          |  59
-rwxr-xr-x   wmt2017-uedin/scripts/download-files-mono.sh         |  14
-rwxr-xr-x   wmt2017-uedin/scripts/download-files.sh              |  23
-rwxr-xr-x   wmt2017-uedin/scripts/preprocess-data-mono.sh        |  23
-rwxr-xr-x   wmt2017-uedin/scripts/preprocess-data.sh             |  64
-rwxr-xr-x   wmt2017-uedin/scripts/validate.en.sh                 |   8
-rwxr-xr-x   wmt2017-uedin/scripts/validate.sh                    |   8
20 files changed, 909 insertions, 0 deletions
diff --git a/wmt2017-transformer/.gitignore b/wmt2017-transformer/.gitignore
new file mode 100644
index 0000000..6053e6f
--- /dev/null
+++ b/wmt2017-transformer/.gitignore
@@ -0,0 +1,3 @@
+data
+model
+model.back
diff --git a/wmt2017-transformer/README.md b/wmt2017-transformer/README.md
new file mode 100644
index 0000000..1b793ce
--- /dev/null
+++ b/wmt2017-transformer/README.md
@@ -0,0 +1,67 @@
+# Example: Training a transformer model
+
+Files and scripts in this folder show how to train a Google-style transformer
+model ([Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)) on WMT-17
+English-German data.
+The problem set has been adapted from the original
+[tensor2tensor](https://github.com/tensorflow/tensor2tensor) repository by
+Google. A common vocabulary of 36,000 BPE subword units is shared between
+both languages. The full `run-me.sh` pipeline additionally trains a backward
+model and adds back-translated monolingual news data to the training corpus.
+
+Assuming four GPUs are available (here 0 1 2 3), execute the command below
+to run the complete example:
+
+```
+./run-me.sh 0 1 2 3
+```
+
+This starts a training run with `marian` using the following command:
+
+```
+../build/marian \
+    --model model/model.npz --type transformer \
+    --train-sets data/corpus.bpe.en data/corpus.bpe.de \
+    --max-length 100 \
+    --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+    --mini-batch-fit -w 7000 --maxi-batch 1000 \
+    --early-stopping 10 \
+    --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
+    --valid-metrics cross-entropy perplexity translation \
+    --valid-sets data/valid.bpe.en data/valid.bpe.de \
+    --valid-script-path ./scripts/validate.sh \
+    --valid-translation-output data/valid.bpe.en.output --quiet-translation \
+    --valid-mini-batch 64 \
+    --beam-size 6 --normalize 0.6 \
+    --log model/train.log --valid-log model/valid.log \
+    --enc-depth 6 --dec-depth 6 \
+    --transformer-heads 8 \
+    --transformer-postprocess-emb d \
+    --transformer-postprocess dan \
+    --transformer-dropout 0.1 --label-smoothing 0.1 \
+    --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
+    --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
+    --tied-embeddings-all \
+    --devices $GPUS --sync-sgd --seed 1111 \
+    --exponential-smoothing
+```
+
+This reproduces a system roughly equivalent to the basic 6-layer transformer
+described in the original paper.
+
+The training setup includes:
+* Fitting mini-batch sizes to 7GB of GPU memory with synchronous SGD (Adam),
+  which results in large mini-batches
+* Validation on an external data set using cross-entropy, perplexity and BLEU
+* A 6-layer (or rather 6-block) encoder and a 6-layer decoder
+* Tied embeddings for source, target and output layer
+* Label smoothing
+* Learning-rate warm-up and cool-down
+* Multi-GPU training
+
+The evaluation is performed on WMT test sets from 2014, 2015 and 2017 using
+[sacreBLEU](https://github.com/mjpost/sacreBLEU), which provides hassle-free
+computation of shareable, comparable, and reproducible BLEU scores. The
+WMT-2016 test set is used as a validation set.
+
+See the basic training example (`marian/examples/training-basics/`) for more
+details.
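As a sanity check after training, a single checkpoint can be used for translation with `marian-decoder`. The following is a minimal sketch, not part of the example scripts: it assumes the input file is already tokenized, truecased and BPE-encoded, and reuses the decoder settings from the command above (beam 6, length normalization 0.6):

```
# translate a BPE-encoded test set with a single model (sketch)
cat data/test2016.bpe.en \
    | ../../build/marian-decoder \
        -c model/model.npz.decoder.yml \
        -d 0 1 2 3 -b 6 --normalize 0.6 \
    | sed 's/\@\@ //g' \
    | ../tools/moses-scripts/scripts/recaser/detruecase.perl \
    | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de \
    > data/test2016.de.output
```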
diff --git a/wmt2017-transformer/run-me.sh b/wmt2017-transformer/run-me.sh
new file mode 100755
index 0000000..6ba4f5b
--- /dev/null
+++ b/wmt2017-transformer/run-me.sh
@@ -0,0 +1,203 @@
+#!/bin/bash -v
+
+MARIAN=../..
+# set chosen gpus
+GPUS=0
+if [ $# -ne 0 ]
+then
+    GPUS=$@
+fi
+echo Using GPUs: $GPUS
+
+if [ ! $WORKSPACE ]
+then
+    WORKSPACE=9500
+fi
+
+N=4
+B=12
+
+if [ ! -e $MARIAN/build/marian ]
+then
+    echo "marian is not installed in ../../build, you need to compile the toolkit first"
+    exit 1
+fi
+
+if [ ! -e ../tools/moses-scripts ] || [ ! -e ../tools/subword-nmt ] || [ ! -e ../tools/sacreBLEU ]
+then
+    echo "missing tools in ../tools, you need to download them first"
+    exit 1
+fi
+
+if [ ! -e "data/corpus.en" ]
+then
+    ./scripts/download-files.sh
+fi
+
+mkdir -p model
+
+# preprocess data
+if [ ! -e "data/corpus.bpe.en" ]
+then
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/valid.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo ref > data/valid.de
+
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt14 -l en-de --echo src > data/test2014.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt15 -l en-de --echo src > data/test2015.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/test2016.en
+    LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt17 -l en-de --echo src > data/test2017.en
+
+    ./scripts/preprocess-data.sh
+fi
+
+if [ ! -e "data/news.2016.de" ]
+then
+    ./scripts/download-files-mono.sh
+fi
+
+if [ ! -e "data/news.2016.bpe.de" ]
+then
+    ./scripts/preprocess-data-mono.sh
+fi
+
+# create common vocabulary
+if [ ! -e "model/vocab.ende.yml" ]
+then
+    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN/build/marian-vocab --max-size 36000 > model/vocab.ende.yml
+fi
+
+# train backward (de-en) model used for back-translation
+mkdir -p model.back
+if [ ! -e "model.back/model.npz.best-translation.npz" ]
+then
+    $MARIAN/build/marian \
+        --model model.back/model.npz --type s2s \
+        --train-sets data/corpus.bpe.de data/corpus.bpe.en \
+        --max-length 100 \
+        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --mini-batch-fit -w 3500 --maxi-batch 1000 \
+        --valid-freq 10000 --save-freq 10000 --disp-freq 100 \
+        --valid-metrics cross-entropy translation \
+        --valid-script-path ./scripts/validate.en.sh \
+        --valid-translation-output data/valid.bpe.de.output --quiet-translation \
+        --valid-sets data/valid.bpe.de data/valid.bpe.en \
+        --valid-mini-batch 64 --beam-size 12 --normalize=1 \
+        --overwrite --keep-best \
+        --early-stopping 5 --after-epochs 10 \
+        --log model.back/train.log --valid-log model.back/valid.log \
+        --tied-embeddings-all --layer-normalization \
+        --devices $GPUS --seed 1111 \
+        --exponential-smoothing
+fi
+
+if [ ! -e "data/news.2016.bpe.en" ]
+then
+    $MARIAN/build/marian-decoder \
+        -c model.back/model.npz.best-translation.npz.decoder.yml \
+        -i data/news.2016.bpe.de \
+        -b 6 --normalize=1 -w 2500 -d $GPUS \
+        --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \
+        --max-length 200 --max-length-crop \
+        > data/news.2016.bpe.en
+fi
+
+if [ ! -e "data/all.bpe.en" ]
+then
+    cat data/corpus.bpe.en data/corpus.bpe.en data/news.2016.bpe.en > data/all.bpe.en
+    cat data/corpus.bpe.de data/corpus.bpe.de data/news.2016.bpe.de > data/all.bpe.de
+fi
-e "model/ens$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens$i/model.npz --type transformer \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens$i/train.log --valid-log model/ens$i/valid.log \ + --enc-depth 6 --dec-depth 6 \ + --tied-embeddings-all \ + --transformer-dropout 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i \ + --exponential-smoothing + fi +done + +for i in `seq 1 $N` +do + mkdir -p model/ens-rtl$i + # train model + if [ ! -e "model/ens-rtl$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens-rtl$i/model.npz --type transformer \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \ + --enc-depth 6 --dec-depth 6 \ + --tied-embeddings-all \ + --transformer-dropout 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i$i \ + --exponential-smoothing --right-left + fi +done + + +# translate test sets +for prefix in valid test2014 test2015 test2017 +do + cat data/$prefix.bpe.en \ + | $MARIAN/build/marian-decoder -c model/ens1/model.npz.best-translation.npz.decoder.yml \ + -m model/ens?/model.npz.best-translation.npz -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \ + > data/$prefix.bpe.en.output.nbest.0 + + for i in $(seq 1 $N) + do + $MARIAN/build/marian-scorer -m model/ens-rtl$i/model.npz.best-cross-entropy.npz \ + -v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \ + -t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i + done + + cat data/$prefix.bpe.en.output.nbest.$N \ + | python scripts/rescore.py \ + | perl -pe 's/@@ //g' \ + | ../tools/moses-scripts/scripts/recaser/detruecase.perl \ + | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl > data/$prefix.en.output +done + +# calculate bleu scores on test sets +LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l 
diff --git a/wmt2017-transformer/scripts/download-files-mono.sh b/wmt2017-transformer/scripts/download-files-mono.sh
new file mode 100755
index 0000000..ba2c39a
--- /dev/null
+++ b/wmt2017-transformer/scripts/download-files-mono.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get German monolingual news data for WMT17
+wget -nc http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz
+
+# sample 10M sentences with fewer than 100 tokens
+zcat news.2016.de.shuffled.gz | shuf -n 11000000 | perl -ne 'print if(split(/\s/, $_) < 100)' | head -n 10000000 > news.2016.de
+
+# clean
+rm news.2016.de.shuffled.gz
+
+cd ..
diff --git a/wmt2017-transformer/scripts/download-files.sh b/wmt2017-transformer/scripts/download-files.sh
new file mode 100755
index 0000000..99f880a
--- /dev/null
+++ b/wmt2017-transformer/scripts/download-files.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get En-De training data for WMT17
+wget -nc http://www.statmt.org/europarl/v7/de-en.tgz
+wget -nc http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
+wget -nc http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
+
+# extract data
+tar -xf de-en.tgz
+tar -xf training-parallel-commoncrawl.tgz
+tar -xf training-parallel-nc-v12.tgz
+
+# create corpus files
+cat europarl-v7.de-en.de commoncrawl.de-en.de training/news-commentary-v12.de-en.de > corpus.de
+cat europarl-v7.de-en.en commoncrawl.de-en.en training/news-commentary-v12.de-en.en > corpus.en
+
+# clean
+rm -r europarl-* commoncrawl.* training/ *.tgz
+
+cd ..
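The monolingual script above deliberately oversamples (11M lines) before the length filter so that the final `head` can still deliver 10M sentences of fewer than 100 tokens. A quick check after it finishes; the expected count follows from the `head -n` limit:

```
wc -l data/news.2016.de   # expected: 10000000
```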
diff --git a/wmt2017-transformer/scripts/preprocess-data-mono.sh b/wmt2017-transformer/scripts/preprocess-data-mono.sh
new file mode 100755
index 0000000..b327e9f
--- /dev/null
+++ b/wmt2017-transformer/scripts/preprocess-data-mono.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+# suffixes of source and target language files
+SRC=en
+TRG=de
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+
+prefix=news.2016
+
+cat data/$prefix.$TRG \
+    | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+    | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+
+$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+
+$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
diff --git a/wmt2017-transformer/scripts/preprocess-data.sh b/wmt2017-transformer/scripts/preprocess-data.sh
new file mode 100755
index 0000000..3a968a5
--- /dev/null
+++ b/wmt2017-transformer/scripts/preprocess-data.sh
@@ -0,0 +1,58 @@
+#!/bin/bash -v
+
+# suffix of source language files
+SRC=en
+
+# suffix of target language files
+TRG=de
+
+# number of merge operations
+bpe_operations=32000
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    cat data/$prefix.$SRC \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
+
+    test -f data/$prefix.$TRG || continue
+
+    cat data/$prefix.$TRG \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+done
+
+# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
+mv data/corpus.tok.$SRC data/corpus.tok.uncleaned.$SRC
+mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
+$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
+
+# train truecaser
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
+
+# apply truecaser (cleaned training corpus)
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    test -f data/$prefix.tok.$TRG || continue
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+done
+
+# train BPE
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+
+# apply BPE
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    test -f data/$prefix.tc.$TRG || continue
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+done
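At translation time, any new input has to pass through the same Moses and subword-nmt pipeline before it is fed to the model. A minimal sketch for one English sentence, reusing the truecasing model and BPE codes produced by the script above (`model/tc.en` and `model/ende.bpe`):

```
echo "A new English sentence to translate." \
    | ../tools/moses-scripts/scripts/tokenizer/normalize-punctuation.perl -l en \
    | ../tools/moses-scripts/scripts/tokenizer/tokenizer.perl -a -l en \
    | ../tools/moses-scripts/scripts/recaser/truecase.perl -model model/tc.en \
    | ../tools/subword-nmt/apply_bpe.py -c model/ende.bpe
```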
diff --git a/wmt2017-transformer/scripts/rescore.py b/wmt2017-transformer/scripts/rescore.py
new file mode 100644
index 0000000..f5ecfa9
--- /dev/null
+++ b/wmt2017-transformer/scripts/rescore.py
@@ -0,0 +1,25 @@
+import sys
+
+lastNum = 0
+bestScore = -99999
+bestLine = ""
+for line in sys.stdin:
+    line = line.rstrip("\n")
+    fields = line.split(" ||| ")
+    score = sum(float(s) for s in fields[2].split(" ") if s[-1] != "=")
+    length = float(len(fields[1].split(" ")) + 1)
+
+    score = score / length
+
+    num = int(fields[0])
+    if num > lastNum:
+        print(bestLine)
+        bestScore = -99999
+        bestLine = fields[1]
+        lastNum = num
+
+    if score > bestScore:
+        bestScore = score
+        bestLine = fields[1]
+
+print(bestLine)
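`rescore.py` reads a Moses-style n-best list (`id ||| hypothesis ||| features ||| total`) from stdin, sums all feature values on each line while skipping the feature-name tokens (those ending in `=`), divides by the hypothesis length plus one for the end-of-sentence token, and prints the best hypothesis for each sentence id. Standalone usage, with the file name taken from the last rescoring step in `run-me.sh`:

```
python scripts/rescore.py < data/valid.bpe.en.output.nbest.4 > data/valid.bpe.en.best
```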
diff --git a/wmt2017-transformer/scripts/validate.en.sh b/wmt2017-transformer/scripts/validate.en.sh
new file mode 100755
index 0000000..4400d63
--- /dev/null
+++ b/wmt2017-transformer/scripts/validate.en.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+cat $1 \
+    | sed 's/\@\@ //g' \
+    | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \
+    | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l en 2>/dev/null \
+    | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.en \
+    | sed -r 's/BLEU = ([0-9.]+),.*/\1/'
diff --git a/wmt2017-transformer/scripts/validate.sh b/wmt2017-transformer/scripts/validate.sh
new file mode 100755
index 0000000..a2b4945
--- /dev/null
+++ b/wmt2017-transformer/scripts/validate.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+export LC_ALL=C.UTF-8
+
+cat $1 \
+    | sed 's/\@\@ //g' \
+    | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \
+    | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de 2>/dev/null \
+    | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.de \
+    | sed -r 's/BLEU = ([0-9.]+),.*/\1/'
diff --git a/wmt2017-uedin/.gitignore b/wmt2017-uedin/.gitignore
new file mode 100644
index 0000000..c201e72
--- /dev/null
+++ b/wmt2017-uedin/.gitignore
@@ -0,0 +1,4 @@
+data
+model
+model.back
+
diff --git a/wmt2017-uedin/README.md b/wmt2017-uedin/README.md
new file mode 100644
index 0000000..1b793ce
--- /dev/null
+++ b/wmt2017-uedin/README.md
@@ -0,0 +1,67 @@
+# Example: Reproducing Edinburgh's WMT17 system
+
+Files and scripts in this folder show how to train a deep RNN model similar
+to the University of Edinburgh's submission to the WMT-17 shared task on
+English-German news translation. A common vocabulary of 36,000 BPE subword
+units is shared between both languages, and back-translated monolingual news
+data is added to the parallel training data.
+
+Assuming four GPUs are available (here 0 1 2 3), execute the command below
+to run the complete example:
+
+```
+./run-me.sh 0 1 2 3
+```
+
+Among other steps, this trains the left-to-right ensemble members with
+`marian` using the following command (shown here for the first member):
+
+```
+../../build/marian \
+    --model model/ens1/model.npz --type s2s \
+    --train-sets data/all.bpe.en data/all.bpe.de \
+    --max-length 100 \
+    --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+    --mini-batch-fit -w 9500 --mini-batch 1000 --maxi-batch 1000 \
+    --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
+    --valid-metrics cross-entropy translation \
+    --valid-sets data/valid.bpe.en data/valid.bpe.de \
+    --valid-script-path ./scripts/validate.sh \
+    --valid-translation-output data/valid.bpe.en.output --quiet-translation \
+    --beam-size 12 --normalize=1 \
+    --valid-mini-batch 64 \
+    --overwrite --keep-best \
+    --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \
+    --log model/ens1/train.log --valid-log model/ens1/valid.log \
+    --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \
+    --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \
+    --tied-embeddings-all --layer-normalization \
+    --dropout-rnn 0.1 --label-smoothing 0.1 \
+    --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
+    --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
+    --devices $GPUS --sync-sgd --seed 1111 \
+    --exponential-smoothing
+```
+
+The training setup includes:
+* Fitting mini-batch sizes to available GPU memory with synchronous SGD (Adam)
+* Back-translation of monolingual news data with a backward de-en model
+* A deep-transition bidirectional encoder (cell depth 4) and a deep-transition
+  decoder (base cell depth 8)
+* Tied embeddings for source, target and output layer, with layer normalization
+* Label smoothing
+* Learning-rate warm-up and cool-down
+* Multi-GPU training
+* Ensembles of four left-to-right and four right-to-left models combined via
+  n-best rescoring
+
+The evaluation is performed on WMT test sets from 2014, 2015 and 2017 using
+[sacreBLEU](https://github.com/mjpost/sacreBLEU), which provides hassle-free
+computation of shareable, comparable, and reproducible BLEU scores. The
+WMT-2016 test set is used as a validation set.
+
+See the basic training example (`marian/examples/training-basics/`) for more
+details.
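For reference, decoding with all four left-to-right models as an ensemble follows the same pattern as the test-set translation loop in `run-me.sh` below; passing several `-m` models makes `marian-decoder` ensemble them, and the `ens?` glob expands to `ens1` through `ens4`:

```
cat data/test2017.bpe.en \
    | ../../build/marian-decoder \
        -c model/ens1/model.npz.best-translation.npz.decoder.yml \
        -m model/ens?/model.npz.best-translation.npz \
        -d 0 1 2 3 --beam-size 12 --normalize=1 \
    > data/test2017.bpe.en.output
```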
-e "data/corpus.bpe.en" ] +then + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/valid.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo ref > data/valid.de + + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt14 -l en-de --echo src > data/test2014.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt15 -l en-de --echo src > data/test2015.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de --echo src > data/test2016.en + LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt17 -l en-de --echo src > data/test2017.en + + ./scripts/preprocess-data.sh +fi + +if [ ! -e "data/news.2016.de" ] +then + ./scripts/download-files-mono.sh +fi + +if [ ! -e "data/news.2016.bpe.de" ] +then + ./scripts/preprocess-data-mono.sh +fi + +# create common vocabulary +if [ ! -e "model/vocab.ende.yml" ] +then + cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN/build/marian-vocab --max-size 36000 > model/vocab.ende.yml +fi + +# train model +mkdir -p model.back +if [ ! -e "model.back/model.npz.best-translation.npz" ] +then + $MARIAN/build/marian \ + --model model.back/model.npz --type s2s \ + --train-sets data/corpus.bpe.de data/corpus.bpe.en \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w 3500 --maxi-batch 1000 \ + --valid-freq 10000 --save-freq 10000 --disp-freq 100 \ + --valid-metrics cross-entropy translation \ + --valid-script-path ./scripts/validate.en.sh \ + --valid-translation-output data/valid.bpe.de.output --quiet-translation \ + --valid-sets data/valid.bpe.de data/valid.bpe.en \ + --valid-mini-batch 64 --beam-size 12 --normalize=1 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 10 \ + --log model.back/train.log --valid-log model.back/valid.log \ + --tied-embeddings-all --layer-normalization \ + --devices $GPUS --seed 1111 \ + --exponential-smoothing +fi + +if [ ! -e "data/news.2016.bpe.en" ] +then + $MARIAN/build/marian-decoder \ + -c model.back/model.npz.best-translation.npz.decoder.yml \ + -i data/news.2016.bpe.de \ + -b 6 --normalize=1 -w 2500 -d $GPUS \ + --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \ + --max-length 200 --max-length-crop \ + > data/news.2016.bpe.en +fi + +if [ ! -e "data/all.bpe.en" ] +then + cat data/corpus.bpe.en data/corpus.bpe.en data/news.2016.bpe.en > data/all.bpe.en + cat data/corpus.bpe.de data/corpus.bpe.de data/news.2016.bpe.de > data/all.bpe.de +fi + +for i in `seq 1 $N` +do + mkdir -p model/ens$i + # train model + if [ ! 
-e "model/ens$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens$i/model.npz --type s2s \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens$i/train.log --valid-log model/ens$i/valid.log \ + --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \ + --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \ + --tied-embeddings-all --layer-normalization \ + --dropout-rnn 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i \ + --exponential-smoothing + fi +done + +for i in `seq 1 $N` +do + mkdir -p model/ens-rtl$i + # train model + if [ ! -e "model/ens-rtl$i/model.npz.best-translation.npz" ] + then + $MARIAN/build/marian \ + --model model/ens-rtl$i/model.npz --type s2s \ + --train-sets data/all.bpe.en data/all.bpe.de \ + --max-length 100 \ + --vocabs model/vocab.ende.yml model/vocab.ende.yml \ + --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \ + --valid-freq 5000 --save-freq 5000 --disp-freq 500 \ + --valid-metrics cross-entropy translation \ + --valid-sets data/valid.bpe.en data/valid.bpe.de \ + --valid-script-path ./scripts/validate.sh \ + --valid-translation-output data/valid.bpe.en.output --quiet-translation \ + --beam-size 12 --normalize=1 \ + --valid-mini-batch 64 \ + --overwrite --keep-best \ + --early-stopping 5 --after-epochs 7 --cost-type=ce-mean-words \ + --log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \ + --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \ + --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \ + --tied-embeddings-all --layer-normalization \ + --transformer-dropout 0.1 --label-smoothing 0.1 \ + --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ + --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ + --devices $GPUS --sync-sgd --seed $i$i$i$i$i \ + --exponential-smoothing --right-left + fi +done + + +# translate test sets +for prefix in valid test2014 test2015 test2017 +do + cat data/$prefix.bpe.en \ + | $MARIAN/build/marian-decoder -c model/ens1/model.npz.best-translation.npz.decoder.yml \ + -m model/ens?/model.npz.best-translation.npz -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \ + > data/$prefix.bpe.en.output.nbest.0 + + for i in $(seq 1 $N) + do + $MARIAN/build/marian-scorer -m model/ens-rtl$i/model.npz.best-cross-entropy.npz \ + -v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \ + --mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \ + -t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i + done + + cat data/$prefix.bpe.en.output.nbest.$N \ + | python scripts/rescore.py \ + | perl -pe 's/@@ //g' \ + | 
+        | ../tools/moses-scripts/scripts/recaser/detruecase.perl \
+        | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl > data/$prefix.en.output
+done
+
+# calculate bleu scores on test sets
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt16 -l en-de < data/valid.en.output
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt14 -l en-de < data/test2014.en.output
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt15 -l en-de < data/test2015.en.output
+LC_ALL=C.UTF-8 ../tools/sacreBLEU/sacrebleu.py -t wmt17 -l en-de < data/test2017.en.output
diff --git a/wmt2017-uedin/scripts/bla.s b/wmt2017-uedin/scripts/bla.s
new file mode 100755
index 0000000..ae07a92
--- /dev/null
+++ b/wmt2017-uedin/scripts/bla.s
@@ -0,0 +1,59 @@
+#!/bin/bash -v
+
+# suffix of source language files
+SRC=en
+
+# suffix of target language files
+TRG=de
+
+# number of merge operations
+bpe_operations=32000
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+for prefix in valid test2014 test2015 test2016 test2017
+do
+    cat data/$prefix.$SRC \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
+
+    test -f data/$prefix.$TRG || continue
+
+    cat data/$prefix.$TRG \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+done
+
+if [ ! -e "model/tc.$TRG" ]
+then
+    # train truecaser
+    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
+    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
+fi
+
+# apply truecaser (cleaned training corpus)
+for prefix in valid test2014 test2015 test2016 test2017
+do
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    test -f data/$prefix.tok.$TRG || continue
+    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+done
+
+if [ ! -e "model/$SRC$TRG.bpe" ]
+then
+    # train BPE
+    cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+fi
+
+# apply BPE
+for prefix in valid test2014 test2015 test2016 test2017
+do
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    test -f data/$prefix.tc.$TRG || continue
+    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+done
diff --git a/wmt2017-uedin/scripts/download-files-mono.sh b/wmt2017-uedin/scripts/download-files-mono.sh
new file mode 100755
index 0000000..ba2c39a
--- /dev/null
+++ b/wmt2017-uedin/scripts/download-files-mono.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get German monolingual news data for WMT17
+wget -nc http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz
+
+# sample 10M sentences with fewer than 100 tokens
+zcat news.2016.de.shuffled.gz | shuf -n 11000000 | perl -ne 'print if(split(/\s/, $_) < 100)' | head -n 10000000 > news.2016.de
+
+# clean
+rm news.2016.de.shuffled.gz
+
+cd ..
diff --git a/wmt2017-uedin/scripts/download-files.sh b/wmt2017-uedin/scripts/download-files.sh
new file mode 100755
index 0000000..99f880a
--- /dev/null
+++ b/wmt2017-uedin/scripts/download-files.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+mkdir -p data
+cd data
+
+# get En-De training data for WMT17
+wget -nc http://www.statmt.org/europarl/v7/de-en.tgz
+wget -nc http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz
+wget -nc http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz
+
+# extract data
+tar -xf de-en.tgz
+tar -xf training-parallel-commoncrawl.tgz
+tar -xf training-parallel-nc-v12.tgz
+
+# create corpus files
+cat europarl-v7.de-en.de commoncrawl.de-en.de training/news-commentary-v12.de-en.de > corpus.de
+cat europarl-v7.de-en.en commoncrawl.de-en.en training/news-commentary-v12.de-en.en > corpus.en
+
+# clean
+rm -r europarl-* commoncrawl.* training/ *.tgz
+
+cd ..
diff --git a/wmt2017-uedin/scripts/preprocess-data-mono.sh b/wmt2017-uedin/scripts/preprocess-data-mono.sh
new file mode 100755
index 0000000..b327e9f
--- /dev/null
+++ b/wmt2017-uedin/scripts/preprocess-data-mono.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -v
+
+# suffixes of source and target language files
+SRC=en
+TRG=de
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+
+prefix=news.2016
+
+cat data/$prefix.$TRG \
+    | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+    | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+
+$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+
+$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
diff --git a/wmt2017-uedin/scripts/preprocess-data.sh b/wmt2017-uedin/scripts/preprocess-data.sh
new file mode 100755
index 0000000..f02f597
--- /dev/null
+++ b/wmt2017-uedin/scripts/preprocess-data.sh
@@ -0,0 +1,64 @@
+#!/bin/bash -v
+
+# suffix of source language files
+SRC=en
+
+# suffix of target language files
+TRG=de
+
+# number of merge operations
+bpe_operations=32000
+
+# path to moses decoder: https://github.com/moses-smt/mosesdecoder
+mosesdecoder=../tools/moses-scripts
+
+# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
+subword_nmt=../tools/subword-nmt
+
+# tokenize
+for prefix in corpus valid test2014 test2015 test2016 test2017
+do
+    cat data/$prefix.$SRC \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
+
+    test -f data/$prefix.$TRG || continue
+
+    cat data/$prefix.$TRG \
+        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
+        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
+done
+
+# clean empty and long sentences, and sentences with high source-target ratio (training corpus only)
+mv data/corpus.tok.$SRC data/corpus.tok.uncleaned.$SRC
+mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
+$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
+
-e "model/tc.$TRG" ] +then + # train truecaser + $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC + $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG +fi + +# apply truecaser (cleaned training corpus) +for prefix in corpus valid test2014 test2015 test2016 test2017 +do + $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC + test -f data/$prefix.tok.$TRG || continue + $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG +done + +if [ ! -e "model/$SRC$TRG.bpe" ] +then + # train BPE + cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe +fi + +# apply BPE +for prefix in corpus valid test2014 test2015 test2016 test2017 +do + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC + test -f data/$prefix.tc.$TRG || continue + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG +done diff --git a/wmt2017-uedin/scripts/validate.en.sh b/wmt2017-uedin/scripts/validate.en.sh new file mode 100755 index 0000000..4400d63 --- /dev/null +++ b/wmt2017-uedin/scripts/validate.en.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cat $1 \ + | sed 's/\@\@ //g' \ + | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \ + | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l en 2>/dev/null \ + | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.en \ + | sed -r 's/BLEU = ([0-9.]+),.*/\1/' diff --git a/wmt2017-uedin/scripts/validate.sh b/wmt2017-uedin/scripts/validate.sh new file mode 100755 index 0000000..ee93bc6 --- /dev/null +++ b/wmt2017-uedin/scripts/validate.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cat $1 \ + | sed 's/\@\@ //g' \ + | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \ + | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de 2>/dev/null \ + | ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/valid.de \ + | sed -r 's/BLEU = ([0-9.]+),.*/\1/' |