diff options
author | Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> | 2017-09-26 23:06:14 +0300 |
---|---|---|
committer | Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> | 2017-09-26 23:06:14 +0300 |
commit | 2855410054841c147bf5d7f014fc94891a38a9b3 (patch) | |
tree | cfbe6a818738cffd1812a4ba9239fd9d4aa1b57f /models | |
parent | 22b180e172002002bf1abd5bcbb3b77b3b4e59e0 (diff) |
Add basic test for translation
Diffstat (limited to 'models')
-rw-r--r-- | models/download_wmt16.sh | 21 | ||||
-rw-r--r-- | models/preprocess.sh | 12 | ||||
-rw-r--r-- | models/wmt16.en-de/marian.yml | 11 |
3 files changed, 44 insertions, 0 deletions
```diff
diff --git a/models/download_wmt16.sh b/models/download_wmt16.sh
new file mode 100644
index 0000000..f86d982
--- /dev/null
+++ b/models/download_wmt16.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+URL=http://data.statmt.org/rsennrich/wmt16_systems
+SRC=en
+TRG=de
+
+MODEL_FILES=(
+    $URL/$SRC-$TRG/model.npz
+    $URL/$SRC-$TRG/model.npz.json
+    $URL/$SRC-$TRG/vocab.$SRC.json
+    $URL/$SRC-$TRG/vocab.$TRG.json
+    $URL/$SRC-$TRG/$SRC$TRG.bpe
+    $URL/$SRC-$TRG/truecase-model.$SRC
+)
+
+mkdir -p wmt16.$SRC-$TRG
+
+for model_file in ${MODEL_FILES[@]}; do
+    echo $model_file
+    wget -q --no-clobber --directory-prefix wmt16.$SRC-$TRG --show-progress $model_file
+done
diff --git a/models/preprocess.sh b/models/preprocess.sh
new file mode 100644
index 0000000..4b5bc3c
--- /dev/null
+++ b/models/preprocess.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+root="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+moses_scripts=$root/../tools/moses-scripts
+subword_nmt=$root/../tools/subword-nmt
+
+model_dir=$root/wmt16.en-de
+
+$moses_scripts/scripts/tokenizer/normalize-punctuation.perl -l en \
+    | $moses_scripts/scripts/tokenizer/tokenizer.perl -l en -penn \
+    | $moses_scripts/scripts/recaser/truecase.perl -model $model_dir/truecase-model.en \
+    | $subword_nmt/apply_bpe.py -c $model_dir/ende.bpe
diff --git a/models/wmt16.en-de/marian.yml b/models/wmt16.en-de/marian.yml
new file mode 100644
index 0000000..7bb15af
--- /dev/null
+++ b/models/wmt16.en-de/marian.yml
@@ -0,0 +1,11 @@
+relative-paths: true
+type: amun
+models:
+  - model.npz
+dim-emb: 500
+vocabs:
+  - vocab.en.json
+  - vocab.de.json
+dim-vocabs:
+  - 85000
+  - 85000
```