From 463da37570ce4e815745ee8f09ac80d70dc38531 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 25 Mar 2017 11:18:01 +0100 Subject: change download scripts --- examples/training/.gitignore | 30 ------------------------------ examples/training/run-me.sh | 8 ++++---- examples/training/scripts/preprocess.sh | 2 +- 3 files changed, 5 insertions(+), 35 deletions(-) (limited to 'examples') diff --git a/examples/training/.gitignore b/examples/training/.gitignore index e593d588..7bb1ca88 100644 --- a/examples/training/.gitignore +++ b/examples/training/.gitignore @@ -1,33 +1,3 @@ -data/SETIMES2.en-ro.en -data/SETIMES2.en-ro.ids -data/SETIMES2.en-ro.ro -data/SETIMES2.ro-en.txt.zip -data/corpus.bpe.en -data/corpus.bpe.en.shuf -data/corpus.bpe.ro -data/corpus.bpe.ro.shuf -data/corpus.en -data/corpus.ro -data/corpus.tc.en -data/corpus.tc.ro -data/corpus.tok.clean.en -data/corpus.tok.clean.ro -data/corpus.tok.en -data/corpus.tok.ro -data/europarl-v7.ro-en.en -data/europarl-v7.ro-en.ro -data/newsdev2016.bpe.en -data/newsdev2016.bpe.ro -data/newsdev2016.tc.en -data/newsdev2016.tc.ro -data/newsdev2016.tok.en -data/newsdev2016.tok.ro -data/newstest2016.bpe.en -data/newstest2016.bpe.ro -data/newstest2016.tc.en -data/newstest2016.tc.ro -data/newstest2016.tok.en -data/newstest2016.tok.ro model/ mosesdecoder/ subword-nmt/ diff --git a/examples/training/run-me.sh b/examples/training/run-me.sh index b64f4b9e..a4fa8691 100755 --- a/examples/training/run-me.sh +++ b/examples/training/run-me.sh @@ -42,10 +42,10 @@ then --devices 0 \ --train-sets data/corpus.bpe.ro data/corpus.bpe.en \ --vocabs model/vocab.ro.yml model/vocab.en.yml \ - --dim-vocabs 32000 32000 \ + --dim-vocabs 66000 50000 \ --mini-batch 80 \ --layer-normalization \ - --after-batches 10000 \ + --early-stopping 5 \ --valid-freq 10000 --save-freq 30000 --disp-freq 1000 \ --valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \ --valid-metrics cross-entropy valid-script \ @@ -54,11 +54,11 @@ then fi -if [ ! -e "data/newstest2016.bpe.ro.output.postprocessed.dev" ] +if [ ! -e "data/newstest2016.bpe.ro.output.postprocessed" ] then cat data/newstest2016.bpe.ro \ | ../../build/amun -c model/model.npz.amun.yml -b 12 -n --mini-batch 100 --maxi-batch 1000 \ | sed 's/\@\@ //g' | mosesdecoder/scripts/recaser/detruecase.perl > data/newstest2016.bpe.ro.output.postprocessed fi -./mosesdecoder/scripts/generic/multi-bleu.perl data/newtest2016.en < data/newtest2016.bpe.ro.output.postprocessed +./mosesdecoder/scripts/generic/multi-bleu.perl data/newstest2016.tok.en < data/newstest2016.bpe.ro.output.postprocessed diff --git a/examples/training/scripts/preprocess.sh b/examples/training/scripts/preprocess.sh index b8990f05..cd0a6b0f 100755 --- a/examples/training/scripts/preprocess.sh +++ b/examples/training/scripts/preprocess.sh @@ -19,7 +19,7 @@ TRG=en # number of merge operations. Network vocabulary should be slightly larger (to include characters), # or smaller if the operations are learned on the joint vocabulary -bpe_operations=40000 +bpe_operations=85000 # path to moses decoder: https://github.com/moses-smt/mosesdecoder mosesdecoder=mosesdecoder -- cgit v1.2.3