Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2017-03-25 13:18:01 +0300
committer: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2017-03-25 13:18:01 +0300
commit: 463da37570ce4e815745ee8f09ac80d70dc38531 (patch)
tree: 8feaf6a2fea9d5ea5b7e876fd4b836f20a553d4b /examples
parent: 222f9dbe987b6cf5bd7ac0be2d7e452152f5d0f3 (diff)
change download scripts
Diffstat (limited to 'examples')
-rw-r--r-- examples/training/.gitignore 30
-rwxr-xr-x examples/training/run-me.sh 8
-rwxr-xr-x examples/training/scripts/preprocess.sh 2
3 files changed, 5 insertions, 35 deletions
diff --git a/examples/training/.gitignore b/examples/training/.gitignore
index e593d588..7bb1ca88 100644
--- a/examples/training/.gitignore
+++ b/examples/training/.gitignore
@@ -1,33 +1,3 @@
-data/SETIMES2.en-ro.en
-data/SETIMES2.en-ro.ids
-data/SETIMES2.en-ro.ro
-data/SETIMES2.ro-en.txt.zip
-data/corpus.bpe.en
-data/corpus.bpe.en.shuf
-data/corpus.bpe.ro
-data/corpus.bpe.ro.shuf
-data/corpus.en
-data/corpus.ro
-data/corpus.tc.en
-data/corpus.tc.ro
-data/corpus.tok.clean.en
-data/corpus.tok.clean.ro
-data/corpus.tok.en
-data/corpus.tok.ro
-data/europarl-v7.ro-en.en
-data/europarl-v7.ro-en.ro
-data/newsdev2016.bpe.en
-data/newsdev2016.bpe.ro
-data/newsdev2016.tc.en
-data/newsdev2016.tc.ro
-data/newsdev2016.tok.en
-data/newsdev2016.tok.ro
-data/newstest2016.bpe.en
-data/newstest2016.bpe.ro
-data/newstest2016.tc.en
-data/newstest2016.tc.ro
-data/newstest2016.tok.en
-data/newstest2016.tok.ro
model/
mosesdecoder/
subword-nmt/
diff --git a/examples/training/run-me.sh b/examples/training/run-me.sh
index b64f4b9e..a4fa8691 100755
--- a/examples/training/run-me.sh
+++ b/examples/training/run-me.sh
@@ -42,10 +42,10 @@ then
--devices 0 \
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
--vocabs model/vocab.ro.yml model/vocab.en.yml \
- --dim-vocabs 32000 32000 \
+ --dim-vocabs 66000 50000 \
--mini-batch 80 \
--layer-normalization \
- --after-batches 10000 \
+ --early-stopping 5 \
--valid-freq 10000 --save-freq 30000 --disp-freq 1000 \
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
--valid-metrics cross-entropy valid-script \
@@ -54,11 +54,11 @@ then
fi
-if [ ! -e "data/newstest2016.bpe.ro.output.postprocessed.dev" ]
+if [ ! -e "data/newstest2016.bpe.ro.output.postprocessed" ]
then
cat data/newstest2016.bpe.ro \
| ../../build/amun -c model/model.npz.amun.yml -b 12 -n --mini-batch 100 --maxi-batch 1000 \
| sed 's/\@\@ //g' | mosesdecoder/scripts/recaser/detruecase.perl > data/newstest2016.bpe.ro.output.postprocessed
fi
-./mosesdecoder/scripts/generic/multi-bleu.perl data/newtest2016.en < data/newtest2016.bpe.ro.output.postprocessed
+./mosesdecoder/scripts/generic/multi-bleu.perl data/newstest2016.tok.en < data/newstest2016.bpe.ro.output.postprocessed
diff --git a/examples/training/scripts/preprocess.sh b/examples/training/scripts/preprocess.sh
index b8990f05..cd0a6b0f 100755
--- a/examples/training/scripts/preprocess.sh
+++ b/examples/training/scripts/preprocess.sh
@@ -19,7 +19,7 @@ TRG=en
# number of merge operations. Network vocabulary should be slightly larger (to include characters),
# or smaller if the operations are learned on the joint vocabulary
-bpe_operations=40000
+bpe_operations=85000
# path to moses decoder: https://github.com/moses-smt/mosesdecoder
mosesdecoder=mosesdecoder