Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2017-03-24 21:19:05 +0300
committer: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2017-03-24 21:19:05 +0300
commit: bae7cd9a107db896f629d4c160a1193a1a25d43d (patch)
tree: 9669efc9cc135fdae5580cb0b17cbae6b11d024b /examples/training/scripts
parent: 5dbeadb47dbb70b85061aa08c8c55193d9fbe74a (diff)
added newstest2016
Diffstat (limited to 'examples/training/scripts')
-rwxr-xr-x examples/training/scripts/preprocess.sh 8
1 file changed, 4 insertions, 4 deletions
diff --git a/examples/training/scripts/preprocess.sh b/examples/training/scripts/preprocess.sh
index 924f8971..b8990f05 100755
--- a/examples/training/scripts/preprocess.sh
+++ b/examples/training/scripts/preprocess.sh
@@ -19,7 +19,7 @@ TRG=en
# number of merge operations. Network vocabulary should be slightly larger (to include characters),
# or smaller if the operations are learned on the joint vocabulary
-bpe_operations=60000
+bpe_operations=40000
# path to moses decoder: https://github.com/moses-smt/mosesdecoder
mosesdecoder=mosesdecoder
@@ -28,7 +28,7 @@ mosesdecoder=mosesdecoder
subword_nmt=subword-nmt
# tokenize
-for prefix in corpus newsdev2016
+for prefix in corpus newsdev2016 newstest2016
do
cat data/$prefix.$SRC | \
$mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC | \
@@ -57,7 +57,7 @@ for prefix in corpus
done
# apply truecaser (dev/test files)
-for prefix in newsdev2016
+for prefix in newsdev2016 newstest2016
do
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
$mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
@@ -68,7 +68,7 @@ cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_
# apply BPE
-for prefix in corpus newsdev2016
+for prefix in corpus newsdev2016 newstest2016
do
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG