diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2017-03-24 21:19:05 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2017-03-24 21:19:05 +0300 |
commit | bae7cd9a107db896f629d4c160a1193a1a25d43d (patch) | |
tree | 9669efc9cc135fdae5580cb0b17cbae6b11d024b /examples/training/scripts | |
parent | 5dbeadb47dbb70b85061aa08c8c55193d9fbe74a (diff) |
added newstest2016
Diffstat (limited to 'examples/training/scripts')
-rwxr-xr-x | examples/training/scripts/preprocess.sh | 8 |
1 file changed, 4 insertions, 4 deletions
```diff
diff --git a/examples/training/scripts/preprocess.sh b/examples/training/scripts/preprocess.sh
index 924f8971..b8990f05 100755
--- a/examples/training/scripts/preprocess.sh
+++ b/examples/training/scripts/preprocess.sh
@@ -19,7 +19,7 @@ TRG=en
 # number of merge operations. Network vocabulary should be slightly larger (to include characters),
 # or smaller if the operations are learned on the joint vocabulary
-bpe_operations=60000
+bpe_operations=40000
 # path to moses decoder: https://github.com/moses-smt/mosesdecoder
 mosesdecoder=mosesdecoder
@@ -28,7 +28,7 @@ mosesdecoder=mosesdecoder
 subword_nmt=subword-nmt
 # tokenize
-for prefix in corpus newsdev2016
+for prefix in corpus newsdev2016 newstest2016
 do
 cat data/$prefix.$SRC | \
 $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC | \
@@ -57,7 +57,7 @@ for prefix in corpus
 done
 # apply truecaser (dev/test files)
-for prefix in newsdev2016
+for prefix in newsdev2016 newstest2016
 do
 $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
 $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
@@ -68,7 +68,7 @@ cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_
 # apply BPE
-for prefix in corpus newsdev2016
+for prefix in corpus newsdev2016 newstest2016
 do
 $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
 $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
```