diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2018-03-24 02:52:14 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-03-24 02:52:14 +0300 |
commit | 8be005eb5d6ff2135cc5844a4e58543df954839a (patch) | |
tree | 118ebff4d8fc7cce05c5fac60464ccbd072e3d7d /wmt2017-transformer/scripts | |
parent | 974bbfd9f984ba3230097f64840d4f2d88ee4589 (diff) |
Add missing test2017 to preprocessing
Diffstat (limited to 'wmt2017-transformer/scripts')
-rwxr-xr-x | wmt2017-transformer/scripts/preprocess-data.sh | 6 |
1 file changed, 3 insertions, 3 deletions
```diff
diff --git a/wmt2017-transformer/scripts/preprocess-data.sh b/wmt2017-transformer/scripts/preprocess-data.sh
index 3a968a5..309a646 100755
--- a/wmt2017-transformer/scripts/preprocess-data.sh
+++ b/wmt2017-transformer/scripts/preprocess-data.sh
@@ -16,7 +16,7 @@ mosesdecoder=../tools/moses-scripts
 subword_nmt=../tools/subword-nmt
 
 # tokenize
-for prefix in corpus valid test2014 test2015 test2016
+for prefix in corpus valid test2014 test2015 test2016 test2017
 do
     cat data/$prefix.$SRC \
         | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
@@ -39,7 +39,7 @@ $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC
 $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
 
 # apply truecaser (cleaned training corpus)
-for prefix in corpus valid test2014 test2015 test2016
+for prefix in corpus valid test2014 test2015 test2016 test2017
 do
     $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
     test -f data/$prefix.tok.$TRG || continue
@@ -50,7 +50,7 @@ done
 cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
 
 # apply BPE
-for prefix in corpus valid test2014 test2015 test2016
+for prefix in corpus valid test2014 test2015 test2016 test2017
 do
     $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
     test -f data/$prefix.tc.$TRG || continue
```