diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2018-03-10 03:14:11 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2018-03-10 03:14:11 +0300 |
commit | ed0836bd41aac15943827d79c13493eaa4a66334 (patch) | |
tree | 2d6e5e0acd22d7ab15605d4cacd6cc0d67ee4cfc | |
parent | 5aee613746bb86b38ea342816e9afe7ab45caa6f (diff) |
update scripts
-rwxr-xr-x | wmt2017-uedin/scripts/bla.s | 59 | ||||
-rwxr-xr-x | wmt2017-uedin/scripts/preprocess-data.sh | 22 | ||||
-rw-r--r-- | wmt2017-uedin/scripts/rescore.py | 25 | ||||
-rwxr-xr-x | wmt2017-uedin/scripts/validate.sh | 2 |
4 files changed, 35 insertions, 73 deletions
```diff
diff --git a/wmt2017-uedin/scripts/bla.s b/wmt2017-uedin/scripts/bla.s
deleted file mode 100755
index ae07a92..0000000
--- a/wmt2017-uedin/scripts/bla.s
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash -v
-
-# suffix of source language files
-SRC=en
-
-# suffix of target language files
-TRG=de
-
-# number of merge operations
-bpe_operations=32000
-
-# path to moses decoder: https://github.com/moses-smt/mosesdecoder
-mosesdecoder=../tools/moses-scripts
-
-# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
-subword_nmt=../tools/subword-nmt
-
-# tokenize
-for prefix in valid test2014 test2015 test2016 test2017
-do
-    cat data/$prefix.$SRC \
-        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
-        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
-
-    test -f data/$prefix.$TRG || continue
-
-    cat data/$prefix.$TRG \
-        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
-        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
-done
-
-if [ ! -e "model/tc.$TRG" ]
-then
-    # train truecaser
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
-fi
-
-# apply truecaser (cleaned training corpus)
-for prefix in valid test2014 test2015 test2016 test2017
-do
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
-    test -f data/$prefix.tok.$TRG || continue
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
-done
-
-if [ ! -e "model/$SRC$TRG.bpe" ]
-then
-    # train BPE
-    cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
-fi
-
-# apply BPE
-for prefix in valid test2014 test2015 test2016 test2017
-do
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
-    test -f data/$prefix.tc.$TRG || continue
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
-done
diff --git a/wmt2017-uedin/scripts/preprocess-data.sh b/wmt2017-uedin/scripts/preprocess-data.sh
index f02f597..3a968a5 100755
--- a/wmt2017-uedin/scripts/preprocess-data.sh
+++ b/wmt2017-uedin/scripts/preprocess-data.sh
@@ -16,7 +16,7 @@ mosesdecoder=../tools/moses-scripts
 subword_nmt=../tools/subword-nmt
 
 # tokenize
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
 do
     cat data/$prefix.$SRC \
         | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
@@ -34,29 +34,23 @@ mv data/corpus.tok.$SRC data/corpus.tok.uncleaned.$SRC
 mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
 $mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
 
-if [ ! -e "model/tc.$TRG" ]
-then
-    # train truecaser
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
-fi
+# train truecaser
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
 
 # apply truecaser (cleaned training corpus)
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
 do
     $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
     test -f data/$prefix.tok.$TRG || continue
     $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
 done
 
-if [ ! -e "model/$SRC$TRG.bpe" ]
-then
-    # train BPE
-    cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
-fi
+# train BPE
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
 
 # apply BPE
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
 do
     $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
     test -f data/$prefix.tc.$TRG || continue
diff --git a/wmt2017-uedin/scripts/rescore.py b/wmt2017-uedin/scripts/rescore.py
new file mode 100644
index 0000000..f5ecfa9
--- /dev/null
+++ b/wmt2017-uedin/scripts/rescore.py
@@ -0,0 +1,25 @@
+import sys
+
+lastNum = 0
+bestScore = -9999
+
+for line in sys.stdin:
+    line = line.rstrip("\n")
+    fields = line.split(" ||| ")
+    score = sum(float(score) for score in fields[2].split(" ") if score[-1] != "=")
+    length = float(len(fields[1].split(" ")) + 1)
+
+    score = score / length
+
+    num = int(fields[0])
+    if num > lastNum:
+        print bestLine
+        bestScore = -99999
+        bestLine = fields[1]
+        lastNum = num
+
+    if score > bestScore:
+        bestScore = score
+        bestLine = fields[1]
+
+print bestLine
diff --git a/wmt2017-uedin/scripts/validate.sh b/wmt2017-uedin/scripts/validate.sh
index ee93bc6..a2b4945 100755
--- a/wmt2017-uedin/scripts/validate.sh
+++ b/wmt2017-uedin/scripts/validate.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+export LC_ALL=C.UTF-8
+
 cat $1 \
     | sed 's/\@\@ //g' \
     | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \
```