update scripts

author: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2018-03-10 03:14:11 +0300
committer: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2018-03-10 03:14:11 +0300
commit: ed0836bd41aac15943827d79c13493eaa4a66334 (patch)
tree: 2d6e5e0acd22d7ab15605d4cacd6cc0d67ee4cfc
parent: 5aee613746bb86b38ea342816e9afe7ab45caa6f (diff)
4 files changed, 35 insertions, 73 deletions
diff --git a/wmt2017-uedin/scripts/bla.s b/wmt2017-uedin/scripts/bla.s
deleted file mode 100755
index ae07a92..0000000
--- a/wmt2017-uedin/scripts/bla.s
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash -v
-
-# suffix of source language files
-SRC=en
-
-# suffix of target language files
-TRG=de
-
-# number of merge operations
-bpe_operations=32000
-
-# path to moses decoder: https://github.com/moses-smt/mosesdecoder
-mosesdecoder=../tools/moses-scripts
-
-# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
-subword_nmt=../tools/subword-nmt
-
-# tokenize
-for prefix in valid test2014 test2015 test2016 test2017
-do
-    cat data/$prefix.$SRC \
-        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
-        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
-
-    test -f data/$prefix.$TRG || continue
-
-    cat data/$prefix.$TRG \
-        | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
-        | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
-done
-
-if [ ! -e "model/tc.$TRG" ]
-then
-    # train truecaser
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
-fi
-
-# apply truecaser (cleaned training corpus)
-for prefix in valid test2014 test2015 test2016 test2017
-do
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
-    test -f data/$prefix.tok.$TRG || continue
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
-done
-
-if [ ! -e "model/$SRC$TRG.bpe" ]
-then
-    # train BPE
-    cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
-fi
-
-# apply BPE
-for prefix in valid test2014 test2015 test2016 test2017
-do
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
-    test -f data/$prefix.tc.$TRG || continue
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
-done
diff --git a/wmt2017-uedin/scripts/preprocess-data.sh b/wmt2017-uedin/scripts/preprocess-data.sh
index f02f597..3a968a5 100755
--- a/wmt2017-uedin/scripts/preprocess-data.sh
+++ b/wmt2017-uedin/scripts/preprocess-data.sh
@@ -16,7 +16,7 @@ mosesdecoder=../tools/moses-scripts
 subword_nmt=../tools/subword-nmt
 
 # tokenize
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
 do
     cat data/$prefix.$SRC \
         | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
@@ -34,29 +34,23 @@ mv data/corpus.tok.$SRC data/corpus.tok.uncleaned.$SRC
 mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
 $mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
 
-if [ ! -e "model/tc.$TRG" ]
-then
-    # train truecaser
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
-    $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
-fi
+# train truecaser
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
 
 # apply truecaser (cleaned training corpus)
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
 do
     $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
     test -f data/$prefix.tok.$TRG || continue
     $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
 done
 
-if [ ! -e "model/$SRC$TRG.bpe" ]
-then
-    # train BPE
-    cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
-fi
+# train BPE
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
 
 # apply BPE
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
 do
     $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
     test -f data/$prefix.tc.$TRG || continue
diff --git a/wmt2017-uedin/scripts/rescore.py b/wmt2017-uedin/scripts/rescore.py
new file mode 100644
index 0000000..f5ecfa9
--- /dev/null
+++ b/wmt2017-uedin/scripts/rescore.py
@@ -0,0 +1,25 @@
+import sys
+
+# Print, for each sentence id, the n-best hypothesis with the best length-normalised score.
+lastNum = None
+bestScore = float("-inf")
+
+for line in sys.stdin:
+    fields = line.rstrip("\n").split(" ||| ")
+    # Skip tokens ending in "=" (presumably feature labels); split() drops empty tokens.
+    score = sum(float(tok) for tok in fields[2].split() if not tok.endswith("="))
+    score = score / float(len(fields[1].split(" ")) + 1)
+
+    num = int(fields[0])
+    if num != lastNum:
+        # A new sentence starts: flush the previous winner (nothing to flush on the first line).
+        if lastNum is not None:
+            print(bestLine)
+        bestScore = float("-inf")
+        bestLine = fields[1]
+    lastNum = num
+
+    if score > bestScore:
+        bestScore = score
+        bestLine = fields[1]
+if lastNum is not None:  # stay silent on empty input instead of raising NameError
+    print(bestLine)
diff --git a/wmt2017-uedin/scripts/validate.sh b/wmt2017-uedin/scripts/validate.sh
index ee93bc6..a2b4945 100755
--- a/wmt2017-uedin/scripts/validate.sh
+++ b/wmt2017-uedin/scripts/validate.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+export LC_ALL=C.UTF-8
+
 cat $1 \
     | sed 's/\@\@ //g' \
     | ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \
author	Marcin Junczys-Dowmunt <junczys@amu.edu.pl>	2018-03-10 03:14:11 +0300
committer	Marcin Junczys-Dowmunt <junczys@amu.edu.pl>	2018-03-10 03:14:11 +0300
commit	ed0836bd41aac15943827d79c13493eaa4a66334 (patch)
tree	2d6e5e0acd22d7ab15605d4cacd6cc0d67ee4cfc
parent	5aee613746bb86b38ea342816e9afe7ab45caa6f (diff)