Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian-examples.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2018-03-10 03:14:11 +0300
committer Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2018-03-10 03:14:11 +0300
commit ed0836bd41aac15943827d79c13493eaa4a66334 (patch)
tree 2d6e5e0acd22d7ab15605d4cacd6cc0d67ee4cfc
parent 5aee613746bb86b38ea342816e9afe7ab45caa6f (diff)
update scripts
-rwxr-xr-x wmt2017-uedin/scripts/bla.s 59
-rwxr-xr-x wmt2017-uedin/scripts/preprocess-data.sh 22
-rw-r--r-- wmt2017-uedin/scripts/rescore.py 25
-rwxr-xr-x wmt2017-uedin/scripts/validate.sh 2
4 files changed, 35 insertions, 73 deletions
diff --git a/wmt2017-uedin/scripts/bla.s b/wmt2017-uedin/scripts/bla.s
deleted file mode 100755
index ae07a92..0000000
--- a/wmt2017-uedin/scripts/bla.s
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash -v
-
-# suffix of source language files
-SRC=en
-
-# suffix of target language files
-TRG=de
-
-# number of merge operations
-bpe_operations=32000
-
-# path to moses decoder: https://github.com/moses-smt/mosesdecoder
-mosesdecoder=../tools/moses-scripts
-
-# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
-subword_nmt=../tools/subword-nmt
-
-# tokenize
-for prefix in valid test2014 test2015 test2016 test2017
-do
- cat data/$prefix.$SRC \
- | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
- | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC
-
- test -f data/$prefix.$TRG || continue
-
- cat data/$prefix.$TRG \
- | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
- | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
-done
-
-if [ ! -e "model/tc.$TRG" ]
-then
- # train truecaser
- $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
- $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
-fi
-
-# apply truecaser (cleaned training corpus)
-for prefix in valid test2014 test2015 test2016 test2017
-do
- $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
- test -f data/$prefix.tok.$TRG || continue
- $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
-done
-
-if [ ! -e "model/$SRC$TRG.bpe" ]
-then
- # train BPE
- cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
-fi
-
-# apply BPE
-for prefix in valid test2014 test2015 test2016 test2017
-do
- $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
- test -f data/$prefix.tc.$TRG || continue
- $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
-done
diff --git a/wmt2017-uedin/scripts/preprocess-data.sh b/wmt2017-uedin/scripts/preprocess-data.sh
index f02f597..3a968a5 100755
--- a/wmt2017-uedin/scripts/preprocess-data.sh
+++ b/wmt2017-uedin/scripts/preprocess-data.sh
@@ -16,7 +16,7 @@ mosesdecoder=../tools/moses-scripts
subword_nmt=../tools/subword-nmt
# tokenize
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
do
cat data/$prefix.$SRC \
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC \
@@ -34,29 +34,23 @@ mv data/corpus.tok.$SRC data/corpus.tok.uncleaned.$SRC
mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
-if [ ! -e "model/tc.$TRG" ]
-then
- # train truecaser
- $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
- $mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
-fi
+# train truecaser
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
# apply truecaser (cleaned training corpus)
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
do
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
test -f data/$prefix.tok.$TRG || continue
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
done
-if [ ! -e "model/$SRC$TRG.bpe" ]
-then
- # train BPE
- cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
-fi
+# train BPE
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
# apply BPE
-for prefix in corpus valid test2014 test2015 test2016 test2017
+for prefix in corpus valid test2014 test2015 test2016
do
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
test -f data/$prefix.tc.$TRG || continue
diff --git a/wmt2017-uedin/scripts/rescore.py b/wmt2017-uedin/scripts/rescore.py
new file mode 100644
index 0000000..f5ecfa9
--- /dev/null
+++ b/wmt2017-uedin/scripts/rescore.py
@@ -0,0 +1,25 @@
+import sys
+
+lastNum = 0
+bestScore = -9999
+
+for line in sys.stdin:
+ line = line.rstrip("\n")
+ fields = line.split(" ||| ")
+ score = sum(float(score) for score in fields[2].split(" ") if score[-1] != "=")
+ length = float(len(fields[1].split(" ")) + 1)
+
+ score = score / length
+
+ num = int(fields[0])
+ if num > lastNum:
+ print bestLine
+ bestScore = -99999
+ bestLine = fields[1]
+ lastNum = num
+
+ if score > bestScore:
+ bestScore = score
+ bestLine = fields[1]
+
+print bestLine
diff --git a/wmt2017-uedin/scripts/validate.sh b/wmt2017-uedin/scripts/validate.sh
index ee93bc6..a2b4945 100755
--- a/wmt2017-uedin/scripts/validate.sh
+++ b/wmt2017-uedin/scripts/validate.sh
@@ -1,5 +1,7 @@
#!/bin/bash
+export LC_ALL=C.UTF-8
+
cat $1 \
| sed 's/\@\@ //g' \
| ../tools/moses-scripts/scripts/recaser/detruecase.perl 2>/dev/null \