diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2017-03-24 20:37:35 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2017-03-24 20:37:35 +0300 |
commit | 5dbeadb47dbb70b85061aa08c8c55193d9fbe74a (patch) | |
tree | 2e0e7d1698b0c756896c6af59cde4eb4f5eabd02 /examples/training/scripts | |
parent | e7157515d3f30b44649f2f48b4a70b3999530a02 (diff) |
added training example
Diffstat (limited to 'examples/training/scripts')
-rwxr-xr-x | examples/training/scripts/download-files.sh | 22 | ||||
-rwxr-xr-x | examples/training/scripts/normalise-romanian.py | 17 | ||||
-rwxr-xr-x | examples/training/scripts/preprocess.sh | 75 | ||||
-rwxr-xr-x | examples/training/scripts/remove-diacritics.py | 20 | ||||
-rwxr-xr-x | examples/training/scripts/validate.sh | 17 |
5 files changed, 151 insertions, 0 deletions
diff --git a/examples/training/scripts/download-files.sh b/examples/training/scripts/download-files.sh new file mode 100755 index 00000000..069dcbc2 --- /dev/null +++ b/examples/training/scripts/download-files.sh @@ -0,0 +1,22 @@ +#!/bin/bash -v + +# get En-Ro training data for WMT16 + +if [ ! -f data/ro-en.tgz ]; +then + wget http://www.statmt.org/europarl/v7/ro-en.tgz -O data/ro-en.tgz +fi + +if [ ! -f data/SETIMES2.ro-en.txt.zip ]; +then + wget http://opus.lingfil.uu.se/download.php?f=SETIMES2/en-ro.txt.zip -O data/SETIMES2.ro-en.txt.zip +fi + +cd data/ +tar -xf ro-en.tgz +unzip SETIMES2.ro-en.txt.zip + +cat europarl-v7.ro-en.en SETIMES2.en-ro.en > corpus.en +cat europarl-v7.ro-en.ro SETIMES2.en-ro.ro > corpus.ro + +cd .. diff --git a/examples/training/scripts/normalise-romanian.py b/examples/training/scripts/normalise-romanian.py new file mode 100755 index 00000000..7d5e86ca --- /dev/null +++ b/examples/training/scripts/normalise-romanian.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Author: Barry Haddow +# Distributed under MIT license + +# +# Normalise Romanian s-comma and t-comma + +import io +import sys +istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in istream: + line = line.replace("\u015e", "\u0218").replace("\u015f", "\u0219") + line = line.replace("\u0162", "\u021a").replace("\u0163", "\u021b") + ostream.write(line) diff --git a/examples/training/scripts/preprocess.sh b/examples/training/scripts/preprocess.sh new file mode 100755 index 00000000..924f8971 --- /dev/null +++ b/examples/training/scripts/preprocess.sh @@ -0,0 +1,75 @@ +#!/bin/bash -v + +# this sample script preprocesses a sample corpus, including tokenization, +# truecasing, and subword segmentation.
+# for application to a different language pair, +# change source and target prefix, optionally the number of BPE operations, +# and the file names (currently, data/corpus and data/newsdev2016 are being processed) + +# in the tokenization step, you will want to remove Romanian-specific normalization / diacritic removal, +# and you may want to add your own. +# also, you may want to learn BPE segmentations separately for each language, +# especially if they differ in their alphabet + +# suffix of source language files +SRC=ro + +# suffix of target language files +TRG=en + +# number of merge operations. Network vocabulary should be slightly larger (to include characters), +# or smaller if the operations are learned on the joint vocabulary +bpe_operations=60000 + +# path to moses decoder: https://github.com/moses-smt/mosesdecoder +mosesdecoder=mosesdecoder + +# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt +subword_nmt=subword-nmt + +# tokenize +for prefix in corpus newsdev2016 + do + cat data/$prefix.$SRC | \ + $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC | \ + ./scripts/normalise-romanian.py | \ + ./scripts/remove-diacritics.py | \ + $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC + + cat data/$prefix.$TRG | \ + $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG | \ + $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG + + done + +# clean empty and long sentences, and sentences with high source-target ratio (training corpus only) +$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok $SRC $TRG data/corpus.tok.clean 1 80 + +# train truecaser +$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model model/truecase-model.$SRC +$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model model/truecase-model.$TRG + +# apply truecaser (cleaned training corpus)
+for prefix in corpus + do + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG + done + +# apply truecaser (dev/test files) +for prefix in newsdev2016 + do + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG + done + +# train BPE +cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe + +# apply BPE + +for prefix in corpus newsdev2016 + do + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG + done diff --git a/examples/training/scripts/remove-diacritics.py b/examples/training/scripts/remove-diacritics.py new file mode 100755 index 00000000..46e4f6db --- /dev/null +++ b/examples/training/scripts/remove-diacritics.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Author: Barry Haddow +# Distributed under MIT license + +# +# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised + +import io +import sys +istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in istream: + line = line.replace("\u0218", "S").replace("\u0219", "s") #s-comma + line = line.replace("\u021a", "T").replace("\u021b", "t") #t-comma + line = line.replace("\u0102", "A").replace("\u0103", "a") + line = line.replace("\u00C2", "A").replace("\u00E2", "a") + line = line.replace("\u00CE", "I").replace("\u00EE", "i") + ostream.write(line) diff --git a/examples/training/scripts/validate.sh b/examples/training/scripts/validate.sh new file mode 100755 index 00000000..992689de --- /dev/null +++ b/examples/training/scripts/validate.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +#model prefix +prefix=model/model.npz + +dev=data/newsdev2016.bpe.ro +ref=data/newsdev2016.tok.en + +# decode + +cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml -b 12 -n --mini-batch 10 --maxi-batch 100 2>/dev/null \ + | sed 's/\@\@ //g' | mosesdecoder/scripts/recaser/detruecase.perl > $dev.output.postprocessed.dev + +## get BLEU +BLEU=`./mosesdecoder/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed.dev | cut -f 3 -d ' ' | cut -f 1 -d ','` + +echo $BLEU