diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2017-03-24 20:37:35 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2017-03-24 20:37:35 +0300 |
commit | 5dbeadb47dbb70b85061aa08c8c55193d9fbe74a (patch) | |
tree | 2e0e7d1698b0c756896c6af59cde4eb4f5eabd02 /examples/training/scripts | |
parent | e7157515d3f30b44649f2f48b4a70b3999530a02 (diff) |
added training example
Diffstat (limited to 'examples/training/scripts')
-rwxr-xr-x | examples/training/scripts/download-files.sh | 22 | ||||
-rwxr-xr-x | examples/training/scripts/normalise-romanian.py | 17 | ||||
-rwxr-xr-x | examples/training/scripts/preprocess.sh | 75 | ||||
-rwxr-xr-x | examples/training/scripts/remove-diacritics.py | 20 | ||||
-rwxr-xr-x | examples/training/scripts/validate.sh | 17 |
5 files changed, 151 insertions, 0 deletions
diff --git a/examples/training/scripts/download-files.sh b/examples/training/scripts/download-files.sh new file mode 100755 index 00000000..069dcbc2 --- /dev/null +++ b/examples/training/scripts/download-files.sh @@ -0,0 +1,22 @@ +#!/bin/bash -v + +# get En-Ro training data for WMT16 + +if [ ! -f data/ro-en.tgz ]; +then + wget http://www.statmt.org/europarl/v7/ro-en.tgz -O data/ro-en.tgz +fi + +if [ ! -f data/SETIMES2.ro-en.txt.zip ]; +then + wget http://opus.lingfil.uu.se/download.php?f=SETIMES2/en-ro.txt.zip -O data/SETIMES2.ro-en.txt.zip +fi + +cd data/ +tar -xf ro-en.tgz +unzip SETIMES2.ro-en.txt.zip + +cat europarl-v7.ro-en.en SETIMES2.en-ro.en > corpus.en +cat europarl-v7.ro-en.ro SETIMES2.en-ro.ro > corpus.ro + +cd .. diff --git a/examples/training/scripts/normalise-romanian.py b/examples/training/scripts/normalise-romanian.py new file mode 100755 index 00000000..7d5e86ca --- /dev/null +++ b/examples/training/scripts/normalise-romanian.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Author: Barry Haddow +# Distributed under MIT license + +# +# Normalise Romanian s-comma and t-comma + +import io +import sys +istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in istream: + line = line.replace("\u015e", "\u0218").replace("\u015f", "\u0219") + line = line.replace("\u0162", "\u021a").replace("\u0163", "\u021b") + ostream.write(line) diff --git a/examples/training/scripts/preprocess.sh b/examples/training/scripts/preprocess.sh new file mode 100755 index 00000000..924f8971 --- /dev/null +++ b/examples/training/scripts/preprocess.sh @@ -0,0 +1,75 @@ +#!/bin/bash -v + +# this sample script preprocesses a sample corpus, including tokenization, +# truecasing, and subword segmentation.
+# for application to a different language pair, +# change source and target prefix, optionally the number of BPE operations, +# and the file names (currently, data/corpus and data/newsdev2016 are being processed) + +# in the tokenization step, you will want to remove Romanian-specific normalization / diacritic removal, +# and you may want to add your own. +# also, you may want to learn BPE segmentations separately for each language, +# especially if they differ in their alphabet + +# suffix of source language files +SRC=ro + +# suffix of target language files +TRG=en + +# number of merge operations. Network vocabulary should be slightly larger (to include characters), +# or smaller if the operations are learned on the joint vocabulary +bpe_operations=60000 + +# path to moses decoder: https://github.com/moses-smt/mosesdecoder +mosesdecoder=mosesdecoder + +# path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt +subword_nmt=subword-nmt + +# tokenize +for prefix in corpus newsdev2016 + do + cat data/$prefix.$SRC | \ + $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $SRC | \ + ./scripts/normalise-romanian.py | \ + ./scripts/remove-diacritics.py | \ + $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $SRC > data/$prefix.tok.$SRC + + cat data/$prefix.$TRG | \ + $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG | \ + $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG + + done + +# clean empty and long sentences, and sentences with high source-target ratio (training corpus only) +$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok $SRC $TRG data/corpus.tok.clean 1 80 + +# train truecaser +$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model model/truecase-model.$SRC +$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model model/truecase-model.$TRG + +# apply truecaser (cleaned training corpus)
+for prefix in corpus + do + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG + done + +# apply truecaser (dev/test files) +for prefix in newsdev2016 + do + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC + $mosesdecoder/scripts/recaser/truecase.perl -model model/truecase-model.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG + done + +# train BPE +cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe + +# apply BPE + +for prefix in corpus newsdev2016 + do + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC + $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG + done diff --git a/examples/training/scripts/remove-diacritics.py b/examples/training/scripts/remove-diacritics.py new file mode 100755 index 00000000..46e4f6db --- /dev/null +++ b/examples/training/scripts/remove-diacritics.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Author: Barry Haddow +# Distributed under MIT license + +# +# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised + +import io +import sys +istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in istream: + line = line.replace("\u0218", "S").replace("\u0219", "s") #s-comma + line = line.replace("\u021a", "T").replace("\u021b", "t") #t-comma + line = line.replace("\u0102", "A").replace("\u0103", "a") + line = line.replace("\u00C2", "A").replace("\u00E2", "a") + line = line.replace("\u00CE", "I").replace("\u00EE", "i") + ostream.write(line) diff --git a/examples/training/scripts/validate.sh b/examples/training/scripts/validate.sh new file mode 100755 index 00000000..992689de --- /dev/null +++ b/examples/training/scripts/validate.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +#model prefix +prefix=model/model.npz + +dev=data/newsdev2016.bpe.ro +ref=data/newsdev2016.tok.en + +# decode + +cat $dev | ../../build/amun -c $prefix.dev.npz.amun.yml -b 12 -n --mini-batch 10 --maxi-batch 100 2>/dev/null \ + | sed 's/\@\@ //g' | mosesdecoder/scripts/recaser/detruecase.perl > $dev.output.postprocessed.dev + +## get BLEU +BLEU=`./mosesdecoder/scripts/generic/multi-bleu.perl $ref < $dev.output.postprocessed.dev | cut -f 3 -d ' ' | cut -f 1 -d ','` + +echo $BLEU