diff options
author | Marcin Junczys-Dowmunt <marcinjd@microsoft.com> | 2018-11-25 18:52:09 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <marcinjd@microsoft.com> | 2018-11-25 18:52:09 +0300 |
commit | 8bf8e5d89e84a6ece037a7b7b7929d75470bf218 (patch) | |
tree | 8406499ac61e19e8dfedeae8884bc94536c39429 | |
parent | 153c7147f99e699555b243c6b4be3ae65ac17dd8 (diff) |
add romanian normalization
-rw-r--r-- | training-basics-spm/data/norm_romanian.tsv | 14 | ||||
-rwxr-xr-x | training-basics-spm/run-me.sh | 7 |
2 files changed, 18 insertions, 3 deletions
```diff
diff --git a/training-basics-spm/data/norm_romanian.tsv b/training-basics-spm/data/norm_romanian.tsv
new file mode 100644
index 0000000..4e165d5
--- /dev/null
+++ b/training-basics-spm/data/norm_romanian.tsv
@@ -0,0 +1,14 @@
+015E 53
+015F 73
+0162 54
+0163 74
+0218 53
+0219 73
+021A 54
+021B 74
+0102 41
+0103 61
+00C2 41
+00E2 61
+00CE 49
+00EE 69
\ No newline at end of file
diff --git a/training-basics-spm/run-me.sh b/training-basics-spm/run-me.sh
index 8be3152..90ff701 100755
--- a/training-basics-spm/run-me.sh
+++ b/training-basics-spm/run-me.sh
@@ -16,7 +16,7 @@ then
 exit 1
 fi
 
-if [ ! -e "data/corpus.en" ]
+if [ ! -e "data/corpus.ro" ]
 then
 cd data
 # get En-Ro training data for WMT16
@@ -50,8 +50,9 @@ then
 --model model/model.npz \
 --train-sets data/corpus.ro data/corpus.en \
 --vocabs model/vocab.roen.spm model/vocab.roen.spm \
+ --sentencepiece-options '--normalization_rule_tsv=data/norm_romanian.tsv' \
 --dim-vocabs 32000 32000 \
- --mini-batch-fit -w 8000 \
+ --mini-batch-fit -w 5000 \
 --layer-normalization --tied-embeddings-all \
 --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
 --early-stopping 5 --max-length 100 \
@@ -75,5 +76,5 @@ cat data/newstest2016.ro \
 --mini-batch 64 --maxi-batch 10 --maxi-batch-sort src > data/newstest2016.ro.output
 
 # calculate bleu scores on dev and test set
-../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/newsdev2016.en < data/newsdev2016.ro.output
+../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/newsdev2016.en < data/newsdev2016.ro.output
 ../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/newstest2016.en < data/newstest2016.ro.output
```