Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian-examples.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2018-11-25 18:52:09 +0300
committer Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2018-11-25 18:52:09 +0300
commit 8bf8e5d89e84a6ece037a7b7b7929d75470bf218 (patch)
tree 8406499ac61e19e8dfedeae8884bc94536c39429
parent 153c7147f99e699555b243c6b4be3ae65ac17dd8 (diff)
add romanian normalization
-rw-r--r-- training-basics-spm/data/norm_romanian.tsv 14
-rwxr-xr-x training-basics-spm/run-me.sh 7
2 files changed, 18 insertions, 3 deletions
diff --git a/training-basics-spm/data/norm_romanian.tsv b/training-basics-spm/data/norm_romanian.tsv
new file mode 100644
index 0000000..4e165d5
--- /dev/null
+++ b/training-basics-spm/data/norm_romanian.tsv
@@ -0,0 +1,14 @@
+015E 53
+015F 73
+0162 54
+0163 74
+0218 53
+0219 73
+021A 54
+021B 74
+0102 41
+0103 61
+00C2 41
+00E2 61
+00CE 49
+00EE 69
\ No newline at end of file
diff --git a/training-basics-spm/run-me.sh b/training-basics-spm/run-me.sh
index 8be3152..90ff701 100755
--- a/training-basics-spm/run-me.sh
+++ b/training-basics-spm/run-me.sh
@@ -16,7 +16,7 @@ then
exit 1
fi
-if [ ! -e "data/corpus.en" ]
+if [ ! -e "data/corpus.ro" ]
then
cd data
# get En-Ro training data for WMT16
@@ -50,8 +50,9 @@ then
--model model/model.npz \
--train-sets data/corpus.ro data/corpus.en \
--vocabs model/vocab.roen.spm model/vocab.roen.spm \
+ --sentencepiece-options '--normalization_rule_tsv=data/norm_romanian.tsv' \
--dim-vocabs 32000 32000 \
- --mini-batch-fit -w 8000 \
+ --mini-batch-fit -w 5000 \
--layer-normalization --tied-embeddings-all \
--dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
--early-stopping 5 --max-length 100 \
@@ -75,5 +76,5 @@ cat data/newstest2016.ro \
--mini-batch 64 --maxi-batch 10 --maxi-batch-sort src > data/newstest2016.ro.output
# calculate bleu scores on dev and test set
-../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/newsdev2016.en < data/newsdev2016.ro.output
+../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/newsdev2016.en < data/newsdev2016.ro.output
../tools/moses-scripts/scripts/generic/multi-bleu-detok.perl data/newstest2016.en < data/newstest2016.ro.output