Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian-regression-tests.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPedro Coelho <pedrodiascoelho97@gmail.com>2021-02-18 12:58:00 +0300
committerPedro Coelho <pedrodiascoelho97@gmail.com>2021-02-18 12:58:00 +0300
commit5d0121133813b70d7b3ffc0def8f2964d74efd1f (patch)
treefa98d4ca3d4b36e53b0756aaa031913542e411b1
parent20335b8642159335b9c958032d81b46938480a7f (diff)
add test for training with factors
-rw-r--r--tests/training/features/factors/factors.expected44
-rwxr-xr-xtests/training/features/factors/setup.sh54
-rw-r--r--tests/training/features/factors/test_factors.sh32
3 files changed, 130 insertions, 0 deletions
diff --git a/tests/training/features/factors/factors.expected b/tests/training/features/factors/factors.expected
new file mode 100644
index 0000000..0164290
--- /dev/null
+++ b/tests/training/features/factors/factors.expected
@@ -0,0 +1,44 @@
+214.03384399
+234.90438843
+227.33822632
+229.79216003
+223.72485352
+220.85546875
+216.96545410
+210.97512817
+217.42474365
+217.29472351
+222.89697266
+225.04882812
+221.97128296
+221.55412292
+210.07235718
+214.99653625
+210.14802551
+202.47036743
+229.11576843
+213.29634094
+223.94625854
+213.39431763
+222.35632324
+208.29580688
+203.60266113
+210.88558960
+194.14140320
+232.09394836
+209.24897766
+215.81555176
+205.87390137
+216.11462402
+209.60700989
+198.33728027
+212.82144165
+186.60208130
+228.83305359
+203.70266724
+216.54733276
+201.15771484
+213.18595886
+202.07525635
+195.43476868
+209.35302734
diff --git a/tests/training/features/factors/setup.sh b/tests/training/features/factors/setup.sh
new file mode 100755
index 0000000..ac22a0c
--- /dev/null
+++ b/tests/training/features/factors/setup.sh
@@ -0,0 +1,54 @@
+#!/bin/bash -x
+
+#####################################################################
+# AUTHOR: pedrodiascoelho
+#####################################################################
+#
+# Prepares the inputs used by the factored-training test in this directory.
+# For each language (en, de) the following files are created in the current
+# directory unless they already exist and are non-empty:
+#   toy.bpe.esc.LANG   corpus with the characters # : _ \ | escaped
+#   toy.bpe.fact.LANG  escaped corpus with a |s0 or |s1 factor on every token
+#   vocab.LANG.fsv     factored vocabulary built with marian-vocab
+#
+# Reads: $MRT_DATA/europarl.de-en/toy.bpe.{en,de}
+# Runs:  $MRT_MARIAN/marian-vocab
+# Exits non-zero if an input is missing or any generation step fails.
+
+# Exit on error. pipefail makes a failure in the middle of a pipeline
+# (e.g. marian-vocab) fatal instead of being hidden by the last stage.
+set -e
+set -o pipefail
+
+# Runs a command with stdout captured in TARGET.tmp and renames the result to
+# TARGET only after the command succeeded. A failed or interrupted run can
+# therefore leave TARGET.tmp behind, but never a partial TARGET that the
+# "already exists" checks below would accept as finished.
+# Arguments: $1 - target file; remaining - command to run (stdin is inherited)
+generate() {
+  local target=$1
+  shift
+  "$@" > "${target}.tmp"
+  mv -- "${target}.tmp" "$target"
+}
+
+# Filter (stdin -> stdout): escapes the characters # : _ \ | as
+# &htg; &cln; &usc; &esc; &ppe; respectively.
+escape_specials() {
+  sed 's/#/\&htg;/g;s/:/\&cln;/g;s/_/\&usc;/g;s/\\/\&esc;/g;s/|/\&ppe;/g'
+}
+
+# Filter (stdin -> stdout): adds factors that replace the @@ markers.
+# s1 is used if a word is a subword (it has the suffix @@), s0 otherwise.
+add_factors() {
+  sed 's/\(\s\|$\)/|s0 /g;s/@@|s0/|s1/g;s/\s*$//'
+}
+
+# Filter (stdin -> stdout): prints a factored vocabulary for the escaped
+# corpus read from stdin. The fixed header declares the factor groups; the
+# lemma entries are taken from the marian-vocab output with </s> and <unk>
+# dropped, because the header already lists them.
+# The @@ markers are removed on the fly rather than with "sed -i" on the
+# corpus file, so toy.bpe.esc.LANG stays a valid source for toy.bpe.fact.LANG.
+factored_vocab() {
+  cat <<'EOF'
+_lemma
+
+_s
+s0 : _s
+s1 : _s
+
+</s> : _lemma
+<unk> : _lemma
+EOF
+  sed 's/@@//g' \
+    | "$MRT_MARIAN/marian-vocab" \
+    | grep -v '<\/s>\|<unk>' \
+    | sed 's/"//g;s/:.*$/ : _lemma _has_s/'
+}
+
+# Both raw corpora must be present before anything is generated
+for lang in en de; do
+  if [[ ! -f "$MRT_DATA/europarl.de-en/toy.bpe.$lang" ]]; then
+    echo "Missing input: $MRT_DATA/europarl.de-en/toy.bpe.$lang" >&2
+    exit 1
+  fi
+done
+
+for lang in en de; do
+  # Escape characters #:_\|
+  if [[ ! -s "toy.bpe.esc.$lang" ]]; then
+    generate "toy.bpe.esc.$lang" escape_specials \
+      < "$MRT_DATA/europarl.de-en/toy.bpe.$lang"
+  fi
+
+  # Annotate every token with its subword factor
+  if [[ ! -s "toy.bpe.fact.$lang" ]]; then
+    generate "toy.bpe.fact.$lang" add_factors < "toy.bpe.esc.$lang"
+  fi
+
+  # Create the factored vocabulary
+  if [[ ! -s "vocab.$lang.fsv" ]]; then
+    generate "vocab.$lang.fsv" factored_vocab < "toy.bpe.esc.$lang"
+  fi
+done
+
+# Exit with success code
+exit 0
diff --git a/tests/training/features/factors/test_factors.sh b/tests/training/features/factors/test_factors.sh
new file mode 100644
index 0000000..97deb6e
--- /dev/null
+++ b/tests/training/features/factors/test_factors.sh
@@ -0,0 +1,32 @@
+#!/bin/bash -x
+
+#####################################################################
+# SUMMARY: Training a factored model
+# AUTHOR: pedrodiascoelho
+# TAGS: factors
+#####################################################################
+
+# Exit on error
+set -e
+
+# Remove old artifacts and create working directory
+rm -rf factors factors.{log,out,diff}
+mkdir -p factors
+
+# Run marian command
+$MRT_MARIAN/marian \
+ --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none \
+ -m factors/model.npz -t toy.bpe.fact.{en,de} -v vocab.en.fsv vocab.de.fsv \
+ --disp-freq 5 -e 5 \
+ --log factors.log
+
+# Check if files exist
+test -e factors/model.npz
+test -e factors.log
+
+# Compare the current output with the expected output
+cat factors.log | $MRT_TOOLS/extract-costs.sh > factors.out
+$MRT_TOOLS/diff-nums.py factors.out factors.expected -o factors.diff
+
+# Exit with success code
+exit 0