diff options
author | Pedro Coelho <pedrodiascoelho97@gmail.com> | 2021-02-18 12:58:00 +0300 |
---|---|---|
committer | Pedro Coelho <pedrodiascoelho97@gmail.com> | 2021-02-18 12:58:00 +0300 |
commit | 5d0121133813b70d7b3ffc0def8f2964d74efd1f (patch) | |
tree | fa98d4ca3d4b36e53b0756aaa031913542e411b1 | |
parent | 20335b8642159335b9c958032d81b46938480a7f (diff) |
add test for training with factors
-rw-r--r-- | tests/training/features/factors/factors.expected | 44 | ||||
-rwxr-xr-x | tests/training/features/factors/setup.sh | 54 | ||||
-rw-r--r-- | tests/training/features/factors/test_factors.sh | 32 |
3 files changed, 130 insertions, 0 deletions
diff --git a/tests/training/features/factors/factors.expected b/tests/training/features/factors/factors.expected
new file mode 100644
index 0000000..0164290
--- /dev/null
+++ b/tests/training/features/factors/factors.expected
@@ -0,0 +1,44 @@
+214.03384399
+234.90438843
+227.33822632
+229.79216003
+223.72485352
+220.85546875
+216.96545410
+210.97512817
+217.42474365
+217.29472351
+222.89697266
+225.04882812
+221.97128296
+221.55412292
+210.07235718
+214.99653625
+210.14802551
+202.47036743
+229.11576843
+213.29634094
+223.94625854
+213.39431763
+222.35632324
+208.29580688
+203.60266113
+210.88558960
+194.14140320
+232.09394836
+209.24897766
+215.81555176
+205.87390137
+216.11462402
+209.60700989
+198.33728027
+212.82144165
+186.60208130
+228.83305359
+203.70266724
+216.54733276
+201.15771484
+213.18595886
+202.07525635
+195.43476868
+209.35302734
diff --git a/tests/training/features/factors/setup.sh b/tests/training/features/factors/setup.sh
new file mode 100755
index 0000000..ac22a0c
--- /dev/null
+++ b/tests/training/features/factors/setup.sh
@@ -0,0 +1,54 @@
+#!/bin/bash -x
+
+#####################################################################
+# AUTHOR: pedrodiascoelho
+#####################################################################
+
+# Exit on error
+set -e
+
+# Test code goes here
+test -f $MRT_DATA/europarl.de-en/toy.bpe.en || exit 1
+test -f $MRT_DATA/europarl.de-en/toy.bpe.de || exit 1
+
+#escape carachters #:_\|
+test -s toy.bpe.esc.en || cat $MRT_DATA/europarl.de-en/toy.bpe.en | sed 's/#/\&htg;/g;s/:/\&cln;/g;s/_/\&usc;/g;s/\\/\&esc;/g;s/|/\&ppe;/g' \
+    > toy.bpe.esc.en
+test -s toy.bpe.esc.de || cat $MRT_DATA/europarl.de-en/toy.bpe.de | sed 's/#/\&htg;/g;s/:/\&cln;/g;s/_/\&usc;/g;s/\\/\&esc;/g;s/|/\&ppe;/g' \
+    > toy.bpe.esc.de
+
+#add factors to replace @@ markers. s1 is used if a word is a subword (if it has the suffix @@), s0 is used otherwise
+test -s toy.bpe.fact.en || cat toy.bpe.esc.en | sed 's/\(\s\|$\)/|s0 /g;s/@@|s0/|s1/g;s/\s*$//' > toy.bpe.fact.en
+test -s toy.bpe.fact.de || cat toy.bpe.esc.de | sed 's/\(\s\|$\)/|s0 /g;s/@@|s0/|s1/g;s/\s*$//' > toy.bpe.fact.de
+
+#creates factored vocabulary
+if [[ ! -s vocab.en.fsv ]]; then
+    echo '_lemma
+
+_s
+s0 : _s
+s1 : _s
+
+</s> : _lemma
+<unk> : _lemma' > vocab.en.fsv
+
+    sed -i 's/@@//g' toy.bpe.esc.en
+    $MRT_MARIAN/marian-vocab < toy.bpe.esc.en | grep -v '<\/s>\|<unk>' | sed 's/"//g' | sed 's/:.*$/ : _lemma _has_s/' >> vocab.en.fsv
+fi
+
+if [[ ! -s vocab.de.fsv ]]; then
+    echo '_lemma
+
+_s
+s0 : _s
+s1 : _s
+
+</s> : _lemma
+<unk> : _lemma' > vocab.de.fsv
+
+    sed -i 's/@@//g' toy.bpe.esc.de
+    $MRT_MARIAN/marian-vocab < toy.bpe.esc.de | grep -v '<\/s>\|<unk>' | sed 's/"//g' | sed 's/:.*$/ : _lemma _has_s/' >> vocab.de.fsv
+fi
+
+# Exit with success code
+exit 0
diff --git a/tests/training/features/factors/test_factors.sh b/tests/training/features/factors/test_factors.sh
new file mode 100644
index 0000000..97deb6e
--- /dev/null
+++ b/tests/training/features/factors/test_factors.sh
@@ -0,0 +1,32 @@
+#!/bin/bash -x
+
+#####################################################################
+# SUMMARY: Training a factored model
+# AUTHOR: pedrodiascoelho
+# TAGS: factors
+#####################################################################
+
+# Exit on error
+set -e
+
+# Remove old artifacts and create working directory
+rm -rf factors factors.{log,out,diff}
+mkdir -p factors
+
+# Run marian command
+$MRT_MARIAN/marian \
+    --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none \
+    -m factors/model.npz -t toy.bpe.fact.{en,de} -v vocab.en.fsv vocab.de.fsv \
+    --disp-freq 5 -e 5 \
+    --log factors.log
+
+# Check if files exist
+test -e factors/model.npz
+test -e factors.log
+
+# Compare the current output with the expected output
+cat factors.log | $MRT_TOOLS/extract-costs.sh > factors.out
+$MRT_TOOLS/diff-nums.py factors.out factors.expected -o factors.diff
+
+# Exit with success code
+exit 0