diff options
author | Pedro Coelho <pedrodiascoelho97@gmail.com> | 2021-02-23 14:16:27 +0300 |
---|---|---|
committer | Pedro Coelho <pedrodiascoelho97@gmail.com> | 2021-02-23 14:16:27 +0300 |
commit | 0a396730bea7ca4ab5e732d39f510fde90c7dabe (patch) | |
tree | f51c4bf6cd632910b1b8b44880a986ce36bf9203 | |
parent | c505cf1fc66e4d4df039456c9633fb8689bd5155 (diff) |
updated and simplified factors training test
- added `--clip-norm 0` to the training command
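- added verification that factors are actually being used (the log is checked for the "Factored embeddings enabled" and "Factored outputs enabled" messages)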
- factored vocabulary is no longer constructed on the fly; the one already in the models folder ($MRT_MODELS/factors/vocab.{en,de}.fsv) is used instead
- made training test marian-10 friendly
-rw-r--r-- | tests/training/features/factors/factors.expected | 88 |
-rwxr-xr-x | tests/training/features/factors/setup.sh | 42 |
-rw-r--r-- | tests/training/features/factors/test_factors.sh | 6 |
3 files changed, 52 insertions, 84 deletions
diff --git a/tests/training/features/factors/factors.expected b/tests/training/features/factors/factors.expected index 0164290..70ee0cf 100644 --- a/tests/training/features/factors/factors.expected +++ b/tests/training/features/factors/factors.expected @@ -1,44 +1,44 @@ -214.03384399 -234.90438843 -227.33822632 -229.79216003 -223.72485352 -220.85546875 -216.96545410 -210.97512817 -217.42474365 -217.29472351 -222.89697266 -225.04882812 -221.97128296 -221.55412292 -210.07235718 -214.99653625 -210.14802551 -202.47036743 -229.11576843 -213.29634094 -223.94625854 -213.39431763 -222.35632324 -208.29580688 -203.60266113 -210.88558960 -194.14140320 -232.09394836 -209.24897766 -215.81555176 -205.87390137 -216.11462402 -209.60700989 -198.33728027 -212.82144165 -186.60208130 -228.83305359 -203.70266724 -216.54733276 -201.15771484 -213.18595886 -202.07525635 -195.43476868 -209.35302734 +10.77032471 +10.66634846 +10.57369232 +10.49110889 +10.42863846 +10.37640285 +10.31346226 +10.26187325 +10.20974541 +10.14718914 +10.10516739 +10.08621216 +10.04499340 +10.03392220 +10.01447582 +9.95392036 +9.91196060 +9.86151695 +9.79879570 +9.77040195 +9.74897766 +9.72190189 +9.72009087 +9.66733265 +9.62868977 +9.60218239 +9.51061630 +9.43571186 +9.44846630 +9.42066574 +9.38913155 +9.40541935 +9.32790852 +9.28542328 +9.26995373 +9.15800571 +9.10080147 +9.11642838 +9.08799362 +9.07044029 +9.08870029 +9.00327969 +8.95821762 +8.93725300 diff --git a/tests/training/features/factors/setup.sh b/tests/training/features/factors/setup.sh index ac22a0c..faa043c 100755 --- a/tests/training/features/factors/setup.sh +++ b/tests/training/features/factors/setup.sh @@ -11,44 +11,10 @@ set -e test -f $MRT_DATA/europarl.de-en/toy.bpe.en || exit 1 test -f $MRT_DATA/europarl.de-en/toy.bpe.de || exit 1 -#escape carachters #:_\| -test -s toy.bpe.esc.en || cat $MRT_DATA/europarl.de-en/toy.bpe.en | sed 's/#/\&htg;/g;s/:/\&cln;/g;s/_/\&usc;/g;s/\\/\&esc;/g;s/|/\&ppe;/g' \ - > toy.bpe.esc.en -test -s toy.bpe.esc.de 
|| cat $MRT_DATA/europarl.de-en/toy.bpe.de | sed 's/#/\&htg;/g;s/:/\&cln;/g;s/_/\&usc;/g;s/\\/\&esc;/g;s/|/\&ppe;/g' \ - > toy.bpe.esc.de - -#add factors to replace @@ markers. s1 is used if a word is a subword (if it has the suffix @@), s0 is used otherwise -test -s toy.bpe.fact.en || cat toy.bpe.esc.en | sed 's/\(\s\|$\)/|s0 /g;s/@@|s0/|s1/g;s/\s*$//' > toy.bpe.fact.en -test -s toy.bpe.fact.de || cat toy.bpe.esc.de | sed 's/\(\s\|$\)/|s0 /g;s/@@|s0/|s1/g;s/\s*$//' > toy.bpe.fact.de - -#creates factored vocabulary -if [[ ! -s vocab.en.fsv ]]; then - echo '_lemma - -_s -s0 : _s -s1 : _s - -</s> : _lemma -<unk> : _lemma' > vocab.en.fsv - - sed -i 's/@@//g' toy.bpe.esc.en - $MRT_MARIAN/marian-vocab < toy.bpe.esc.en | grep -v '<\/s>\|<unk>' | sed 's/"//g' | sed 's/:.*$/ : _lemma _has_s/' >> vocab.en.fsv -fi - -if [[ ! -s vocab.de.fsv ]]; then - echo '_lemma - -_s -s0 : _s -s1 : _s - -</s> : _lemma -<unk> : _lemma' > vocab.de.fsv - - sed -i 's/@@//g' toy.bpe.esc.de - $MRT_MARIAN/marian-vocab < toy.bpe.esc.de | grep -v '<\/s>\|<unk>' | sed 's/"//g' | sed 's/:.*$/ : _lemma _has_s/' >> vocab.de.fsv -fi +test -s toy.bpe.fact.en || cat $MRT_DATA/europarl.de-en/toy.bpe.en | \ + sed 's/\(\s\|$\)/|s0 /g;s/@@|s0/|s1/g;s/\s*$//' > toy.bpe.fact.en +test -s toy.bpe.fact.de || cat $MRT_DATA/europarl.de-en/toy.bpe.en | \ + sed 's/\(\s\|$\)/|s0 /g;s/@@|s0/|s1/g;s/\s*$//' > toy.bpe.fact.de # Exit with success code exit 0 diff --git a/tests/training/features/factors/test_factors.sh b/tests/training/features/factors/test_factors.sh index 97deb6e..193ff37 100644 --- a/tests/training/features/factors/test_factors.sh +++ b/tests/training/features/factors/test_factors.sh @@ -15,14 +15,16 @@ mkdir -p factors # Run marian command $MRT_MARIAN/marian \ - --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none \ - -m factors/model.npz -t toy.bpe.fact.{en,de} -v vocab.en.fsv vocab.de.fsv \ + --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 
--maxi-batch-sort none --clip-norm 0 \ + -m factors/model.npz -t toy.bpe.fact.{en,de} -v $MRT_MODELS/factors/vocab.{en,de}.fsv \ --disp-freq 5 -e 5 \ --log factors.log # Check if files exist test -e factors/model.npz test -e factors.log +grep -q "Factored embeddings enabled" factors.log +grep -q "Factored outputs enabled" factors.log # Compare the current output with the expected output cat factors.log | $MRT_TOOLS/extract-costs.sh > factors.out |