diff options
author | Roman Grundkiewicz <roman.grundkiewicz@microsoft.com> | 2020-06-25 18:45:02 +0300 |
---|---|---|
committer | Roman Grundkiewicz <roman.grundkiewicz@microsoft.com> | 2020-06-25 18:45:02 +0300 |
commit | 7b31b4eb387a6037ee970afddf5050829d0bf649 (patch) | |
tree | f9b2068497ebd3d3aa8426ca24edc6ffe803e0ce /tests | |
parent | e7ee6bcc285c5883265cc83b9a7baca0bfd9d256 (diff) |
Basic tests for guided alignment and data weighting with --tsv
Diffstat (limited to 'tests')
6 files changed, 84 insertions, 8 deletions
```diff
diff --git a/tests/interface/input-tsv/.gitignore b/tests/interface/input-tsv/.gitignore
index 972de4d..7760deb 100644
--- a/tests/interface/input-tsv/.gitignore
+++ b/tests/interface/input-tsv/.gitignore
@@ -14,7 +14,9 @@ train_lm
 train_empty_lines
 train_extra_tabs
 train_align
+train_align0
 train_weights
+train_weights0
 train_align_weights
 
 train.de
@@ -25,6 +27,7 @@ train.bpe.en
 train.bpe.tsv
 train_empty_lines.tsv
 train_extra_tabs.tsv
+train2*.tsv
 
 valid
 valid.tsv
diff --git a/tests/interface/input-tsv/setup.sh b/tests/interface/input-tsv/setup.sh
index 9888ee6..2b213e2 100644
--- a/tests/interface/input-tsv/setup.sh
+++ b/tests/interface/input-tsv/setup.sh
@@ -18,3 +18,8 @@ test -s train.tsv || paste train.{de,en} > train.tsv
 test -s train.bpe.de || cat $MRT_DATA/train.max50.de > train.bpe.de
 test -s train.bpe.en || cat $MRT_DATA/train.max50.en > train.bpe.en
 test -s train.bpe.tsv || paste train.bpe.{de,en} > train.bpe.tsv
+
+test -s train2.de-en-aln.tsv || paste train2.{de,en,aln} > train2.de-en-aln.tsv
+test -s train2.aln-de-en.tsv || paste train2.{aln,de,en} > train2.aln-de-en.tsv
+test -s train2.de-en-w.tsv || paste train2.{de,en,w} > train2.de-en-w.tsv
+test -s train2.w-de-en.tsv || paste train2.{w,de,en} > train2.w-de-en.tsv
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align.sh b/tests/interface/input-tsv/test_tsv_train_with_align.sh
index 90079fd..8edf098 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align.sh
@@ -2,7 +2,7 @@
 
 #####################################################################
 # SUMMARY: Train a model on TSV data with guided alignment
-# TAGS: sentencepiece tsv train
+# TAGS: sentencepiece tsv train align
 #####################################################################
 
 # Exit on error
@@ -15,16 +15,16 @@ mkdir -p train_align
 # Run marian command
 $MRT_MARIAN/marian \
     --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
-    -m train_align/model.npz -t train2.de train2.en -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
+    -m train_align/model.npz --tsv -t train2.de-en-aln.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
     --after-batches 100 --disp-freq 4 \
-    --guided-alignment train2.aln --guided-alignment-weight 1.0 \
+    --guided-alignment 2 --guided-alignment-weight 1.0 \
     --log train_align.log
 
 
 # Check if files exist
 test -e train_align/model.npz
 test -e train_align.log
-grep -qi "word alignments from file" train_align.log
+grep -qi "word alignments from" train_align.log
 
 # Compare the current output with the expected output
 cat train_align.log | $MRT_TOOLS/extract-costs.sh > train_align.out
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh b/tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh
new file mode 100644
index 0000000..85758e3
--- /dev/null
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh
@@ -0,0 +1,34 @@
+#!/bin/bash -x
+
+#####################################################################
+# SUMMARY: Train a model on TSV data with guided alignment
+# TAGS: sentencepiece tsv train align
+#####################################################################
+
+# Exit on error
+set -e
+
+# Remove old artifacts and create working directory
+rm -rf train_align0 train_align0.{log,out,diff}
+mkdir -p train_align0
+
+# Run marian command
+$MRT_MARIAN/marian \
+    --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+    -m train_align0/model.npz --tsv -t train2.aln-de-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
+    --after-batches 100 --disp-freq 4 \
+    --guided-alignment 0 --guided-alignment-weight 1.0 \
+    --log train_align0.log
+
+
+# Check if files exist
+test -e train_align0/model.npz
+test -e train_align0.log
+grep -qi "word alignments from" train_align0.log
+
+# Compare the current output with the expected output
+cat train_align0.log | $MRT_TOOLS/extract-costs.sh > train_align0.out
+$MRT_TOOLS/diff-nums.py train_align0.out train_align.expected -p 0.01 -o train_align0.diff
+
+# Exit with success code
+exit 0
diff --git a/tests/interface/input-tsv/test_tsv_train_with_weights.sh b/tests/interface/input-tsv/test_tsv_train_with_weights.sh
index 49abde6..7d6927e 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_weights.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_weights.sh
@@ -2,7 +2,7 @@
 
 #####################################################################
 # SUMMARY: Train a model on TSV data with sentence weighting
-# TAGS: sentencepiece tsv train
+# TAGS: sentencepiece tsv train dataweights
 #####################################################################
 
 # Exit on error
@@ -15,16 +15,16 @@ mkdir -p train_weights
 # Run marian command
 $MRT_MARIAN/marian \
     --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
-    -m train_weights/model.npz -t train2.de train2.en -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
+    -m train_weights/model.npz --tsv -t train2.de-en-w.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
     --after-batches 100 --disp-freq 4 \
-    --data-weighting train2.w --data-weighting-type sentence \
+    --data-weighting 2 --data-weighting-type sentence \
     --log train_weights.log
 
 
 # Check if files exist
 test -e train_weights/model.npz
 test -e train_weights.log
-grep -qi "weights from file" train_weights.log
+grep -qi "weights from" train_weights.log
 
 # Compare the current output with the expected output
 cat train_weights.log | $MRT_TOOLS/extract-costs.sh > train_weights.out
diff --git a/tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh b/tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh
new file mode 100644
index 0000000..430b8c9
--- /dev/null
+++ b/tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh
@@ -0,0 +1,34 @@
+#!/bin/bash -x
+
+#####################################################################
+# SUMMARY: Train a model on TSV data with sentence weighting
+# TAGS: sentencepiece tsv train dataweights
+#####################################################################
+
+# Exit on error
+set -e
+
+# Remove old artifacts and create working directory
+rm -rf train_weights0 train_weights0.{log,out,diff}
+mkdir -p train_weights0
+
+# Run marian command
+$MRT_MARIAN/marian \
+    --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+    -m train_weights0/model.npz --tsv -t train2.w-de-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
+    --after-batches 100 --disp-freq 4 \
+    --data-weighting 0 --data-weighting-type sentence \
+    --log train_weights0.log
+
+
+# Check if files exist
+test -e train_weights0/model.npz
+test -e train_weights0.log
+grep -qi "weights from" train_weights0.log
+
+# Compare the current output with the expected output
+cat train_weights0.log | $MRT_TOOLS/extract-costs.sh > train_weights0.out
+$MRT_TOOLS/diff-nums.py train_weights0.out train_weights.expected -p 0.01 -o train_weights0.diff
+
+# Exit with success code
+exit 0
```