Add tests with input types

author: Roman Grundkiewicz <roman.grundkiewicz@microsoft.com> 2020-06-26 21:04:26 +0300
committer: Roman Grundkiewicz <roman.grundkiewicz@microsoft.com> 2020-06-26 21:04:26 +0300
commit: 823728eb483b15448b8f3641c231a68f2e9c04a0 (patch)
tree: e728d3ab996d77d6a8bb96ee76385e4bf487c5d6 /tests
parent: e8fef26c9c57387d79660c93cf1a4923d168130f (diff)
4 files changed, 100 insertions, 2 deletions
diff --git a/tests/interface/input-tsv/.gitignore b/tests/interface/input-tsv/.gitignore
index 52173ab..5a16ccc 100644
--- a/tests/interface/input-tsv/.gitignore
+++ b/tests/interface/input-tsv/.gitignore
@@ -13,8 +13,16 @@ train_vocabs_nopaths
 train_lm
 train_empty_lines
 train_extra_tabs
-train_align*
-train_weights*
+train_align
+train_align0
+train_align_weights
+train_align_weights_intypes
+train_align_shuffle
+train_align_shuffle_ram
+train_align_stdin
+train_weights
+train_weights0
+train_intypes_stdin
 
 train.de
 train.en
diff --git a/tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh b/tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh
new file mode 100644
index 0000000..04831ef
--- /dev/null
+++ b/tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh
@@ -0,0 +1,31 @@
+#!/bin/bash -x
+
+#####################################################################
+# SUMMARY: Train a model on data in the TSV format from STDIN with input-types
+# TAGS: sentencepiece tsv train inputtypes
+#####################################################################
+
+# Exit on error
+set -e
+
+# Remove old artifacts and create working directory
+rm -rf train_intypes_stdin train_intypes_stdin.{log,out,diff}
+mkdir -p train_intypes_stdin
+
+# Run marian training on TSV data piped through STDIN, with explicit input types
+cat train.tsv | $MRT_MARIAN/marian \
+    --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+    -m train_intypes_stdin/model.npz --tsv -t stdin --input-types sequence sequence -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
+    --after-batches 10 --disp-freq 2 \
+    --log train_intypes_stdin.log
+
+# Check if files exist
+test -e train_intypes_stdin/model.npz
+test -e train_intypes_stdin.log
+
+# Compare the current output with the expected output
+cat train_intypes_stdin.log | $MRT_TOOLS/extract-costs.sh > train_intypes_stdin.out
+$MRT_TOOLS/diff-nums.py train_intypes_stdin.out train.expected -p 0.01 -o train_intypes_stdin.diff
+
+# Exit with success code
+exit 0
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh b/tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh
new file mode 100644
index 0000000..a983f22
--- /dev/null
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh
@@ -0,0 +1,34 @@
+#!/bin/bash -x
+
+#####################################################################
+# SUMMARY: Train a model on TSV data with guided alignment and data weighting using input-types
+# TAGS: sentencepiece tsv train align dataweights inputtypes
+#####################################################################
+
+# Exit on error
+set -e
+
+# Remove old artifacts and create working directory
+rm -rf train_align_weights_intypes train_align_weights_intypes.{log,out,diff}
+mkdir -p train_align_weights_intypes
+
+# Run marian training on a TSV file with explicit input types and guided alignment
+$MRT_MARIAN/marian \
+    --no-shuffle --seed 7777 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+    -m train_align_weights_intypes/model.npz --tsv -t train2.de-w-aln-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
+    --after-batches 60 --disp-freq 4 \
+    --input-types sequence weight alignment sequence --guided-alignment-weight 1.0 \
+    --log train_align_weights_intypes.log
+
+# Check if files exist and the log mentions word alignments and weights
+test -e train_align_weights_intypes/model.npz
+test -e train_align_weights_intypes.log
+grep -qi "word alignments from" train_align_weights_intypes.log
+grep -qi "weights from" train_align_weights_intypes.log
+
+# Compare the current output with the expected output
+cat train_align_weights_intypes.log | $MRT_TOOLS/extract-costs.sh > train_align_weights_intypes.out
+$MRT_TOOLS/diff-nums.py train_align_weights_intypes.out train_align_weights.expected -p 0.01 -o train_align_weights_intypes.diff
+
+# Exit with success code
+exit 0
diff --git a/tests/interface/input-tsv/train_align_shuffle.expected b/tests/interface/input-tsv/train_align_shuffle.expected
new file mode 100644
index 0000000..9acf3eb
--- /dev/null
+++ b/tests/interface/input-tsv/train_align_shuffle.expected
@@ -0,0 +1,25 @@
+266.17364502
+272.09991455
+258.59213257
+242.22854614
+245.76492310
+250.07225037
+231.29319763
+237.89175415
+223.80023193
+222.42335510
+226.13597107
+213.71946716
+213.41311646
+216.42651367
+214.63465881
+200.07705688
+200.67541504
+212.30265808
+206.34919739
+199.96444702
+200.64167786
+197.81892395
+197.48988342
+195.52680969
+192.76986694
author	Roman Grundkiewicz <roman.grundkiewicz@microsoft.com>	2020-06-26 21:04:26 +0300
committer	Roman Grundkiewicz <roman.grundkiewicz@microsoft.com>	2020-06-26 21:04:26 +0300
commit	823728eb483b15448b8f3641c231a68f2e9c04a0 (patch)
tree	e728d3ab996d77d6a8bb96ee76385e4bf487c5d6 /tests
parent	e8fef26c9c57387d79660c93cf1a4923d168130f (diff)