github.com/marian-nmt/marian-regression-tests.git
author    Roman Grundkiewicz <rgrundkiewicz@gmail.com>  2021-01-25 17:41:38 +0300
committer GitHub <noreply@github.com>  2021-01-25 17:41:38 +0300
commit    4c44a8d92090a010d0cedf5fe2c26e692d51d20d (patch)
tree      12d8c9a0dc459c9b6977481d7988e81a621ca0d9
parent    97b2f95abab6134c1632b286e373e513ecc52020 (diff)
parent    88e775a09ad7326d1e974e150c5febfe06fc80ce (diff)
Merge pull request #74 from marian-nmt/clip-norm-0
Update tests to set --clip-norm to 0 by default
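The diffs below all make the same mechanical change: inserting `--clip-norm 0` into each test's `marian` invocation. A bulk edit like this can be sketched with `sed`; the file name and commands below are illustrative, not the actual workflow used for this PR:

```shell
# Hypothetical sketch: insert "--clip-norm 0" after "--no-shuffle" in every
# test script that does not already set it. The demo file is illustrative,
# not one of the real test files in this repository.
cat > /tmp/test_clipnorm_demo.sh <<'EOF'
$MRT_MARIAN/marian \
    --cost-type ce-mean --no-shuffle --seed 1111 --optimizer sgd \
    --log train.log
EOF

for f in /tmp/test_clipnorm_demo.sh; do
    # Skip files that already mention --clip-norm, so the edit is idempotent
    grep -q -- '--clip-norm' "$f" || \
        sed -i 's/--no-shuffle/--no-shuffle --clip-norm 0/' "$f"
done

grep -- '--clip-norm 0' /tmp/test_clipnorm_demo.sh
```

Note that `sed -i` is the GNU form; BSD `sed` needs `sed -i ''`. A handful of scripts in this PR (those without `--no-shuffle`) instead append the flag elsewhere on the options line, which a one-pattern substitution would not cover.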
-rw-r--r-- tests/interface/input-tsv/restore_stdin.expected | 60
-rw-r--r-- tests/interface/input-tsv/test_tsv_train.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_assume_stdin.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_create_vocab_joint.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_create_vocabs.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_create_vocabs_yml.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_mini_batch_fit.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_mini_batch_fit_stdin.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_restore_from_stdin.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_shuffle.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_shuffle_in_ram.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_stdin.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_stdin_2_epochs.sh | 4
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_stdin_empty_fields.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_stdin_lm.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_align.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_align_and_weights.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_align_shuffle.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_align_shuffle_in_ram.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_align_stdin.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_weights.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh | 2
-rw-r--r-- tests/interface/input-tsv/test_tsv_valid.sh | 2
-rw-r--r-- tests/interface/input-tsv/train.expected | 10
-rw-r--r-- tests/interface/input-tsv/train_align.expected | 50
-rw-r--r-- tests/interface/input-tsv/train_align_stdin.expected | 14
-rw-r--r-- tests/interface/input-tsv/train_empty_lines.expected | 32
-rw-r--r-- tests/interface/input-tsv/train_fit.expected | 10
-rw-r--r-- tests/interface/input-tsv/train_fit_stdin.expected | 6
-rw-r--r-- tests/interface/input-tsv/train_lm.expected | 10
-rw-r--r-- tests/interface/input-tsv/train_shuffle.expected | 10
-rw-r--r-- tests/interface/input-tsv/train_stdin_2e.expected | 12
-rw-r--r-- tests/interface/input-tsv/train_vocab.expected | 20
-rw-r--r-- tests/interface/input-tsv/train_vocabs.expected | 20
-rw-r--r-- tests/interface/input-tsv/train_vocabs_yml.expected | 20
-rwxr-xr-x tests/interface/input-tsv/update_outputs.sh | 27
-rw-r--r-- tests/training/basics/.gitignore | 1
-rw-r--r-- tests/training/basics/test_tiny_vocab.sh | 31
-rw-r--r-- tests/training/basics/test_toy_vocab.sh | 31
-rw-r--r-- tests/training/basics/tiny.expected | 44
-rw-r--r-- tests/training/basics/toy.expected | 44
-rw-r--r-- tests/training/cost-functions/ce-mean.expected | 16
-rw-r--r-- tests/training/cost-functions/ce-sum.expected | 16
-rw-r--r-- tests/training/cost-functions/perplexity.expected | 16
-rw-r--r-- tests/training/cost-functions/test_ce-mean-words.sh | 2
-rw-r--r-- tests/training/cost-functions/test_ce-mean.sh | 2
-rw-r--r-- tests/training/cost-functions/test_ce-sum.sh | 2
-rw-r--r-- tests/training/cost-functions/test_perplexity.sh | 2
-rw-r--r-- tests/training/features/data-weighting/maxibatch.expected | 20
-rw-r--r-- tests/training/features/data-weighting/sqlite.expected | 200
-rw-r--r-- tests/training/features/data-weighting/sqlite_word.expected | 28
-rw-r--r-- tests/training/features/data-weighting/test_maxi_batches_with_sentence_weights.sh | 2
-rw-r--r-- tests/training/features/data-weighting/test_maxi_batches_with_word_weights.sh | 2
-rw-r--r-- tests/training/features/data-weighting/test_sentence_weighting_sqlite.sh | 2
-rw-r--r-- tests/training/features/data-weighting/test_word_weighting_sqlite.sh | 2
-rw-r--r-- tests/training/features/data-weighting/test_word_weighting_with_eos.sh | 2
-rw-r--r-- tests/training/features/data-weighting/test_word_weighting_with_twos.sh | 4
-rw-r--r-- tests/training/features/data-weighting/test_word_weighting_with_twos_sync.sh | 2
-rw-r--r-- tests/training/features/data-weighting/word_eos.expected | 34
-rw-r--r-- tests/training/features/data-weighting/word_maxibatch.expected | 20
-rw-r--r-- tests/training/features/data-weighting/word_twos.expected | 34
-rw-r--r-- tests/training/features/data-weighting/word_twos_sync.expected | 34
-rw-r--r-- tests/training/features/exp-smoothing/test_expsmooth.sh | 6
-rw-r--r-- tests/training/features/exp-smoothing/test_expsmooth_sync.sh | 6
-rw-r--r-- tests/training/features/guided-alignment/test_guided_alignment_rnn.sh | 4
-rw-r--r-- tests/training/features/guided-alignment/test_guided_alignment_transformer.sh | 2
-rw-r--r-- tests/training/features/guided-alignment/test_guided_alignment_transformer_sync.sh | 2
-rw-r--r-- tests/training/features/guided-alignment/transformer.expected | 10
-rw-r--r-- tests/training/features/mixed-ensembles/s2s_transf.expected | 10
-rw-r--r-- tests/training/features/mixed-ensembles/test_ensemble_of_different_s2s.sh | 2
-rw-r--r-- tests/training/features/mixed-ensembles/test_ensemble_of_s2s_and_transformer.sh | 2
-rw-r--r-- tests/training/features/mixed-ensembles/two_s2s.expected | 10
-rw-r--r-- tests/training/features/quantized-model/model_centers.expected | 98
-rw-r--r-- tests/training/features/quantized-model/quantized.expected | 20
-rw-r--r-- tests/training/features/quantized-model/test_quant_centers.sh | 2
-rw-r--r-- tests/training/features/quantized-model/test_quantmodel.sh | 2
-rw-r--r-- tests/training/features/quantized-model/test_quantmodel_log.sh | 2
-rw-r--r-- tests/training/features/quantized-model/test_quantmodel_with_bias.sh | 4
-rw-r--r-- tests/training/features/quantized-model/test_quantmodel_with_optimization.sh | 2
-rwxr-xr-x tests/training/features/quantized-model/update.sh | 7
-rw-r--r-- tests/training/features/right-left/rnn.expected | 20
-rw-r--r-- tests/training/features/right-left/test_right_left_rnn.sh | 2
-rw-r--r-- tests/training/features/right-left/test_right_left_transformer.sh | 2
-rw-r--r-- tests/training/features/right-left/transformer.expected | 20
-rw-r--r-- tests/training/models/lm/lm-transformer.expected | 10
-rw-r--r-- tests/training/models/lm/lm-transformer.scores.expected | 20
-rw-r--r-- tests/training/models/lm/lm.expected | 10
-rw-r--r-- tests/training/models/lm/lm.scores.expected | 20
-rw-r--r-- tests/training/models/lm/test_lm-transformer.sh | 2
-rw-r--r-- tests/training/models/lm/test_lm.sh | 2
-rw-r--r-- tests/training/models/multi-source/multi-s2s.expected | 10
-rw-r--r-- tests/training/models/multi-source/multi-transformer.expected | 10
-rw-r--r-- tests/training/models/multi-source/test_multi-s2s.sh | 2
-rw-r--r-- tests/training/models/multi-source/test_multi-transformer.sh | 2
-rw-r--r-- tests/training/models/nematus/encdec_depth.expected | 10
-rw-r--r-- tests/training/models/nematus/test_encdec_depth.sh | 4
-rw-r--r-- tests/training/models/nematus/test_wmt17_model.sh | 4
-rw-r--r-- tests/training/models/nematus/wmt17.expected | 10
-rw-r--r-- tests/training/models/transformer/test_transformer.sh | 2
-rw-r--r-- tests/training/models/transformer/transformer.expected | 20
-rw-r--r-- tests/training/multi-gpu/sync_sgd_1gpu.expected | 8
-rw-r--r-- tests/training/multi-gpu/sync_sgd_1gpu_expsmooth.expected | 8
-rw-r--r-- tests/training/multi-gpu/test_sync_sgd_1gpu.sh | 6
-rw-r--r-- tests/training/multi-gpu/test_sync_sgd_1gpu_expsmooth.sh | 6
-rw-r--r-- tests/training/restarting/sgd_2e.expected | 26
-rw-r--r-- tests/training/restarting/sgd_sync_2e.expected | 4
-rw-r--r-- tests/training/restarting/test_sgd_for_two_epochs.sh | 9
-rw-r--r-- tests/training/restarting/test_sgd_for_two_epochs_sync.sh | 9
-rw-r--r-- tests/training/restoring/corpus/finetune.expected | 30
-rw-r--r-- tests/training/restoring/corpus/test_finetune.sh | 3
-rw-r--r-- tests/training/restoring/exp-smoothing/test_expsmooth.sh | 8
-rw-r--r-- tests/training/restoring/exp-smoothing/test_expsmooth_s2s.sh | 8
-rw-r--r-- tests/training/restoring/exp-smoothing/test_expsmooth_sync.sh | 6
-rw-r--r-- tests/training/restoring/multi-gpu/test_adam_sync.sh | 6
-rw-r--r-- tests/training/restoring/multi-gpu/test_async.sh | 8
-rw-r--r-- tests/training/restoring/multi-gpu/test_sync.sh | 10
-rw-r--r-- tests/training/restoring/optimizer/adagrad.costs.expected | 20
-rw-r--r-- tests/training/restoring/optimizer/adagrad.gt.expected | 4
-rw-r--r-- tests/training/restoring/optimizer/adam.costs.expected | 20
-rw-r--r-- tests/training/restoring/optimizer/adam.mt.expected | 4
-rw-r--r-- tests/training/restoring/optimizer/adam.vt.expected | 4
-rw-r--r-- tests/training/restoring/optimizer/adam_load.expected | 10
-rw-r--r-- tests/training/restoring/optimizer/adam_sync.costs.expected | 20
-rw-r--r-- tests/training/restoring/optimizer/test_adagrad_params.sh | 10
-rw-r--r-- tests/training/restoring/optimizer/test_adam_params.sh | 8
-rw-r--r-- tests/training/restoring/optimizer/test_adam_params_async.sh | 12
-rw-r--r-- tests/training/restoring/optimizer/test_adam_params_sync.sh | 10
-rw-r--r-- tests/training/restoring/optimizer/test_loading_adam_params.sh | 9
-rw-r--r-- tests/training/restoring/validation/test_adding_validator_after_restart.sh | 2
-rw-r--r-- tests/training/restoring/validation/test_restoring_newbest_validators.sh | 4
-rw-r--r-- tests/training/restoring/validation/test_restoring_validation_lower_is_better.sh | 2
-rw-r--r-- tests/training/restoring/validation/test_valid_reset_stalled.sh | 4
-rwxr-xr-x tests/training/restoring/validation/update.sh | 6
-rw-r--r-- tests/training/restoring/validation/valid_add.expected | 30
-rw-r--r-- tests/training/restoring/validation/valid_lowisbet.expected | 14
-rw-r--r-- tests/training/restoring/validation/valid_newbest.expected | 40
-rw-r--r-- tests/training/restoring/validation/valid_reset_stalled.expected | 20
-rw-r--r-- tests/training/validation/final_batch.expected | 6
-rw-r--r-- tests/training/validation/final_epoch.expected | 6
-rw-r--r-- tests/training/validation/final_match.expected | 6
-rw-r--r-- tests/training/validation/test_final_validation_after_batches.sh | 4
-rw-r--r-- tests/training/validation/test_final_validation_after_batches_match.sh | 4
-rw-r--r-- tests/training/validation/test_final_validation_after_epochs.sh | 4
-rw-r--r-- tests/training/validation/test_translation_metric_with_empty_lines.sh | 4
-rw-r--r-- tests/training/validation/test_translation_script.sh | 4
-rw-r--r-- tests/training/validation/test_valid_script.sh | 2
-rw-r--r-- tests/training/validation/trans.expected | 4
-rw-r--r-- tests/training/validation/trans_empty_lines.expected | 10
-rw-r--r-- tests/training/validation/valid.expected | 8
152 files changed, 950 insertions, 839 deletions
diff --git a/tests/interface/input-tsv/restore_stdin.expected b/tests/interface/input-tsv/restore_stdin.expected
index b9cc6d4..d26f7ff 100644
--- a/tests/interface/input-tsv/restore_stdin.expected
+++ b/tests/interface/input-tsv/restore_stdin.expected
@@ -1,30 +1,30 @@
-Ep. 1 : Up. 2 : Sen. 32 : Cost 214.87408447
-Ep. 1 : Up. 4 : Sen. 64 : Cost 195.88232422
-Ep. 1 : Up. 6 : Sen. 96 : Cost 222.75996399
-Ep. 1 : Up. 8 : Sen. 128 : Cost 232.49481201
-Ep. 1 : Up. 10 : Sen. 160 : Cost 204.78642273
-Ep. 1 : Up. 12 : Sen. 192 : Cost 256.89501953
-Ep. 1 : Up. 14 : Sen. 224 : Cost 237.73818970
-Ep. 1 : Up. 16 : Sen. 256 : Cost 210.21063232
-Ep. 1 : Up. 18 : Sen. 288 : Cost 178.70904541
-Ep. 1 : Up. 20 : Sen. 320 : Cost 224.30038452
-Ep. 1 : Up. 22 : Sen. 352 : Cost 225.22837830
-Ep. 1 : Up. 24 : Sen. 384 : Cost 210.81533813
-Ep. 1 : Up. 26 : Sen. 416 : Cost 202.19320679
-Ep. 1 : Up. 28 : Sen. 448 : Cost 211.53353882
-Ep. 1 : Up. 30 : Sen. 480 : Cost 209.39002991
-Ep. 1 : Up. 32 : Sen. 512 : Cost 206.38954163
-Ep. 1 : Up. 34 : Sen. 544 : Cost 202.88201904
-Ep. 1 : Up. 36 : Sen. 576 : Cost 192.36555481
-Ep. 1 : Up. 38 : Sen. 608 : Cost 179.21670532
-Ep. 1 : Up. 40 : Sen. 640 : Cost 164.29644775
-Ep. 1 : Up. 42 : Sen. 672 : Cost 187.61584473
-Ep. 1 : Up. 44 : Sen. 704 : Cost 244.09938049
-Ep. 1 : Up. 46 : Sen. 736 : Cost 266.25546265
-Ep. 1 : Up. 48 : Sen. 768 : Cost 197.74813843
-Ep. 1 : Up. 50 : Sen. 800 : Cost 187.12585449
-Ep. 1 : Up. 52 : Sen. 832 : Cost 186.14714050
-Ep. 1 : Up. 54 : Sen. 864 : Cost 227.19046021
-Ep. 1 : Up. 56 : Sen. 896 : Cost 210.90580750
-Ep. 1 : Up. 58 : Sen. 928 : Cost 210.68801880
-Ep. 1 : Up. 60 : Sen. 960 : Cost 182.89875793
+Ep. 1 : Up. 2 : Sen. 32 : Cost 214.83363342
+Ep. 1 : Up. 4 : Sen. 64 : Cost 195.70648193
+Ep. 1 : Up. 6 : Sen. 96 : Cost 222.41781616
+Ep. 1 : Up. 8 : Sen. 128 : Cost 231.91462708
+Ep. 1 : Up. 10 : Sen. 160 : Cost 204.08346558
+Ep. 1 : Up. 12 : Sen. 192 : Cost 255.86239624
+Ep. 1 : Up. 14 : Sen. 224 : Cost 236.60090637
+Ep. 1 : Up. 16 : Sen. 256 : Cost 209.00881958
+Ep. 1 : Up. 18 : Sen. 288 : Cost 177.51702881
+Ep. 1 : Up. 20 : Sen. 320 : Cost 222.74383545
+Ep. 1 : Up. 22 : Sen. 352 : Cost 223.34017944
+Ep. 1 : Up. 24 : Sen. 384 : Cost 208.93505859
+Ep. 1 : Up. 26 : Sen. 416 : Cost 200.02706909
+Ep. 1 : Up. 28 : Sen. 448 : Cost 209.29515076
+Ep. 1 : Up. 30 : Sen. 480 : Cost 207.00128174
+Ep. 1 : Up. 32 : Sen. 512 : Cost 203.81817627
+Ep. 1 : Up. 34 : Sen. 544 : Cost 200.10937500
+Ep. 1 : Up. 36 : Sen. 576 : Cost 189.81176758
+Ep. 1 : Up. 38 : Sen. 608 : Cost 176.77787781
+Ep. 1 : Up. 40 : Sen. 640 : Cost 161.60902405
+Ep. 1 : Up. 42 : Sen. 672 : Cost 184.40527344
+Ep. 1 : Up. 44 : Sen. 704 : Cost 239.88012695
+Ep. 1 : Up. 46 : Sen. 736 : Cost 262.33227539
+Ep. 1 : Up. 48 : Sen. 768 : Cost 194.13323975
+Ep. 1 : Up. 50 : Sen. 800 : Cost 183.32736206
+Ep. 1 : Up. 52 : Sen. 832 : Cost 181.78253174
+Ep. 1 : Up. 54 : Sen. 864 : Cost 222.31034851
+Ep. 1 : Up. 56 : Sen. 896 : Cost 206.36886597
+Ep. 1 : Up. 58 : Sen. 928 : Cost 205.69429016
+Ep. 1 : Up. 60 : Sen. 960 : Cost 178.27331543
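With clipping disabled, every SGD update changes slightly, so each expected cost drifts by a small, non-uniform amount rather than a fixed offset. A hedged sketch of how two such `.expected` files could be compared with a relative tolerance instead of byte equality (the suite's actual comparison may differ; the file paths are illustrative):

```shell
# Hypothetical sketch: compare the Cost column of two .expected files with a
# 1% relative tolerance. The two files below hold the first lines of the
# old and new expected outputs from this diff.
cat > /tmp/old.expected <<'EOF'
Ep. 1 : Up. 2 : Sen. 32 : Cost 214.87408447
Ep. 1 : Up. 4 : Sen. 64 : Cost 195.88232422
EOF
cat > /tmp/new.expected <<'EOF'
Ep. 1 : Up. 2 : Sen. 32 : Cost 214.83363342
Ep. 1 : Up. 4 : Sen. 64 : Cost 195.70648193
EOF

# Join the files line by line; the cost is the last field of each half.
paste /tmp/old.expected /tmp/new.expected | awk '
    { a = $(NF/2); b = $NF;
      d = (a > b ? a - b : b - a);            # absolute difference
      if (d / a >= 0.01) { print "MISMATCH:", a, b; exit 1 } }
    END { print "within tolerance" }'
```

Here the costs differ by well under 0.1%, so a tolerance-based check would pass where a byte-level `diff` fails; the repository's `update_outputs.sh` scripts instead regenerate the expected files outright.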
diff --git a/tests/interface/input-tsv/test_tsv_train.sh b/tests/interface/input-tsv/test_tsv_train.sh
index 72b87d8..822b249 100644
--- a/tests/interface/input-tsv/test_tsv_train.sh
+++ b/tests/interface/input-tsv/test_tsv_train.sh
@@ -14,7 +14,7 @@ mkdir -p train
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train/model.npz --tsv -t train.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 10 --disp-freq 2 \
--log train.log
diff --git a/tests/interface/input-tsv/test_tsv_train_assume_stdin.sh b/tests/interface/input-tsv/test_tsv_train_assume_stdin.sh
index 87f8c7c..75a2537 100644
--- a/tests/interface/input-tsv/test_tsv_train_assume_stdin.sh
+++ b/tests/interface/input-tsv/test_tsv_train_assume_stdin.sh
@@ -14,7 +14,7 @@ mkdir -p train_stdin2
# Run marian command
cat train.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_stdin2/model.npz -t stdin -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 10 --disp-freq 2 \
--log train_stdin2.log
diff --git a/tests/interface/input-tsv/test_tsv_train_create_vocab_joint.sh b/tests/interface/input-tsv/test_tsv_train_create_vocab_joint.sh
index 88b5b62..7cad7d2 100644
--- a/tests/interface/input-tsv/test_tsv_train_create_vocab_joint.sh
+++ b/tests/interface/input-tsv/test_tsv_train_create_vocab_joint.sh
@@ -14,7 +14,7 @@ mkdir -p train_vocab
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_vocab/model.npz --tsv -t train.tsv -v train_vocab/vocab.spm train_vocab/vocab.spm --dim-vocabs 2000 2000 -T train_vocab \
--after-batches 20 --disp-freq 2 \
--log train_vocab.log
diff --git a/tests/interface/input-tsv/test_tsv_train_create_vocabs.sh b/tests/interface/input-tsv/test_tsv_train_create_vocabs.sh
index a23e7ac..0a3da3f 100644
--- a/tests/interface/input-tsv/test_tsv_train_create_vocabs.sh
+++ b/tests/interface/input-tsv/test_tsv_train_create_vocabs.sh
@@ -14,7 +14,7 @@ mkdir -p train_vocabs
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_vocabs/model.npz --tsv -t train.tsv -v train_vocabs/vocab.de.spm train_vocabs/vocab.en.spm --dim-vocabs 2000 2000 -T train_vocabs \
--after-batches 20 --disp-freq 2 \
--log train_vocabs.log
diff --git a/tests/interface/input-tsv/test_tsv_train_create_vocabs_yml.sh b/tests/interface/input-tsv/test_tsv_train_create_vocabs_yml.sh
index 26d20a6..b257ea6 100644
--- a/tests/interface/input-tsv/test_tsv_train_create_vocabs_yml.sh
+++ b/tests/interface/input-tsv/test_tsv_train_create_vocabs_yml.sh
@@ -14,7 +14,7 @@ mkdir -p train_vocabs_yml
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_vocabs_yml/model.npz --tsv -t train.bpe.tsv -v train_vocabs_yml/vocab.de.yml train_vocabs_yml/vocab.en.yml --dim-vocabs 2000 2000 -T train_vocabs_yml \
--after-batches 20 --disp-freq 2 \
--log train_vocabs_yml.log
diff --git a/tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh b/tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh
index 07f2eab..0f824ac 100644
--- a/tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh
+++ b/tests/interface/input-tsv/test_tsv_train_inputtypes_stdin.sh
@@ -14,7 +14,7 @@ mkdir -p train_intypes_stdin
# Run marian command
cat train.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_intypes_stdin/model.npz --tsv -t stdin --input-types sequence sequence -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 10 --disp-freq 2 \
--log train_intypes_stdin.log
diff --git a/tests/interface/input-tsv/test_tsv_train_mini_batch_fit.sh b/tests/interface/input-tsv/test_tsv_train_mini_batch_fit.sh
index 42afeac..401c7f9 100644
--- a/tests/interface/input-tsv/test_tsv_train_mini_batch_fit.sh
+++ b/tests/interface/input-tsv/test_tsv_train_mini_batch_fit.sh
@@ -14,7 +14,7 @@ mkdir -p train_fit
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --mini-batch-fit -w 500 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd --sync-sgd \
+ --cost-type ce-mean --mini-batch-fit -w 500 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd --sync-sgd --clip-norm 0 \
-m train_fit/model.npz --tsv -t train.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 20 --disp-freq 4 \
--log train_fit.log
diff --git a/tests/interface/input-tsv/test_tsv_train_mini_batch_fit_stdin.sh b/tests/interface/input-tsv/test_tsv_train_mini_batch_fit_stdin.sh
index 0b4ba42..81e9410 100644
--- a/tests/interface/input-tsv/test_tsv_train_mini_batch_fit_stdin.sh
+++ b/tests/interface/input-tsv/test_tsv_train_mini_batch_fit_stdin.sh
@@ -14,7 +14,7 @@ mkdir -p train_fit_stdin
# Run marian command
cat train.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --mini-batch-fit -w 500 --seed 2222 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --mini-batch-fit -w 500 --seed 2222 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd \
-m train_fit_stdin/model.npz --tsv -t stdin -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--disp-freq 4 --log train_fit_stdin.log
diff --git a/tests/interface/input-tsv/test_tsv_train_restore_from_stdin.sh b/tests/interface/input-tsv/test_tsv_train_restore_from_stdin.sh
index f8953ef..36b7685 100644
--- a/tests/interface/input-tsv/test_tsv_train_restore_from_stdin.sh
+++ b/tests/interface/input-tsv/test_tsv_train_restore_from_stdin.sh
@@ -11,7 +11,7 @@ test -e vocab.de.yml || $MRT_MARIAN/marian-vocab < train.bpe.de > vocab.de.yml
test -e vocab.en.yml || $MRT_MARIAN/marian-vocab < train.bpe.en > vocab.en.yml
# TODO: Weight decaying in Adam is disabled, because it gives unstable results on GPU
-extra_opts="--no-shuffle --seed 2222 --maxi-batch 1 --maxi-batch-sort none --mini-batch 16 --dim-emb 128 --dim-rnn 256 --disp-freq 2 --type s2s --sync-sgd --optimizer sgd --cost-type ce-mean"
+extra_opts="--no-shuffle --clip-norm 0 --seed 2222 --maxi-batch 1 --maxi-batch-sort none --mini-batch 16 --dim-emb 128 --dim-rnn 256 --disp-freq 2 --type s2s --sync-sgd --optimizer sgd --cost-type ce-mean"
# Step 1: Train a model in one go, up to the update no. 70, and save training logs
#$MRT_MARIAN/marian \
diff --git a/tests/interface/input-tsv/test_tsv_train_shuffle.sh b/tests/interface/input-tsv/test_tsv_train_shuffle.sh
index e5ca216..9dd4ac9 100644
--- a/tests/interface/input-tsv/test_tsv_train_shuffle.sh
+++ b/tests/interface/input-tsv/test_tsv_train_shuffle.sh
@@ -14,7 +14,7 @@ mkdir -p train_shuffle
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd --sync-sgd \
+ --cost-type ce-mean --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd --sync-sgd --clip-norm 0 \
-m train_shuffle/model.npz --tsv --tsv-fields 2 -t train.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 20 --disp-freq 4 \
--log train_shuffle.log
diff --git a/tests/interface/input-tsv/test_tsv_train_shuffle_in_ram.sh b/tests/interface/input-tsv/test_tsv_train_shuffle_in_ram.sh
index 8ce67c3..c6e0421 100644
--- a/tests/interface/input-tsv/test_tsv_train_shuffle_in_ram.sh
+++ b/tests/interface/input-tsv/test_tsv_train_shuffle_in_ram.sh
@@ -14,7 +14,7 @@ mkdir -p train_shuffle_ram
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --shuffle-in-ram --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd \
+ --cost-type ce-mean --shuffle-in-ram --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 10 --optimizer sgd \
-m train_shuffle_ram/model.npz --tsv --tsv-fields 2 -t train.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 20 --disp-freq 4 \
--log train_shuffle_ram.log
diff --git a/tests/interface/input-tsv/test_tsv_train_stdin.sh b/tests/interface/input-tsv/test_tsv_train_stdin.sh
index 2a77516..7050b36 100644
--- a/tests/interface/input-tsv/test_tsv_train_stdin.sh
+++ b/tests/interface/input-tsv/test_tsv_train_stdin.sh
@@ -14,7 +14,7 @@ mkdir -p train_stdin
# Run marian command
cat train.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_stdin/model.npz --tsv -t stdin -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 10 --disp-freq 2 \
--log train_stdin.log
diff --git a/tests/interface/input-tsv/test_tsv_train_stdin_2_epochs.sh b/tests/interface/input-tsv/test_tsv_train_stdin_2_epochs.sh
index af35c5c..654f019 100644
--- a/tests/interface/input-tsv/test_tsv_train_stdin_2_epochs.sh
+++ b/tests/interface/input-tsv/test_tsv_train_stdin_2_epochs.sh
@@ -14,7 +14,7 @@ mkdir -p train_stdin_2e
# Train for the 1st epoch
cat train.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_stdin_2e/model.npz --tsv -t stdin -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--disp-freq 5 \
--log train_stdin_2e.log
@@ -25,7 +25,7 @@ test -e train_stdin_2e.log
# Train for the 2nd epoch
cat train.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.002 \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.002 \
-m train_stdin_2e/model.npz --tsv -t stdin -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--disp-freq 5 \
--log train_stdin_2e.log
diff --git a/tests/interface/input-tsv/test_tsv_train_stdin_empty_fields.sh b/tests/interface/input-tsv/test_tsv_train_stdin_empty_fields.sh
index e9b0b33..d76ad80 100644
--- a/tests/interface/input-tsv/test_tsv_train_stdin_empty_fields.sh
+++ b/tests/interface/input-tsv/test_tsv_train_stdin_empty_fields.sh
@@ -20,7 +20,7 @@ paste train.{de,en} \
# Run marian command
cat train_empty_lines.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --max-length 200 \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --max-length 200 \
-m train_empty_lines/model.npz --tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-epochs 1 --disp-freq 2 \
--log train_empty_lines.log
diff --git a/tests/interface/input-tsv/test_tsv_train_stdin_lm.sh b/tests/interface/input-tsv/test_tsv_train_stdin_lm.sh
index e9a25c6..8ebd9ce 100644
--- a/tests/interface/input-tsv/test_tsv_train_stdin_lm.sh
+++ b/tests/interface/input-tsv/test_tsv_train_stdin_lm.sh
@@ -14,7 +14,7 @@ mkdir -p train_lm
# Run marian command
cat train.en | $MRT_MARIAN/marian --type lm \
- --cost-type ce-mean --no-shuffle --seed 4444 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 4444 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd \
-m train_lm/model.npz -t stdin -v $MRT_MODELS/rnn-spm/vocab.deen.spm \
--after-batches 10 --disp-freq 2 \
--log train_lm.log
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align.sh b/tests/interface/input-tsv/test_tsv_train_with_align.sh
index c101d51..a95ad9d 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align.sh
@@ -14,7 +14,7 @@ mkdir -p train_align
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.001 \
-m train_align/model.npz --tsv -t train2.de-en-aln.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 100 --disp-freq 4 \
--guided-alignment 2 --guided-alignment-weight 1.0 \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_and_weights.sh b/tests/interface/input-tsv/test_tsv_train_with_align_and_weights.sh
index b02488a..efa1793 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align_and_weights.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_and_weights.sh
@@ -14,7 +14,7 @@ mkdir -p train_align_weights
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 7777 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --no-shuffle --clip-norm 1 --seed 7777 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
-m train_align_weights/model.npz --tsv -t train2.de-w-aln-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 60 --disp-freq 4 \
--guided-alignment 2 --guided-alignment-weight 1.0 --data-weighting 1 \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh b/tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh
index 8653a67..c0e175e 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh
@@ -14,7 +14,7 @@ mkdir -p train_align_weights_intypes
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 7777 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --no-shuffle --clip-norm 1 --seed 7777 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
-m train_align_weights_intypes/model.npz --tsv -t train2.de-w-aln-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 60 --disp-freq 4 \
--input-types sequence weight alignment sequence --guided-alignment-weight 1.0 \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh b/tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh
index 55f9995..34c829f 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_pos0.sh
@@ -14,7 +14,7 @@ mkdir -p train_align0
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.001 \
-m train_align0/model.npz --tsv -t train2.aln-de-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 100 --disp-freq 4 \
--guided-alignment 0 --guided-alignment-weight 1.0 \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_shuffle.sh b/tests/interface/input-tsv/test_tsv_train_with_align_shuffle.sh
index d02f8dc..7a0503c 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align_shuffle.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_shuffle.sh
@@ -14,7 +14,7 @@ mkdir -p train_align_shuffle
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --seed 4444 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 --sync-sgd \
+ --cost-type ce-mean --clip-norm 1 --seed 4444 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 --sync-sgd \
-m train_align_shuffle/model.npz --tsv -t train2.aln-de-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 100 --disp-freq 4 \
--guided-alignment 0 --guided-alignment-weight 1.0 \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_shuffle_in_ram.sh b/tests/interface/input-tsv/test_tsv_train_with_align_shuffle_in_ram.sh
index 2c4dc1a..49e74de 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align_shuffle_in_ram.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_shuffle_in_ram.sh
@@ -14,7 +14,7 @@ mkdir -p train_align_shuffle_ram
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --shuffle-in-ram --seed 4444 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --shuffle-in-ram --clip-norm 1 --seed 4444 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
-m train_align_shuffle_ram/model.npz --tsv -t train2.aln-de-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 100 --disp-freq 4 \
--guided-alignment 0 --guided-alignment-weight 1.0 \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_align_stdin.sh b/tests/interface/input-tsv/test_tsv_train_with_align_stdin.sh
index b266f3d..6eb5386 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_align_stdin.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_align_stdin.sh
@@ -14,7 +14,7 @@ mkdir -p train_align_stdin
# Run marian command
cat train2.aln-de-en.tsv | $MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --no-shuffle --clip-norm 0 --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.001 \
-m train_align_stdin/model.npz -t stdin -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--disp-freq 4 \
--guided-alignment 0 --guided-alignment-weight 1.0 \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_weights.sh b/tests/interface/input-tsv/test_tsv_train_with_weights.sh
index 6528713..5b08134 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_weights.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_weights.sh
@@ -14,7 +14,7 @@ mkdir -p train_weights
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --no-shuffle --clip-norm 1 --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
-m train_weights/model.npz --tsv -t train2.de-en-w.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 100 --disp-freq 4 \
--data-weighting 2 --data-weighting-type sentence \
diff --git a/tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh b/tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh
index ee4522f..5065dd3 100644
--- a/tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh
+++ b/tests/interface/input-tsv/test_tsv_train_with_weights_pos0.sh
@@ -14,7 +14,7 @@ mkdir -p train_weights0
# Run marian command
$MRT_MARIAN/marian \
- --cost-type ce-mean --no-shuffle --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
+ --cost-type ce-mean --no-shuffle --clip-norm 1 --seed 5555 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --learn-rate 0.1 \
-m train_weights0/model.npz --tsv -t train2.w-de-en.tsv -v $MRT_MODELS/rnn-spm/vocab.deen.{spm,spm} \
--after-batches 100 --disp-freq 4 \
--data-weighting 0 --data-weighting-type sentence \
diff --git a/tests/interface/input-tsv/test_tsv_valid.sh b/tests/interface/input-tsv/test_tsv_valid.sh
index df72551..94c46f8 100644
--- a/tests/interface/input-tsv/test_tsv_valid.sh
+++ b/tests/interface/input-tsv/test_tsv_valid.sh
@@ -18,7 +18,7 @@ test -e valid/vocab.spm || cp $MRT_MODELS/rnn-spm/vocab.deen.spm valid/vocab.spm
# Train
$MRT_MARIAN/marian \
- --seed 2222 --no-shuffle --mini-batch 32 --maxi-batch 1 --optimizer sgd \
+ --seed 2222 --no-shuffle --clip-norm 1 --mini-batch 32 --maxi-batch 1 --optimizer sgd \
-m valid/model.npz --tsv -t train.tsv -v valid/vocab.{spm,spm} \
--disp-freq 20 --valid-freq 30 --after-batches 30 \
--valid-metrics cross-entropy translation --valid-translation-output valid.out \
diff --git a/tests/interface/input-tsv/train.expected b/tests/interface/input-tsv/train.expected
index 1cf2ecf..ed76575 100644
--- a/tests/interface/input-tsv/train.expected
+++ b/tests/interface/input-tsv/train.expected
@@ -1,5 +1,5 @@
-261.83258057
-257.78665161
-262.48895264
-253.13388062
-234.16917419
+261.58084106
+256.52697754
+260.31454468
+250.19743347
+230.69422913
diff --git a/tests/interface/input-tsv/train_align.expected b/tests/interface/input-tsv/train_align.expected
index 2de51f1..6d1c8b3 100644
--- a/tests/interface/input-tsv/train_align.expected
+++ b/tests/interface/input-tsv/train_align.expected
@@ -1,25 +1,25 @@
-272.57867432
-267.45211792
-245.10440063
-243.12583923
-254.65167236
-251.95730591
-259.63885498
-243.55068970
-236.45735168
-215.81468201
-212.01930237
-222.31713867
-220.01065063
-230.49443054
-219.36715698
-214.80720520
-198.64233398
-195.14010620
-205.54002380
-204.59991455
-215.78044128
-205.05665588
-201.84078979
-187.56027222
-184.19506836
+267.55108643
+238.90954590
+203.89498901
+194.40493774
+201.26391602
+196.71656799
+205.61053467
+190.75955200
+190.06002808
+175.82437134
+171.81805420
+183.65437317
+183.88174438
+195.11131287
+181.43255615
+182.07211304
+168.76817322
+167.46075439
+175.70928955
+179.71203613
+187.29899597
+175.05770874
+175.59832764
+165.63943481
+163.86834717
diff --git a/tests/interface/input-tsv/train_align_stdin.expected b/tests/interface/input-tsv/train_align_stdin.expected
index a468d22..e7a4610 100644
--- a/tests/interface/input-tsv/train_align_stdin.expected
+++ b/tests/interface/input-tsv/train_align_stdin.expected
@@ -1,7 +1,7 @@
-272.57867432
-267.45211792
-245.10440063
-243.12583923
-254.65167236
-251.95730591
-259.63885498
+267.55108643
+238.90954590
+203.89497375
+194.40493774
+201.26391602
+196.71656799
+205.61053467
diff --git a/tests/interface/input-tsv/train_empty_lines.expected b/tests/interface/input-tsv/train_empty_lines.expected
index bf4b72e..acc5403 100644
--- a/tests/interface/input-tsv/train_empty_lines.expected
+++ b/tests/interface/input-tsv/train_empty_lines.expected
@@ -1,16 +1,16 @@
-270.58209229
-212.28765869
-285.14007568
-286.86123657
-168.05017090
-281.66876221
-267.08026123
-269.00738525
-262.89984131
-224.43609619
-284.56796265
-281.34075928
-231.55950928
-286.07806396
-249.47781372
-264.77264404
+270.44882202
+211.79843140
+284.02545166
+285.28485107
+166.57519531
+279.17941284
+264.18832397
+265.68725586
+259.24093628
+221.06784058
+279.52667236
+276.05969238
+226.25883484
+279.85809326
+242.84822083
+257.72427368
diff --git a/tests/interface/input-tsv/train_fit.expected b/tests/interface/input-tsv/train_fit.expected
index 1c2d2eb..c577e77 100644
--- a/tests/interface/input-tsv/train_fit.expected
+++ b/tests/interface/input-tsv/train_fit.expected
@@ -1,5 +1,5 @@
-251.22476196
-252.45635986
-252.93251038
-258.67086792
-232.73229980
+250.19946289
+249.29103088
+247.88410950
+250.90098572
+222.61479187
diff --git a/tests/interface/input-tsv/train_fit_stdin.expected b/tests/interface/input-tsv/train_fit_stdin.expected
index b9ce675..8d06b37 100644
--- a/tests/interface/input-tsv/train_fit_stdin.expected
+++ b/tests/interface/input-tsv/train_fit_stdin.expected
@@ -1,3 +1,3 @@
-344.08602905
-173.01716614
-248.64839172
+342.83029175
+170.86856079
+244.25839233
diff --git a/tests/interface/input-tsv/train_lm.expected b/tests/interface/input-tsv/train_lm.expected
index c768644..fdbed00 100644
--- a/tests/interface/input-tsv/train_lm.expected
+++ b/tests/interface/input-tsv/train_lm.expected
@@ -1,5 +1,5 @@
-274.50836182
-274.87689209
-266.24481201
-259.36730957
-235.45114136
+274.36938477
+274.21545410
+265.08605957
+257.78823853
+233.71450806
diff --git a/tests/interface/input-tsv/train_shuffle.expected b/tests/interface/input-tsv/train_shuffle.expected
index 912fc9a..f9d2a72 100644
--- a/tests/interface/input-tsv/train_shuffle.expected
+++ b/tests/interface/input-tsv/train_shuffle.expected
@@ -1,5 +1,5 @@
-216.91867065
-261.72125244
-267.90841675
-266.43109131
-300.19824219
+216.37680054
+259.30813599
+263.75015259
+260.62384033
+291.27304077
diff --git a/tests/interface/input-tsv/train_stdin_2e.expected b/tests/interface/input-tsv/train_stdin_2e.expected
index 279c4ff..f80f79b 100644
--- a/tests/interface/input-tsv/train_stdin_2e.expected
+++ b/tests/interface/input-tsv/train_stdin_2e.expected
@@ -1,6 +1,6 @@
-262.65640259
-245.10810852
-248.28816223
-262.62274170
-245.00149536
-248.09956360
+261.66619873
+242.05940247
+243.13801575
+238.52865601
+317.93377686
+355.64865112
diff --git a/tests/interface/input-tsv/train_vocab.expected b/tests/interface/input-tsv/train_vocab.expected
index 7e5f15e..2196992 100644
--- a/tests/interface/input-tsv/train_vocab.expected
+++ b/tests/interface/input-tsv/train_vocab.expected
@@ -1,10 +1,10 @@
-221.76995850
-213.01387024
-217.94252014
-216.67770386
-209.55206299
-233.91926575
-206.07537842
-231.02885437
-206.19801331
-222.06900024
+221.08058167
+211.53454590
+216.04510498
+213.98677063
+207.70535278
+230.34349060
+201.30010986
+224.49655151
+201.09184265
+214.30133057
diff --git a/tests/interface/input-tsv/train_vocabs.expected b/tests/interface/input-tsv/train_vocabs.expected
index f003061..18492a4 100644
--- a/tests/interface/input-tsv/train_vocabs.expected
+++ b/tests/interface/input-tsv/train_vocabs.expected
@@ -1,10 +1,10 @@
-225.92840576
-209.46032715
-223.60330200
-207.56042480
-223.16098022
-198.65243530
-225.91799927
-209.45040894
-223.59326172
-207.55093384
+225.66580200
+208.27442932
+221.54891968
+204.83132935
+219.20014954
+194.64096069
+220.58721924
+203.36322021
+216.46800232
+199.75990295
diff --git a/tests/interface/input-tsv/train_vocabs_yml.expected b/tests/interface/input-tsv/train_vocabs_yml.expected
index becf7bf..c6b5967 100644
--- a/tests/interface/input-tsv/train_vocabs_yml.expected
+++ b/tests/interface/input-tsv/train_vocabs_yml.expected
@@ -1,10 +1,10 @@
-200.10346985
-199.77453613
-196.38256836
-185.44483948
-168.59661865
-192.99969482
-181.77833557
-177.61343384
-200.09066772
-199.76245117
+199.88481140
+198.79373169
+194.57501221
+183.03530884
+165.68594360
+189.14419556
+177.06405640
+172.28703308
+193.72500610
+192.91064453
diff --git a/tests/interface/input-tsv/update_outputs.sh b/tests/interface/input-tsv/update_outputs.sh
new file mode 100755
index 0000000..82436fb
--- /dev/null
+++ b/tests/interface/input-tsv/update_outputs.sh
@@ -0,0 +1,27 @@
+#!/bin/sh -x
+cp train.out train.expected
+cp train_stdin2.out train.expected
+cp train_vocab.out train_vocab.expected
+cp train_vocabs.out train_vocabs.expected
+cp train_vocabs.de.spm.out train_vocabs.de.spm.expected
+cp train_vocabs.en.spm.out train_vocabs.en.spm.expected
+cp train_vocabs_yml.out train_vocabs_yml.expected
+cp train_intypes_stdin.out train.expected
+cp train_fit.out train_fit.expected
+cp train_fit_stdin.out train_fit_stdin.expected
+cp restore_stdin.out restore_stdin.expected
+cp train_shuffle.out train_shuffle.expected
+cp train_shuffle_ram.out train_shuffle.expected
+cp train_stdin.out train.expected
+cp train_stdin_2e.out train_stdin_2e.expected
+cp train_empty_lines.out train_empty_lines.expected
+cp train_lm.out train_lm.expected
+cp train_align.out train_align.expected
+cp train_align_weights.out train_align_weights.expected
+cp train_align_weights_intypes.out train_align_weights.expected
+cp train_align0.out train_align.expected
+cp train_align_shuffle.out train_align_shuffle.expected
+cp train_align_shuffle_ram.out train_align_shuffle.expected
+cp train_align_stdin.out train_align_stdin.expected
+cp train_weights.out train_weights.expected
+cp train_weights0.out train_weights.expected
diff --git a/tests/training/basics/.gitignore b/tests/training/basics/.gitignore
index 7cdacd2..a169c9f 100644
--- a/tests/training/basics/.gitignore
+++ b/tests/training/basics/.gitignore
@@ -1,4 +1,5 @@
toy
+tiny
valid
trans
sqlite
diff --git a/tests/training/basics/test_tiny_vocab.sh b/tests/training/basics/test_tiny_vocab.sh
new file mode 100644
index 0000000..f14683b
--- /dev/null
+++ b/tests/training/basics/test_tiny_vocab.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+#####################################################################
+# SUMMARY: Run a basic training command with tiny vocabs
+# AUTHOR: snukky
+# TAGS: small-vocab
+#####################################################################
+
+# Exit on error
+set -e
+
+# Test code goes here
+mkdir -p tiny
+rm -f tiny/* tiny.log
+
+$MRT_MARIAN/marian \
+ --seed 1111 --dim-emb 256 --dim-rnn 512 --no-shuffle --clip-norm 0 \
+ -m tiny/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{de,en} -v tiny/vocab.de.yml tiny/vocab.en.yml \
+ --log tiny.log --disp-freq 5 -e 5
+
+test -e tiny/vocab.en.yml
+test -e tiny/vocab.de.yml
+test -e tiny/model.npz
+test -e tiny/model.npz.yml
+test -e tiny/model.npz.amun.yml
+
+cat tiny.log | $MRT_TOOLS/extract-costs.sh > tiny.out
+$MRT_TOOLS/diff-nums.py tiny.out tiny.expected -p 0.1 -o tiny.diff
+
+# Exit with success code
+exit 0
diff --git a/tests/training/basics/test_toy_vocab.sh b/tests/training/basics/test_toy_vocab.sh
deleted file mode 100644
index 671843f..0000000
--- a/tests/training/basics/test_toy_vocab.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-#####################################################################
-# SUMMARY: Run a basic training command with toy vocabs
-# AUTHOR: snukky
-# TAGS: small-vocab
-#####################################################################
-
-# Exit on error
-set -e
-
-# Test code goes here
-mkdir -p toy
-rm -f toy/* toy.log
-
-$MRT_MARIAN/marian \
- --seed 1111 --dim-emb 256 --dim-rnn 512 --no-shuffle \
- -m toy/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{de,en} -v toy/vocab.de.yml toy/vocab.en.yml \
- --log toy.log --disp-freq 5 -e 5
-
-test -e toy/vocab.en.yml
-test -e toy/vocab.de.yml
-test -e toy/model.npz
-test -e toy/model.npz.yml
-test -e toy/model.npz.amun.yml
-
-cat toy.log | $MRT_TOOLS/extract-costs.sh > toy.out
-$MRT_TOOLS/diff-nums.py toy.out toy.expected -p 0.1 -o toy.diff
-
-# Exit with success code
-exit 0
diff --git a/tests/training/basics/tiny.expected b/tests/training/basics/tiny.expected
new file mode 100644
index 0000000..f6c2caf
--- /dev/null
+++ b/tests/training/basics/tiny.expected
@@ -0,0 +1,44 @@
+8.78632832
+8.75309849
+8.69854641
+8.60049725
+8.41462326
+8.04460907
+7.52984953
+7.11023378
+7.05034065
+7.12709856
+6.98178005
+6.85693312
+6.69474936
+6.58481455
+6.51727343
+6.49125957
+6.21844482
+6.46901369
+6.53361320
+6.46509838
+6.43254900
+6.35190487
+6.26485729
+6.21732950
+6.21957254
+5.91266489
+6.35146904
+6.36572838
+6.33053923
+6.28441715
+6.20286036
+6.15570545
+6.08000135
+6.03157282
+5.67149544
+6.35678244
+6.31723213
+6.26349401
+6.21279621
+6.12982273
+6.08621264
+5.94192123
+5.94551659
+5.48049164
diff --git a/tests/training/basics/toy.expected b/tests/training/basics/toy.expected
deleted file mode 100644
index f134a62..0000000
--- a/tests/training/basics/toy.expected
+++ /dev/null
@@ -1,44 +0,0 @@
-8.78630924
-8.75282860
-8.69768810
-8.60030174
-8.42331791
-8.08456516
-7.58919859
-7.15421867
-7.07388210
-7.15911722
-7.00602388
-6.87553406
-6.70337963
-6.58344078
-6.49999046
-6.45548153
-6.15827656
-6.46891832
-6.53488016
-6.44239426
-6.40002155
-6.31497908
-6.22505951
-6.17120123
-6.16692400
-5.82793045
-6.37700939
-6.38672018
-6.30941343
-6.23907804
-6.14631748
-6.09236145
-5.99969482
-5.93795681
-5.50292015
-7.16254091
-6.92708254
-6.59395170
-6.40627527
-6.22969198
-6.13137770
-5.96749878
-5.96595860
-5.42677450
diff --git a/tests/training/cost-functions/ce-mean.expected b/tests/training/cost-functions/ce-mean.expected
index 99dcb2e..d2284ad 100644
--- a/tests/training/cost-functions/ce-mean.expected
+++ b/tests/training/cost-functions/ce-mean.expected
@@ -1,8 +1,8 @@
-150.17980957
-263.68411255
-100.49322510
-253.63926697
-274.96899414
-145.67076111
-207.88955688
-245.54043579
+150.17977905
+263.68469238
+100.50036621
+253.64115906
+274.95046997
+145.69451904
+207.82330322
+245.20465088
diff --git a/tests/training/cost-functions/ce-sum.expected b/tests/training/cost-functions/ce-sum.expected
index 211f95e..bf94e75 100644
--- a/tests/training/cost-functions/ce-sum.expected
+++ b/tests/training/cost-functions/ce-sum.expected
@@ -1,8 +1,8 @@
-9616.72363281
-16924.73828125
-6485.94677734
-16367.00390625
-17824.65820312
-9517.77148438
-13688.55371094
-16464.12500000
+9611.53320312
+16874.39843750
+6437.13916016
+16230.42187500
+17589.32421875
+9352.28906250
+13412.98730469
+16044.14746094
diff --git a/tests/training/cost-functions/perplexity.expected b/tests/training/cost-functions/perplexity.expected
index 454d690..e72030a 100644
--- a/tests/training/cost-functions/perplexity.expected
+++ b/tests/training/cost-functions/perplexity.expected
@@ -1,8 +1,8 @@
-4855.37011719
-4845.00927734
-4863.23437500
-4840.18750000
-4846.01660156
-4849.56250000
-4848.35107422
-4839.06152344
+4833.17675781
+4724.25634766
+4562.26855469
+4509.33154297
+4332.39013672
+4184.27783203
+4086.94750977
+3897.33496094
diff --git a/tests/training/cost-functions/test_ce-mean-words.sh b/tests/training/cost-functions/test_ce-mean-words.sh
index 7c875ff..b9bb8f5 100644
--- a/tests/training/cost-functions/test_ce-mean-words.sh
+++ b/tests/training/cost-functions/test_ce-mean-words.sh
@@ -14,7 +14,7 @@ rm -rf ce-mean-words ce-mean-words.log
mkdir -p ce-mean-words
$MRT_MARIAN/marian \
- --cost-type ce-mean-words \
+ --cost-type ce-mean-words --clip-norm 0 \
--seed 9999 --sync-sgd \
-m ce-mean-words/model.npz -t $MRT_DATA/train.max50.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 2 --after-epochs 1 \
diff --git a/tests/training/cost-functions/test_ce-mean.sh b/tests/training/cost-functions/test_ce-mean.sh
index c4109b1..e1b08d2 100644
--- a/tests/training/cost-functions/test_ce-mean.sh
+++ b/tests/training/cost-functions/test_ce-mean.sh
@@ -14,7 +14,7 @@ rm -rf ce-mean ce-mean.log
mkdir -p ce-mean
$MRT_MARIAN/marian \
- --cost-type ce-mean \
+ --cost-type ce-mean --clip-norm 0 \
--seed 9999 --sync-sgd \
-m ce-mean/model.npz -t $MRT_DATA/train.max50.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 2 --after-epochs 1 \
diff --git a/tests/training/cost-functions/test_ce-sum.sh b/tests/training/cost-functions/test_ce-sum.sh
index f22f137..a3cfb04 100644
--- a/tests/training/cost-functions/test_ce-sum.sh
+++ b/tests/training/cost-functions/test_ce-sum.sh
@@ -14,7 +14,7 @@ rm -rf ce-sum ce-sum.log
mkdir -p ce-sum
$MRT_MARIAN/marian \
- --cost-type ce-sum --disp-label-counts false \
+ --cost-type ce-sum --disp-label-counts false --clip-norm 0 \
--seed 9999 --optimizer sgd --sync-sgd \
-m ce-sum/model.npz -t $MRT_DATA/train.max50.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 2 --after-epochs 1 \
diff --git a/tests/training/cost-functions/test_perplexity.sh b/tests/training/cost-functions/test_perplexity.sh
index 7a790fd..83dfb36 100644
--- a/tests/training/cost-functions/test_perplexity.sh
+++ b/tests/training/cost-functions/test_perplexity.sh
@@ -14,7 +14,7 @@ rm -rf perplexity perplexity.log
mkdir -p perplexity
$MRT_MARIAN/marian \
- --cost-type perplexity \
+ --cost-type perplexity --clip-norm 0 \
--seed 9999 --optimizer sgd --sync-sgd \
-m perplexity/model.npz -t $MRT_DATA/train.max50.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 2 --after-epochs 1 \
diff --git a/tests/training/features/data-weighting/maxibatch.expected b/tests/training/features/data-weighting/maxibatch.expected
index 7c0001c..ab99d76 100644
--- a/tests/training/features/data-weighting/maxibatch.expected
+++ b/tests/training/features/data-weighting/maxibatch.expected
@@ -1,10 +1,10 @@
-6924.51171875
-5225.91162109
-4136.33691406
-3092.30273438
-2086.31420898
-5388.93750000
-5653.18310547
-4460.88183594
-3398.95581055
-2398.78735352
+6904.40136719
+5177.73974609
+4077.06103516
+3035.06811523
+2036.97106934
+5234.54150391
+5421.84570312
+4205.61328125
+3152.19384766
+2185.01635742
diff --git a/tests/training/features/data-weighting/sqlite.expected b/tests/training/features/data-weighting/sqlite.expected
index d2f0f31..05298eb 100644
--- a/tests/training/features/data-weighting/sqlite.expected
+++ b/tests/training/features/data-weighting/sqlite.expected
@@ -1,100 +1,100 @@
-145.22386169
-408.22799683
-1194.63964844
-233.55360413
-1430.38696289
-126.14705658
-378.98550415
-96.71858215
-757.48663330
-330.56832886
-1604.02294922
-214.05000305
-438.19305420
-96.71667480
-466.88586426
-194.51495361
-1197.85363770
-146.07209778
-933.41638184
-389.17184448
-933.05151367
-622.52008057
-874.42907715
-825.77954102
-437.69128418
-155.24894714
-496.18453979
-232.95338440
-350.40795898
-321.22961426
-610.87152100
-670.83068848
-1778.17529297
-241.61517334
-1370.66113281
-136.27859497
-1510.65393066
-184.96159363
-815.58801270
-136.21365356
-729.38146973
-174.92094421
-408.49716187
-213.90168762
-233.55191040
-136.20471191
-290.47631836
-165.94480896
-584.11578369
-349.23815918
-958.81738281
-253.41488647
-437.03112793
-330.26031494
-435.45944214
-154.90957642
-521.61523438
-563.37170410
-496.03311157
-175.29434204
-438.16888428
-135.96989441
-496.29870605
-87.74061584
-436.65737915
-214.36607361
-58.15562820
-378.97769165
-348.81768799
-48.82063675
-993.13079834
-292.51773071
-524.27105713
-620.84045410
-1228.60827637
-535.02069092
-846.56628418
-291.05285645
-1658.58984375
-145.76992798
-1809.52172852
-680.63385010
-875.07159424
-78.35388947
-1049.29785156
-165.48069763
-611.14044189
-87.67588806
-438.09893799
-126.03882599
-262.80325317
-184.69609070
-262.46411133
-213.61804199
-1341.65759277
-106.89002228
-174.81327820
-116.23052979
-1165.48278809
-612.53643799
+145.22384644
+408.21359253
+1194.58532715
+233.50500488
+1429.99536133
+126.10730743
+378.85754395
+96.65737915
+757.20660400
+330.41281128
+1603.38867188
+213.94171143
+438.01345825
+96.60343933
+466.71673584
+194.41984558
+1197.37438965
+145.95291138
+932.55432129
+388.84637451
+932.01428223
+622.00317383
+873.45208740
+824.53771973
+437.26394653
+155.06787109
+495.64407349
+232.61149597
+349.86016846
+320.80847168
+610.00537109
+670.07312012
+1775.15356445
+241.20024109
+1368.22827148
+135.98722839
+1508.71228027
+184.60949707
+814.25140381
+135.82812500
+728.21057129
+174.76170349
+408.09228516
+213.54101562
+233.23889160
+136.04132080
+290.16552734
+165.65209961
+583.17614746
+348.41030884
+956.42248535
+252.87858582
+435.55230713
+329.51501465
+434.74572754
+154.38674927
+520.62304688
+562.26965332
+494.76245117
+174.66748047
+436.70803833
+135.57254028
+494.28051758
+87.53215790
+435.90051270
+213.83657837
+57.93138885
+377.68255615
+347.87652588
+48.66077423
+989.79504395
+291.43624878
+522.76562500
+619.18884277
+1225.72314453
+533.32568359
+844.25756836
+289.89556885
+1653.40588379
+144.93725586
+1802.95410156
+678.34832764
+870.25048828
+78.17111969
+1046.43676758
+164.60942078
+609.03454590
+87.29292297
+435.67877197
+125.71372223
+261.20431519
+184.11248779
+260.97955322
+212.86184692
+1336.96362305
+106.14862061
+173.34860229
+115.57688904
+1160.16357422
+609.54388428
diff --git a/tests/training/features/data-weighting/sqlite_word.expected b/tests/training/features/data-weighting/sqlite_word.expected
index fb557e0..2271c82 100644
--- a/tests/training/features/data-weighting/sqlite_word.expected
+++ b/tests/training/features/data-weighting/sqlite_word.expected
@@ -1,14 +1,14 @@
-853.40081787
-710.09143066
-610.75262451
-526.56585693
-439.88232422
-345.80377197
-234.99189758
-606.08709717
-734.31378174
-627.71948242
-544.36926270
-456.04959106
-364.86871338
-264.01800537
+846.28295898
+683.08270264
+546.49383545
+436.32812500
+349.99142456
+268.15167236
+178.71995544
+478.48889160
+563.18597412
+471.57501221
+402.95126343
+333.62921143
+264.42492676
+190.03326416
diff --git a/tests/training/features/data-weighting/test_maxi_batches_with_sentence_weights.sh b/tests/training/features/data-weighting/test_maxi_batches_with_sentence_weights.sh
index 58e9e82..bec6015 100644
--- a/tests/training/features/data-weighting/test_maxi_batches_with_sentence_weights.sh
+++ b/tests/training/features/data-weighting/test_maxi_batches_with_sentence_weights.sh
@@ -16,7 +16,7 @@ test -e vocab.de.yml || $MRT_MARIAN/marian-vocab < $MRT_DATA/europarl.de-en/corp
test -e vocab.en.yml || $MRT_MARIAN/marian-vocab < $MRT_DATA/europarl.de-en/corpus.bpe.en > vocab.en.yml
$MRT_MARIAN/marian \
- --seed 3333 --no-shuffle --dim-emb 128 --dim-rnn 256 --optimizer sgd \
+ --seed 3333 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --optimizer sgd \
-m maxibatch/model.npz -t train.1k.{de,en} -v vocab.{de,en}.yml \
--log maxibatch.log --disp-freq 10 --after-batches 100 --mini-batch 16 --cost-type ce-sum --disp-label-counts false \
--data-weighting train.1k.inc.txt --data-weighting-type sentence
diff --git a/tests/training/features/data-weighting/test_maxi_batches_with_word_weights.sh b/tests/training/features/data-weighting/test_maxi_batches_with_word_weights.sh
index 5cb5592..55bf916 100644
--- a/tests/training/features/data-weighting/test_maxi_batches_with_word_weights.sh
+++ b/tests/training/features/data-weighting/test_maxi_batches_with_word_weights.sh
@@ -16,7 +16,7 @@ test -e vocab.de.yml || $MRT_MARIAN/marian-vocab < $MRT_DATA/europarl.de-en/corp
test -e vocab.en.yml || $MRT_MARIAN/marian-vocab < $MRT_DATA/europarl.de-en/corpus.bpe.en > vocab.en.yml
$MRT_MARIAN/marian \
- --seed 6666 --no-shuffle --dim-emb 128 --dim-rnn 256 --optimizer sgd \
+ --seed 6666 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --optimizer sgd \
-m word_maxibatch/model.npz -t train.1k.{de,en} -v vocab.{de,en}.yml \
--log word_maxibatch.log --disp-freq 10 --after-batches 100 --mini-batch 16 --cost-type ce-mean \
--data-weighting train.1k.wordinc.txt --data-weighting-type word
diff --git a/tests/training/features/data-weighting/test_sentence_weighting_sqlite.sh b/tests/training/features/data-weighting/test_sentence_weighting_sqlite.sh
index 2295d64..9c85fac 100644
--- a/tests/training/features/data-weighting/test_sentence_weighting_sqlite.sh
+++ b/tests/training/features/data-weighting/test_sentence_weighting_sqlite.sh
@@ -13,7 +13,7 @@ rm -rf sqlite sqlite.log
mkdir -p sqlite
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle --maxi-batch 1 --maxi-batch-sort none --max-length 100 --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
+ --seed 1111 --clip-norm 0 --no-shuffle --maxi-batch 1 --maxi-batch-sort none --max-length 100 --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
-m sqlite/model.npz -t train.1k.{de,en} -v vocab.{de,en}.yml \
--log sqlite.log --disp-freq 1 --after-batches 100 --mini-batch 1 \
--data-weighting train.1k.weights.txt --data-weighting-type sentence --sqlite sqlite/corpus.sqlite3
diff --git a/tests/training/features/data-weighting/test_word_weighting_sqlite.sh b/tests/training/features/data-weighting/test_word_weighting_sqlite.sh
index fe399c7..4518885 100644
--- a/tests/training/features/data-weighting/test_word_weighting_sqlite.sh
+++ b/tests/training/features/data-weighting/test_word_weighting_sqlite.sh
@@ -15,7 +15,7 @@ mkdir -p sqlite_word
cat $MRT_DATA/europarl.de-en/toy.bpe.en | sed -r 's/[^ ]+/2/g' > sqlite_word.weights.txt
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
+ --seed 1111 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
-m sqlite_word/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{de,en} -v vocab.{de,en}.yml \
--log sqlite_word.log --disp-freq 5 -e 2 --mini-batch-fit -w 500 \
--data-weighting sqlite_word.weights.txt --data-weighting-type word --sqlite sqlite_word/corpus.sqlite3
diff --git a/tests/training/features/data-weighting/test_word_weighting_with_eos.sh b/tests/training/features/data-weighting/test_word_weighting_with_eos.sh
index c3d7b93..2b7c3dd 100644
--- a/tests/training/features/data-weighting/test_word_weighting_with_eos.sh
+++ b/tests/training/features/data-weighting/test_word_weighting_with_eos.sh
@@ -17,7 +17,7 @@ cat $MRT_DATA/europarl.de-en/toy.bpe.en | sed -r -e 's/[^ ]+/2/g' -e 's/$/ 2/' >
# Train
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
+ --seed 1111 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
-m word_eos/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{de,en} -v vocab.{de,en}.yml \
--log word_eos.log --disp-freq 5 -e 2 \
--data-weighting word_eos.weights.txt --data-weighting-type word
diff --git a/tests/training/features/data-weighting/test_word_weighting_with_twos.sh b/tests/training/features/data-weighting/test_word_weighting_with_twos.sh
index 0660501..b514b28 100644
--- a/tests/training/features/data-weighting/test_word_weighting_with_twos.sh
+++ b/tests/training/features/data-weighting/test_word_weighting_with_twos.sh
@@ -18,7 +18,7 @@ cat $MRT_DATA/europarl.de-en/toy.bpe.en | sed -r 's/[^ ]+/2/g' > word_twos.weigh
# Train with word weighting
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
+ --seed 1111 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
-m word_twos/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{de,en} -v vocab.{de,en}.yml \
--log word_twos.log --disp-freq 5 -e 2 \
--data-weighting word_twos.weights.txt --data-weighting-type word
@@ -41,7 +41,7 @@ echo "data-weighting-type: word" >> word_twos.config.yml
# Train with word weighting
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
+ --seed 1111 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
-m word_twos_cfg/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{de,en} -v vocab.{de,en}.yml \
--log word_twos_cfg.log --disp-freq 5 -e 2 \
-c word_twos.config.yml
diff --git a/tests/training/features/data-weighting/test_word_weighting_with_twos_sync.sh b/tests/training/features/data-weighting/test_word_weighting_with_twos_sync.sh
index 675ae8b..bf6d753 100644
--- a/tests/training/features/data-weighting/test_word_weighting_with_twos_sync.sh
+++ b/tests/training/features/data-weighting/test_word_weighting_with_twos_sync.sh
@@ -18,7 +18,7 @@ cat $MRT_DATA/europarl.de-en/toy.bpe.en | sed -r 's/[^ ]+/2/g' > word_twos_sync.
# Train with word weighting
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
+ --seed 1111 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --optimizer sgd --cost-type ce-mean \
-m word_twos_sync/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{de,en} -v vocab.{de,en}.yml --sync-sgd \
--log word_twos_sync.log --disp-freq 5 -e 2 \
--data-weighting word_twos_sync.weights.txt --data-weighting-type word
diff --git a/tests/training/features/data-weighting/word_eos.expected b/tests/training/features/data-weighting/word_eos.expected
index a4ec027..da2dc29 100644
--- a/tests/training/features/data-weighting/word_eos.expected
+++ b/tests/training/features/data-weighting/word_eos.expected
@@ -1,17 +1,17 @@
-Ep. 1 : Up. 5 : Sen. 320 : Cost 856.40283203
-Ep. 1 : Up. 10 : Sen. 640 : Cost 705.00976562
-Ep. 1 : Up. 15 : Sen. 960 : Cost 604.22814941
-Ep. 1 : Up. 20 : Sen. 1,280 : Cost 518.43249512
-Ep. 1 : Up. 25 : Sen. 1,600 : Cost 442.49536133
-Ep. 1 : Up. 30 : Sen. 1,920 : Cost 367.35723877
-Ep. 1 : Up. 35 : Sen. 2,240 : Cost 301.55618286
-Ep. 1 : Up. 40 : Sen. 2,560 : Cost 230.20394897
-Ep. 2 : Up. 45 : Sen. 64 : Cost 340.81048584
-Ep. 2 : Up. 50 : Sen. 384 : Cost 818.84265137
-Ep. 2 : Up. 55 : Sen. 704 : Cost 681.78875732
-Ep. 2 : Up. 60 : Sen. 1,024 : Cost 587.13653564
-Ep. 2 : Up. 65 : Sen. 1,344 : Cost 501.12982178
-Ep. 2 : Up. 70 : Sen. 1,664 : Cost 427.36920166
-Ep. 2 : Up. 75 : Sen. 1,984 : Cost 354.46206665
-Ep. 2 : Up. 80 : Sen. 2,304 : Cost 287.68417358
-Ep. 2 : Up. 85 : Sen. 2,624 : Cost 212.97563171
+Ep. 1 : Up. 5 : Sen. 320 : Cost 848.23455811
+Ep. 1 : Up. 10 : Sen. 640 : Cost 672.56451416
+Ep. 1 : Up. 15 : Sen. 960 : Cost 525.11682129
+Ep. 1 : Up. 20 : Sen. 1,280 : Cost 418.14608765
+Ep. 1 : Up. 25 : Sen. 1,600 : Cost 345.36917114
+Ep. 1 : Up. 30 : Sen. 1,920 : Cost 280.52749634
+Ep. 1 : Up. 35 : Sen. 2,240 : Cost 227.94680786
+Ep. 1 : Up. 40 : Sen. 2,560 : Cost 167.04818726
+Ep. 2 : Up. 45 : Sen. 64 : Cost 262.95532227
+Ep. 2 : Up. 50 : Sen. 384 : Cost 660.01922607
+Ep. 2 : Up. 55 : Sen. 704 : Cost 521.61163330
+Ep. 2 : Up. 60 : Sen. 1,024 : Cost 435.67529297
+Ep. 2 : Up. 65 : Sen. 1,344 : Cost 364.16458130
+Ep. 2 : Up. 70 : Sen. 1,664 : Cost 305.08660889
+Ep. 2 : Up. 75 : Sen. 1,984 : Cost 250.36157227
+Ep. 2 : Up. 80 : Sen. 2,304 : Cost 204.07473755
+Ep. 2 : Up. 85 : Sen. 2,624 : Cost 142.30810547
diff --git a/tests/training/features/data-weighting/word_maxibatch.expected b/tests/training/features/data-weighting/word_maxibatch.expected
index 542de08..c71c457 100644
--- a/tests/training/features/data-weighting/word_maxibatch.expected
+++ b/tests/training/features/data-weighting/word_maxibatch.expected
@@ -1,10 +1,10 @@
-493.42471313
-355.55953979
-272.83404541
-199.24537659
-131.18077087
-386.23254395
-388.55151367
-296.93032837
-220.35517883
-152.07803345
+491.56161499
+351.35723877
+267.88531494
+194.60379028
+127.38488770
+371.88699341
+366.62310791
+272.43316650
+197.15826416
+132.87835693
diff --git a/tests/training/features/data-weighting/word_twos.expected b/tests/training/features/data-weighting/word_twos.expected
index cc235d0..95d13cc 100644
--- a/tests/training/features/data-weighting/word_twos.expected
+++ b/tests/training/features/data-weighting/word_twos.expected
@@ -1,17 +1,17 @@
-Ep. 1 : Up. 5 : Sen. 320 : Cost 846.69714355 :
-Ep. 1 : Up. 10 : Sen. 640 : Cost 695.30053711 :
-Ep. 1 : Up. 15 : Sen. 960 : Cost 594.51928711 :
-Ep. 1 : Up. 20 : Sen. 1,280 : Cost 508.72247314 :
-Ep. 1 : Up. 25 : Sen. 1,600 : Cost 432.78329468 :
-Ep. 1 : Up. 30 : Sen. 1,920 : Cost 357.64947510 :
-Ep. 1 : Up. 35 : Sen. 2,240 : Cost 291.84161377 :
-Ep. 1 : Up. 40 : Sen. 2,560 : Cost 220.49028015 :
-Ep. 2 : Up. 45 : Sen. 64 : Cost 331.08535767 :
-Ep. 2 : Up. 50 : Sen. 384 : Cost 809.13928223 :
-Ep. 2 : Up. 55 : Sen. 704 : Cost 672.08361816 :
-Ep. 2 : Up. 60 : Sen. 1,024 : Cost 577.43341064 :
-Ep. 2 : Up. 65 : Sen. 1,344 : Cost 491.42279053 :
-Ep. 2 : Up. 70 : Sen. 1,664 : Cost 417.66470337 :
-Ep. 2 : Up. 75 : Sen. 1,984 : Cost 344.76025391 :
-Ep. 2 : Up. 80 : Sen. 2,304 : Cost 277.97634888 :
-Ep. 2 : Up. 85 : Sen. 2,624 : Cost 203.26664734 :
+Ep. 1 : Up. 5 : Sen. 320 : Cost 838.97186279 :
+Ep. 1 : Up. 10 : Sen. 640 : Cost 665.44097900 :
+Ep. 1 : Up. 15 : Sen. 960 : Cost 523.22821045 :
+Ep. 1 : Up. 20 : Sen. 1,280 : Cost 417.61639404 :
+Ep. 1 : Up. 25 : Sen. 1,600 : Cost 343.39797974 :
+Ep. 1 : Up. 30 : Sen. 1,920 : Cost 278.35540771 :
+Ep. 1 : Up. 35 : Sen. 2,240 : Cost 225.92178345 :
+Ep. 1 : Up. 40 : Sen. 2,560 : Cost 165.37797546 :
+Ep. 2 : Up. 45 : Sen. 64 : Cost 257.18948364 :
+Ep. 2 : Up. 50 : Sen. 384 : Cost 635.78594971 :
+Ep. 2 : Up. 55 : Sen. 704 : Cost 507.77557373 :
+Ep. 2 : Up. 60 : Sen. 1,024 : Cost 431.42156982 :
+Ep. 2 : Up. 65 : Sen. 1,344 : Cost 361.39825439 :
+Ep. 2 : Up. 70 : Sen. 1,664 : Cost 302.86456299 :
+Ep. 2 : Up. 75 : Sen. 1,984 : Cost 248.74520874 :
+Ep. 2 : Up. 80 : Sen. 2,304 : Cost 203.10728455 :
+Ep. 2 : Up. 85 : Sen. 2,624 : Cost 141.87115479 :
diff --git a/tests/training/features/data-weighting/word_twos_sync.expected b/tests/training/features/data-weighting/word_twos_sync.expected
index df4c0d5..f199178 100644
--- a/tests/training/features/data-weighting/word_twos_sync.expected
+++ b/tests/training/features/data-weighting/word_twos_sync.expected
@@ -1,17 +1,17 @@
-Ep. 1 : Up. 5 : Sen. 320 : Cost 846.69714355 :
-Ep. 1 : Up. 10 : Sen. 640 : Cost 695.30053711 :
-Ep. 1 : Up. 15 : Sen. 960 : Cost 594.51928711 :
-Ep. 1 : Up. 20 : Sen. 1,280 : Cost 508.72241211 :
-Ep. 1 : Up. 25 : Sen. 1,600 : Cost 432.78320312 :
-Ep. 1 : Up. 30 : Sen. 1,920 : Cost 357.64950562 :
-Ep. 1 : Up. 35 : Sen. 2,240 : Cost 291.84161377 :
-Ep. 1 : Up. 40 : Sen. 2,560 : Cost 220.49028015 :
-Ep. 2 : Up. 45 : Sen. 64 : Cost 331.08532715 :
-Ep. 2 : Up. 50 : Sen. 384 : Cost 809.13928223 :
-Ep. 2 : Up. 55 : Sen. 704 : Cost 672.08367920 :
-Ep. 2 : Up. 60 : Sen. 1,024 : Cost 577.43341064 :
-Ep. 2 : Up. 65 : Sen. 1,344 : Cost 491.42279053 :
-Ep. 2 : Up. 70 : Sen. 1,664 : Cost 417.66470337 :
-Ep. 2 : Up. 75 : Sen. 1,984 : Cost 344.76025391 :
-Ep. 2 : Up. 80 : Sen. 2,304 : Cost 277.97634888 :
-Ep. 2 : Up. 85 : Sen. 2,624 : Cost 203.26664734 :
+Ep. 1 : Up. 5 : Sen. 320 : Cost 838.97186279 :
+Ep. 1 : Up. 10 : Sen. 640 : Cost 665.44097900 :
+Ep. 1 : Up. 15 : Sen. 960 : Cost 523.22821045 :
+Ep. 1 : Up. 20 : Sen. 1,280 : Cost 417.61639404 :
+Ep. 1 : Up. 25 : Sen. 1,600 : Cost 343.39797974 :
+Ep. 1 : Up. 30 : Sen. 1,920 : Cost 278.35540771 :
+Ep. 1 : Up. 35 : Sen. 2,240 : Cost 225.92178345 :
+Ep. 1 : Up. 40 : Sen. 2,560 : Cost 165.37797546 :
+Ep. 2 : Up. 45 : Sen. 64 : Cost 257.18945312 :
+Ep. 2 : Up. 50 : Sen. 384 : Cost 635.78594971 :
+Ep. 2 : Up. 55 : Sen. 704 : Cost 507.77557373 :
+Ep. 2 : Up. 60 : Sen. 1,024 : Cost 431.42156982 :
+Ep. 2 : Up. 65 : Sen. 1,344 : Cost 361.39825439 :
+Ep. 2 : Up. 70 : Sen. 1,664 : Cost 302.86456299 :
+Ep. 2 : Up. 75 : Sen. 1,984 : Cost 248.74520874 :
+Ep. 2 : Up. 80 : Sen. 2,304 : Cost 203.10728455 :
+Ep. 2 : Up. 85 : Sen. 2,624 : Cost 141.87115479 :
diff --git a/tests/training/features/exp-smoothing/test_expsmooth.sh b/tests/training/features/exp-smoothing/test_expsmooth.sh
index b27cee9..69d7072 100644
--- a/tests/training/features/exp-smoothing/test_expsmooth.sh
+++ b/tests/training/features/exp-smoothing/test_expsmooth.sh
@@ -1,5 +1,9 @@
#!/bin/bash -x
+#####################################################################
+# TAGS: clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -8,7 +12,7 @@ rm -rf expsmooth expsmooth*.log
mkdir -p expsmooth
-opts="--no-shuffle --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.5 --valid-sets valid.bpe.en valid.bpe.de --valid-metrics cross-entropy --valid-mini-batch 32 --cost-type ce-mean"
+opts="--no-shuffle --clip-norm 1 --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.5 --valid-sets valid.bpe.en valid.bpe.de --valid-metrics cross-entropy --valid-mini-batch 32 --cost-type ce-mean"
# No exponential smoothing
$MRT_MARIAN/marian \
diff --git a/tests/training/features/exp-smoothing/test_expsmooth_sync.sh b/tests/training/features/exp-smoothing/test_expsmooth_sync.sh
index 3bab8ee..29e2978 100644
--- a/tests/training/features/exp-smoothing/test_expsmooth_sync.sh
+++ b/tests/training/features/exp-smoothing/test_expsmooth_sync.sh
@@ -13,11 +13,11 @@ rm -rf expsmooth_sync expsmooth_sync*.log
mkdir -p expsmooth_sync
-opts="--no-shuffle --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer adam --learn-rate 0.0001 --valid-sets valid.bpe.en valid.bpe.de --valid-metrics cross-entropy --valid-mini-batch 32 --devices 0 1 --sync-sgd"
+opts="--no-shuffle --clip-norm 0 --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer adam --learn-rate 0.0001 --valid-sets valid.bpe.en valid.bpe.de --valid-metrics cross-entropy --valid-mini-batch 32 --devices 0 1 --sync-sgd"
# No exponential smoothing
$MRT_MARIAN/marian \
- -m expsmooth_sync/model.noexp.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml --clip-norm 0 --cost-type ce-mean-words \
+ -m expsmooth_sync/model.noexp.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml --cost-type ce-mean-words \
--disp-freq 20 --valid-freq 20 --after-batches 200 $opts \
--log expsmooth_sync_0.log
@@ -30,7 +30,7 @@ cat expsmooth_sync_0.log | $MRT_TOOLS/strip-timestamps.sh | grep "Ep\. " | grep
# With exponential smoothing
$MRT_MARIAN/marian \
- -m expsmooth_sync/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml --clip-norm 0 --cost-type ce-mean-words \
+ -m expsmooth_sync/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml --cost-type ce-mean-words \
--disp-freq 20 --valid-freq 20 --after-batches 200 --exponential-smoothing 0.0001 $opts \
--log expsmooth_sync.log
diff --git a/tests/training/features/guided-alignment/test_guided_alignment_rnn.sh b/tests/training/features/guided-alignment/test_guided_alignment_rnn.sh
index a022e5c..925f36f 100644
--- a/tests/training/features/guided-alignment/test_guided_alignment_rnn.sh
+++ b/tests/training/features/guided-alignment/test_guided_alignment_rnn.sh
@@ -3,7 +3,7 @@
#####################################################################
# SUMMARY: Training S2S model with guided alignment
# AUTHOR: snukky
-# TAGS: align rnn
+# TAGS: align rnn clip-norm
#####################################################################
# Exit on error
@@ -15,7 +15,7 @@ mkdir -p rnn
# Run marian command
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
+ --no-shuffle --clip-norm 1 --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
-m rnn/model.npz -t corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--after-batches 100 --disp-freq 10 \
--guided-alignment corpus.bpe.align --guided-alignment-weight 1.0 --learn-rate 0.1 \
diff --git a/tests/training/features/guided-alignment/test_guided_alignment_transformer.sh b/tests/training/features/guided-alignment/test_guided_alignment_transformer.sh
index f5f18b9..cd28f1c 100644
--- a/tests/training/features/guided-alignment/test_guided_alignment_transformer.sh
+++ b/tests/training/features/guided-alignment/test_guided_alignment_transformer.sh
@@ -15,7 +15,7 @@ mkdir -p transformer
# Run marian command
$MRT_MARIAN/marian --type transformer \
- --no-shuffle --seed 2222 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
+ --no-shuffle --clip-norm 0 --seed 2222 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
-m transformer/model.npz -t corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--after-batches 100 --disp-freq 10 \
--guided-alignment corpus.bpe.align --guided-alignment-weight 1.0 --learn-rate 0.1 \
diff --git a/tests/training/features/guided-alignment/test_guided_alignment_transformer_sync.sh b/tests/training/features/guided-alignment/test_guided_alignment_transformer_sync.sh
index 49675e8..963052d 100644
--- a/tests/training/features/guided-alignment/test_guided_alignment_transformer_sync.sh
+++ b/tests/training/features/guided-alignment/test_guided_alignment_transformer_sync.sh
@@ -15,7 +15,7 @@ mkdir -p transformer_sync
# Run marian command
$MRT_MARIAN/marian --type transformer \
- --no-shuffle --seed 2222 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean --sync-sgd \
+ --no-shuffle --clip-norm 0 --seed 2222 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean --sync-sgd \
-m transformer_sync/model.npz -t corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--after-batches 100 --disp-freq 10 \
--guided-alignment corpus.bpe.align --guided-alignment-weight 1.0 --learn-rate 0.1 \
diff --git a/tests/training/features/guided-alignment/transformer.expected b/tests/training/features/guided-alignment/transformer.expected
index 2513e28..e69de29 100644
--- a/tests/training/features/guided-alignment/transformer.expected
+++ b/tests/training/features/guided-alignment/transformer.expected
@@ -1,10 +0,0 @@
-244.42282104
-256.56842041
-238.89138794
-233.57333374
-223.44998169
-204.23277283
-232.98970032
-204.28886414
-215.29394531
-201.92327881
diff --git a/tests/training/features/mixed-ensembles/s2s_transf.expected b/tests/training/features/mixed-ensembles/s2s_transf.expected
index 1aba12f..3f2ff2d 100644
--- a/tests/training/features/mixed-ensembles/s2s_transf.expected
+++ b/tests/training/features/mixed-ensembles/s2s_transf.expected
@@ -1,5 +1,5 @@
-herrsch@@ Binnengrenzen gli@@ Borrell nische Millennium nun@@ Millennium nun@@ waren gessen@@ tentei@@ 41 typ@@ rig aufweisen ethn@@ Baum@@ nahe Unter@@ Unterzeichnung teure Wohl itäten ausgewogene Pläne persönliche agieren Meeres@@ persönliche agieren Meeres@@ persönliche Schlußfolgerungen Unterschied Fe@@ Unter@@ Somm@@ Pläne persönliche rain wunder@@ extended persönliche rain
-herrsch@@ Binnengrenzen Vorsitzes Pläne Kön@@ unterstützte tei@@ whol@@ Millennium wenngleich Kön@@ unterstützte tei@@ whol@@ VAT operator ethn@@ Baum@@ nahe Hague CI@@ COD rain will Kommissarin aush@@ wecken ASEM Konzep@@ Demokratisierungs@@ abzuwarten Voraussetzungen Kommissionspräsidenten unterbrochen COD Napole@@ Tür@@ log@@ Varela log@@ regulations wecken extreme Woh@@ log@@ Varela log@@ Varela
-waren gessen@@ nehme gli@@ cut@@ ethn@@ Baum@@ lich ca. EPL@@ ca. cycle tive ely Pazi@@ eben@@ agents ethn@@ agieren ethn@@ agieren Meeres@@ brachte Umweltverträglichkeitsprü@@ oring Genuss agieren ethn@@ agieren Meeres@@ ethn@@ agieren Meeres@@ nü@@ Instan@@ Geflügel@@ ahn finanziell bund@@ fortführen reform@@ Einklang need extreme agents
-herrsch@@ Binnengrenzen Binnengrenzen Binnengrenzen Dele@@ Tan@@ Texten Texten Texten Texten Texten Texten Texten wo nonsense thal@@ Sk@@ ethn@@ Baum@@ nahe nützlichen Konfrontation zielen Positionen mes@@ cor@@ Statistiken herrsch@@ Binnengrenzen nonsense reform just genügt erregend menschliche netz erregend menschliche tbewer@@ maj@@ coa log@@ Varela log@@ Varela log@@ Varela log@@ Varela log@@ Varela Budge@@ alitä@@ fit
-Statistiken rig ASEM Papier@@ ethn@@ itäts@@ zusätzlich itäts@@ zusätzlich itäts@@ zusätzlich itäts@@ zusätzlich will Capp@@ break ethn@@ agieren ethn@@ agieren ethn@@ agieren ethn@@ agieren
+
+
+
+
+
diff --git a/tests/training/features/mixed-ensembles/test_ensemble_of_different_s2s.sh b/tests/training/features/mixed-ensembles/test_ensemble_of_different_s2s.sh
index 42e231b..539956d 100644
--- a/tests/training/features/mixed-ensembles/test_ensemble_of_different_s2s.sh
+++ b/tests/training/features/mixed-ensembles/test_ensemble_of_different_s2s.sh
@@ -13,7 +13,7 @@ set -e
rm -rf two_s2s two_s2s*.log
mkdir -p two_s2s
-options="--no-shuffle --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --disp-freq 20 --after-batches 100"
+options="--no-shuffle --clip-norm 0 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --disp-freq 20 --after-batches 60"
# Train model A
$MRT_MARIAN/marian \
diff --git a/tests/training/features/mixed-ensembles/test_ensemble_of_s2s_and_transformer.sh b/tests/training/features/mixed-ensembles/test_ensemble_of_s2s_and_transformer.sh
index a80fe32..fe33de3 100644
--- a/tests/training/features/mixed-ensembles/test_ensemble_of_s2s_and_transformer.sh
+++ b/tests/training/features/mixed-ensembles/test_ensemble_of_s2s_and_transformer.sh
@@ -7,7 +7,7 @@ set -e
rm -rf s2s_transf s2s_transf*.log
mkdir -p s2s_transf
-options="--no-shuffle --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --disp-freq 20 --after-batches 100"
+options="--no-shuffle --clip-norm 0 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --disp-freq 20 --after-batches 60"
# Train model A
$MRT_MARIAN/marian \
diff --git a/tests/training/features/mixed-ensembles/two_s2s.expected b/tests/training/features/mixed-ensembles/two_s2s.expected
index 9cfe9b0..fe80305 100644
--- a/tests/training/features/mixed-ensembles/two_s2s.expected
+++ b/tests/training/features/mixed-ensembles/two_s2s.expected
@@ -1,5 +1,5 @@
-umgewandelt Davies Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt Standpunkt
-staff@@ thalten Blut@@ beschä@@ trade Entlastungsverfahren Strafgerichtshof versu@@ gla@@ Jose@@ trade Entlastungsverfahren Strafgerichtshof versu@@ gla@@ Jose@@ Einfuhren Jose@@ trade Entlastungsverfahren Koordination ω@@ Hilfest@@ versu@@ gla@@ Jose@@ Einfuhren Jose@@ trade Entlastungsverfahren Koordination ω@@ Hilfest@@ versu@@ gla@@ Jose@@ Einfuhren Jose@@ trade Entlastungsverfahren Koordination ω@@ Hilfest@@ versu@@ gla@@ Jose@@ Einfuhren ärz@@
-Hed@@ Warrant Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt schriftlichen Hed@@ GM@@ Lebensunterhalt
-tests Beihilfen General les gerich@@ verwendet Betrieben verei Prognosen les gerich@@ gerich@@ verwendet Betrieben anhaltenden freue day ismen Schle@@ gewünschte Nahrungsmittelhilfe sur Forscher les gerich@@ gerich@@ verwendet Betrieben verei Prognosen les gerich@@ gerich@@ verwendet Betrieben anhaltenden freue day ismen Schle@@ gewünschte Nahrungsmittelhilfe sur Forscher les gerich@@ gerich@@ verwendet Betrieben verei Prognosen les gerich@@ verwendet
-Sal@@ Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@ fil Υ@@
+.
+.
+.
+.
+.
diff --git a/tests/training/features/quantized-model/model_centers.expected b/tests/training/features/quantized-model/model_centers.expected
index 57380ae..954a001 100644
--- a/tests/training/features/quantized-model/model_centers.expected
+++ b/tests/training/features/quantized-model/model_centers.expected
@@ -1,51 +1,49 @@
-Tensor decoder_W_comb_att unique centers: [-0.17677179 -0.11784786 -0.05892393 -0. 0.05892393 0.11784786
- 0.17677179]
-Tensor decoder_Wc_att unique centers: [-0.15336949 -0.10224632 -0.05112316 -0. 0.05112316 0.10224632
- 0.15336949]
-Tensor Wemb_dec unique centers: [-0.32046145 -0.21364096 -0.10682048 0. 0.10682048 0.21364096
- 0.32046145]
-Tensor decoder_U unique centers: [-0.17687811 -0.11791874 -0.05895937 -0. 0.05895937 0.11791874
- 0.17687811]
-Tensor decoder_Ux unique centers: [-0.21770547 -0.14513698 -0.07256849 0. 0.07256849 0.14513698
- 0.21770547]
-Tensor decoder_W unique centers: [-0.19397542 -0.12931694 -0.06465847 -0. 0.06465847 0.12931694
- 0.19397542]
-Tensor decoder_Wx unique centers: [-0.25329626 -0.16886416 -0.08443208 -0. 0.08443208 0.16886416
- 0.25329626]
-Tensor decoder_U_nl unique centers: [-0.17696194 -0.11797463 -0.05898732 0. 0.05898732 0.11797463
- 0.17696194]
-Tensor decoder_Ux_nl unique centers: [-0.21896881 -0.14597921 -0.07298961 0. 0.07298961 0.14597921
- 0.21896881]
-Tensor decoder_Wc unique centers: [-0.15324192 -0.10216128 -0.05108064 0. 0.05108064 0.10216128
- 0.15324192]
-Tensor decoder_Wcx unique centers: [-0.18192002 -0.12128001 -0.06064001 -0. 0.06064001 0.12128001
- 0.18192002]
-Tensor ff_logit_prev_W unique centers: [-0.32183957 -0.2145597 -0.10727985 -0. 0.10727985 0.2145597
- 0.32183957]
-Tensor ff_logit_lstm_W unique centers: [-0.25455362 -0.16970241 -0.08485121 0. 0.08485121 0.16970241
- 0.25455362]
-Tensor ff_logit_ctx_W unique centers: [-0.19867198 -0.13244799 -0.06622399 -0. 0.06622399 0.13244799
- 0.19867198]
-Tensor decoder_ff_logit_l2_Wt unique centers: [-0.36124557 -0.24083039 -0.1204152 0. 0.1204152 0.24083039
- 0.36124557]
-Tensor ff_state_W unique centers: [-0.17704961 -0.11803307 -0.05901653 0. 0.05901653 0.11803307
- 0.17704961]
-Tensor Wemb unique centers: [-0.31208774 -0.20805849 -0.10402925 0. 0.10402925 0.20805849
- 0.31208774]
-Tensor encoder_U unique centers: [-0.17686225 -0.11790817 -0.05895409 0. 0.05895409 0.11790817
- 0.17686225]
-Tensor encoder_Ux unique centers: [-0.21824732 -0.14549822 -0.07274911 0. 0.07274911 0.14549822
- 0.21824732]
-Tensor encoder_W unique centers: [-0.19403435 -0.12935624 -0.06467812 0. 0.06467812 0.12935624
- 0.19403435]
-Tensor encoder_Wx unique centers: [-0.25213736 -0.16809157 -0.08404578 -0. 0.08404578 0.16809157
- 0.25213736]
-Tensor encoder_r_U unique centers: [-0.17699143 -0.11799429 -0.05899715 0. 0.05899715 0.11799429
- 0.17699143]
-Tensor encoder_r_Ux unique centers: [-0.21971346 -0.14647564 -0.07323782 -0. 0.07323782 0.14647564
- 0.21971346]
-Tensor encoder_r_W unique centers: [-0.19410282 -0.12940188 -0.06470094 0. 0.06470094 0.12940188
- 0.19410282]
-Tensor encoder_r_Wx unique centers: [-0.25225359 -0.16816907 -0.08408453 -0. 0.08408453 0.16816907
- 0.25225359]
+Tensor decoder_W_comb_att unique centers: [-0.1826457 -0.1217638 -0.0608819 0. 0.0608819 0.1217638
+ 0.1826457]
+Tensor decoder_Wc_att unique centers: [-0.17328945 -0.1155263 -0.05776315 0. 0.05776315 0.1155263
+ 0.17328945]
+Tensor Wemb_dec unique centers: [-2.3631978 -1.5754652 -0.7877326 0. 0.7877326 1.5754652
+ 2.3631978]
+Tensor decoder_U unique centers: [-0.3221001 -0.2147334 -0.1073667 -0. 0.1073667 0.2147334
+ 0.3221001]
+Tensor decoder_Ux unique centers: [-0.43822908 -0.29215273 -0.14607637 0. 0.14607637 0.29215273
+ 0.43822908]
+Tensor decoder_W unique centers: [-0.22816041 -0.15210694 -0.07605347 0. 0.07605347 0.15210694
+ 0.22816041]
+Tensor decoder_Wx unique centers: [-0.49631694 -0.33087796 -0.16543898 -0. 0.16543898 0.33087796
+ 0.49631694]
+Tensor decoder_U_nl unique centers: [-0.3815875 -0.25439167 -0.12719584 -0. 0.12719584 0.25439167
+ 0.3815875 ]
+Tensor decoder_Ux_nl unique centers: [-0.5111215 -0.34074768 -0.17037384 0. 0.17037384 0.34074768
+ 0.5111215 ]
+Tensor decoder_Wc unique centers: [-0.42579597 -0.283864 -0.141932 -0. 0.141932 0.283864
+ 0.42579597]
+Tensor decoder_Wcx unique centers: [-0.8375 -0.55833334 -0.27916667 -0. 0.27916667 0.55833334
+ 0.8375 ]
+Tensor ff_logit_prev_W unique centers: [-70.87341 -23.624472 0. 23.624472 47.248943]
+Tensor ff_logit_lstm_W unique centers: [-246.07938 -164.05292 -82.02646 0. 82.02646 164.05292
+ 246.07938]
+Tensor ff_logit_ctx_W unique centers: [-240.9685 -160.64568 -80.32284 0. 80.32284 160.64568
+ 240.9685 ]
+Tensor decoder_ff_logit_l2_Wt unique centers: [-106.12637 -70.750916 -35.375458 -0. 35.375458 70.750916
+ 106.12637 ]
+Tensor ff_state_W unique centers: [-0.2559117 -0.1706078 -0.0853039 -0. 0.0853039 0.1706078
+ 0.2559117]
+Tensor Wemb unique centers: [-0.39904252 -0.19952126 0. 0.19952126 0.39904252 0.5985638 ]
+Tensor encoder_U unique centers: [-0.30375382 -0.20250255 -0.10125127 -0. 0.10125127 0.20250255
+ 0.30375382]
+Tensor encoder_Ux unique centers: [-0.45867392 -0.30578262 -0.15289131 -0. 0.15289131 0.30578262
+ 0.45867392]
+Tensor encoder_W unique centers: [-0.2062971 -0.1375314 -0.0687657 0. 0.0687657 0.1375314
+ 0.2062971]
+Tensor encoder_Wx unique centers: [-0.3073737 -0.20491579 -0.1024579 0. 0.1024579 0.20491579
+ 0.3073737 ]
+Tensor encoder_r_U unique centers: [-0.34318972 -0.22879314 -0.11439657 0. 0.11439657 0.22879314
+ 0.34318972]
+Tensor encoder_r_Ux unique centers: [-0.72291785 -0.48194525 -0.24097262 -0. 0.24097262 0.48194525
+ 0.72291785]
+Tensor encoder_r_W unique centers: [-0.21613705 -0.14409137 -0.07204568 -0. 0.07204568 0.14409137
+ 0.21613705]
+Tensor encoder_r_Wx unique centers: [-0.39892155 -0.2659477 -0.13297385 -0. 0.13297385 0.2659477
+ 0.39892155]
Tensor decoder_c_tt unique centers: []
diff --git a/tests/training/features/quantized-model/quantized.expected b/tests/training/features/quantized-model/quantized.expected
index 17620ec..2d0638e 100644
--- a/tests/training/features/quantized-model/quantized.expected
+++ b/tests/training/features/quantized-model/quantized.expected
@@ -1,10 +1,10 @@
-225.10929871
-243.58345032
-229.45071411
-224.28813171
-212.65242004
-204.06596375
-197.81690979
-190.08915710
-193.72299194
-195.20808411
+5296.80419922
+14729.64062500
+14570.66210938
+17166.55859375
+16055.21875000
+16277.48437500
+18673.34765625
+16747.37109375
+17298.72070312
+16335.72949219
diff --git a/tests/training/features/quantized-model/test_quant_centers.sh b/tests/training/features/quantized-model/test_quant_centers.sh
index 22dd863..8318c24 100644
--- a/tests/training/features/quantized-model/test_quant_centers.sh
+++ b/tests/training/features/quantized-model/test_quant_centers.sh
@@ -16,7 +16,7 @@ mkdir -p train
# Train an 8-bits model
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd \
+ --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd --clip-norm 0 \
-m train/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v train/vocab.en.yml train/vocab.de.yml \
--cost-type cross-entropy --sync-sgd --after-batches 10 --disp-freq 2 --quantize-bits 3
diff --git a/tests/training/features/quantized-model/test_quantmodel.sh b/tests/training/features/quantized-model/test_quantmodel.sh
index 8b55697..67019f2 100644
--- a/tests/training/features/quantized-model/test_quantmodel.sh
+++ b/tests/training/features/quantized-model/test_quantmodel.sh
@@ -16,7 +16,7 @@ mkdir -p train
# Train an 8-bits model
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd \
+ --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd --clip-norm 0 \
-m train/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v train/vocab.en.yml train/vocab.de.yml \
--cost-type cross-entropy --sync-sgd --after-batches 100 --disp-freq 10 --quantize-bits 8 \
--log $PREFIX.log
diff --git a/tests/training/features/quantized-model/test_quantmodel_log.sh b/tests/training/features/quantized-model/test_quantmodel_log.sh
index f79809b..924eb4b 100644
--- a/tests/training/features/quantized-model/test_quantmodel_log.sh
+++ b/tests/training/features/quantized-model/test_quantmodel_log.sh
@@ -16,7 +16,7 @@ mkdir -p train
# Train an 8-bits model
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd \
+ --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd --clip-norm 1 \
-m train/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v train/vocab.en.yml train/vocab.de.yml \
--cost-type cross-entropy --sync-sgd --after-batches 100 --disp-freq 10 --quantize-bits 4 --quantize-log-based --quantize-optimization-steps 3 \
--log $PREFIX.log
diff --git a/tests/training/features/quantized-model/test_quantmodel_with_bias.sh b/tests/training/features/quantized-model/test_quantmodel_with_bias.sh
index de14ffb..8dee56b 100644
--- a/tests/training/features/quantized-model/test_quantmodel_with_bias.sh
+++ b/tests/training/features/quantized-model/test_quantmodel_with_bias.sh
@@ -16,14 +16,14 @@ mkdir -p train
# training with quantized bias is tricky, so we start by training a normal model first before finetuning it to the quantized space.
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd \
+ --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd --clip-norm 1 \
-m train/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v train/vocab.en.yml train/vocab.de.yml \
--cost-type cross-entropy --sync-sgd --after-batches 20 --disp-freq 10 \
--log $PREFIX.log
# Train an 8-bits model
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd \
+ --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd --clip-norm 1 \
-m train/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v train/vocab.en.yml train/vocab.de.yml \
--cost-type cross-entropy --sync-sgd --after-batches 100 --disp-freq 10 --quantize-bits 8 --quantize-biases \
--log $PREFIX.log
diff --git a/tests/training/features/quantized-model/test_quantmodel_with_optimization.sh b/tests/training/features/quantized-model/test_quantmodel_with_optimization.sh
index 510c339..17a72d8 100644
--- a/tests/training/features/quantized-model/test_quantmodel_with_optimization.sh
+++ b/tests/training/features/quantized-model/test_quantmodel_with_optimization.sh
@@ -16,7 +16,7 @@ mkdir -p train
# Train an 8-bits model
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd \
+ --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --learn-rate 0.1 --optimizer sgd --clip-norm 1 \
-m train/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v train/vocab.en.yml train/vocab.de.yml \
--cost-type cross-entropy --sync-sgd --after-batches 100 --disp-freq 10 --quantize-bits 8 --quantize-optimization-steps 3 \
--log $PREFIX.log
diff --git a/tests/training/features/quantized-model/update.sh b/tests/training/features/quantized-model/update.sh
new file mode 100755
index 0000000..04be645
--- /dev/null
+++ b/tests/training/features/quantized-model/update.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env sh
+cp model_centers.out model_centers.expected
+cp test-center.out test-center.expected
+cp quantized-log4bit.out quantized-log4bit.expected
+cp quantized.out quantized.expected
+cp quantized-with-bias.out quantized-with-bias.expected
+cp quantized-opt.out quantized-opt.expected
diff --git a/tests/training/features/right-left/rnn.expected b/tests/training/features/right-left/rnn.expected
index c683efb..ff302f0 100644
--- a/tests/training/features/right-left/rnn.expected
+++ b/tests/training/features/right-left/rnn.expected
@@ -1,10 +1,10 @@
-227.26374817
-251.25552368
-244.43490601
-247.96240234
-242.51679993
-239.25460815
-236.51896667
-231.50540161
-238.35562134
-242.17578125
+226.89152527
+249.98703003
+242.43225098
+245.21345520
+239.08744812
+234.52084351
+230.54391479
+224.25790405
+228.97502136
+230.70504761
diff --git a/tests/training/features/right-left/test_right_left_rnn.sh b/tests/training/features/right-left/test_right_left_rnn.sh
index 245125e..ae3976b 100644
--- a/tests/training/features/right-left/test_right_left_rnn.sh
+++ b/tests/training/features/right-left/test_right_left_rnn.sh
@@ -14,7 +14,7 @@ mkdir -p rnn
# Run marian command
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
+ --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
-m rnn/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--after-batches 100 --disp-freq 10 \
--right-left --log rnn.log
diff --git a/tests/training/features/right-left/test_right_left_transformer.sh b/tests/training/features/right-left/test_right_left_transformer.sh
index 8a40bdd..2939c3c 100644
--- a/tests/training/features/right-left/test_right_left_transformer.sh
+++ b/tests/training/features/right-left/test_right_left_transformer.sh
@@ -14,7 +14,7 @@ mkdir -p transformer
# Run marian command
$MRT_MARIAN/marian --type transformer \
- --no-shuffle --seed 2222 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
+ --no-shuffle --clip-norm 0 --seed 2222 --dim-emb 32 --dim-rnn 64 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --cost-type ce-mean \
-m transformer/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--after-batches 100 --disp-freq 10 \
--right-left --log transformer.log
diff --git a/tests/training/features/right-left/transformer.expected b/tests/training/features/right-left/transformer.expected
index a63420f..0fc2662 100644
--- a/tests/training/features/right-left/transformer.expected
+++ b/tests/training/features/right-left/transformer.expected
@@ -1,10 +1,10 @@
-237.99105835
-263.23455811
-255.45816040
-259.72146606
-254.48379517
-250.45918274
-248.04586792
-242.33943176
-249.78984070
-253.30130005
+233.16964722
+249.00422668
+235.84651184
+234.45532227
+225.37080383
+216.76550293
+210.73200989
+202.66915894
+207.25146484
+209.57803345
diff --git a/tests/training/models/lm/lm-transformer.expected b/tests/training/models/lm/lm-transformer.expected
index a5e9556..593d94e 100644
--- a/tests/training/models/lm/lm-transformer.expected
+++ b/tests/training/models/lm/lm-transformer.expected
@@ -1,5 +1,5 @@
-405.95352173
-277.85601807
-198.49377441
-135.86233521
-74.85224152
+406.14587402
+279.24975586
+201.01249695
+139.34625244
+78.13222504
diff --git a/tests/training/models/lm/lm-transformer.scores.expected b/tests/training/models/lm/lm-transformer.scores.expected
index d40b653..ed3599e 100644
--- a/tests/training/models/lm/lm-transformer.scores.expected
+++ b/tests/training/models/lm/lm-transformer.scores.expected
@@ -1,10 +1,10 @@
--90.117882
--179.877197
--81.371750
--204.165802
--636.969482
--131.934113
--359.475616
--56.732944
--68.373947
--100.774132
+-94.390747
+-187.140671
+-85.050323
+-211.180054
+-645.849121
+-136.817657
+-366.460815
+-57.487789
+-72.613792
+-102.306747
diff --git a/tests/training/models/lm/lm.expected b/tests/training/models/lm/lm.expected
index c6b5c74..dc0ae65 100644
--- a/tests/training/models/lm/lm.expected
+++ b/tests/training/models/lm/lm.expected
@@ -1,5 +1,5 @@
-410.02645874
-306.52648926
-233.58132935
-167.19117737
-91.86805725
+410.03164673
+306.58309937
+233.76004028
+167.80232239
+92.23210144
diff --git a/tests/training/models/lm/lm.scores.expected b/tests/training/models/lm/lm.scores.expected
index 9ea8ae4..b42cd9b 100644
--- a/tests/training/models/lm/lm.scores.expected
+++ b/tests/training/models/lm/lm.scores.expected
@@ -1,10 +1,10 @@
--114.927658
--208.074463
--102.252083
--244.505508
--677.256836
--154.783279
--411.580017
--58.307816
--89.968994
--111.055710
+-114.125137
+-206.581238
+-101.570534
+-242.844177
+-673.484863
+-153.583893
+-409.248169
+-57.871357
+-89.267410
+-110.574005
diff --git a/tests/training/models/lm/test_lm-transformer.sh b/tests/training/models/lm/test_lm-transformer.sh
index 476c2e2..aa2188c 100644
--- a/tests/training/models/lm/test_lm-transformer.sh
+++ b/tests/training/models/lm/test_lm-transformer.sh
@@ -14,7 +14,7 @@ rm -rf lm-transformer lm-transformer.log
mkdir -p lm-transformer
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle \
+ --seed 1111 --no-shuffle --clip-norm 0 \
--type lm-transformer --dim-emb 128 --dim-rnn 256 --cost-type ce-mean \
-m lm-transformer/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.en -v vocab.en.yml \
--disp-freq 20 --after-batches 100 \
diff --git a/tests/training/models/lm/test_lm.sh b/tests/training/models/lm/test_lm.sh
index 91f94d4..f55e860 100644
--- a/tests/training/models/lm/test_lm.sh
+++ b/tests/training/models/lm/test_lm.sh
@@ -14,7 +14,7 @@ rm -rf lm lm.log
mkdir -p lm
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle \
+ --seed 1111 --no-shuffle --clip-norm 0 \
--type lm --dim-emb 128 --dim-rnn 256 --cost-type ce-mean \
-m lm/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.en -v vocab.en.yml \
--disp-freq 20 --after-batches 100 \
diff --git a/tests/training/models/multi-source/multi-s2s.expected b/tests/training/models/multi-source/multi-s2s.expected
index 61608c4..2258c02 100644
--- a/tests/training/models/multi-source/multi-s2s.expected
+++ b/tests/training/models/multi-source/multi-s2s.expected
@@ -1,5 +1,5 @@
-388.14068604
-279.26577759
-198.44155884
-137.92988586
-75.97171021
+388.15350342
+278.90570068
+197.63183594
+137.78120422
+76.32478333
diff --git a/tests/training/models/multi-source/multi-transformer.expected b/tests/training/models/multi-source/multi-transformer.expected
index 35c5e4b..538ed15 100644
--- a/tests/training/models/multi-source/multi-transformer.expected
+++ b/tests/training/models/multi-source/multi-transformer.expected
@@ -1,5 +1,5 @@
-382.23056030
-264.14666748
-193.33871460
-133.58370972
-71.43719482
+382.69680786
+265.52267456
+196.04469299
+138.10417175
+75.06012726
diff --git a/tests/training/models/multi-source/test_multi-s2s.sh b/tests/training/models/multi-source/test_multi-s2s.sh
index a1ceef1..52c3ded 100644
--- a/tests/training/models/multi-source/test_multi-s2s.sh
+++ b/tests/training/models/multi-source/test_multi-s2s.sh
@@ -14,7 +14,7 @@ rm -rf multi-s2s multi-s2s.log
mkdir -p multi-s2s
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle \
+ --seed 1111 --no-shuffle --clip-norm 0 \
--type multi-s2s --dim-emb 128 --dim-rnn 256 --cost-type ce-mean \
-m multi-s2s/model.npz -t train.bpe.{en,xx,de} -v vocab.en.yml vocab.xx.yml vocab.de.yml \
--disp-freq 20 --after-batches 100 \
diff --git a/tests/training/models/multi-source/test_multi-transformer.sh b/tests/training/models/multi-source/test_multi-transformer.sh
index 425ebdc..e41d918 100644
--- a/tests/training/models/multi-source/test_multi-transformer.sh
+++ b/tests/training/models/multi-source/test_multi-transformer.sh
@@ -14,7 +14,7 @@ rm -rf multi-transformer multi-transformer.log
mkdir -p multi-transformer
$MRT_MARIAN/marian \
- --seed 1111 --no-shuffle \
+ --seed 1111 --no-shuffle --clip-norm 0 \
--type multi-transformer --dim-emb 128 --dim-rnn 256 --cost-type ce-mean \
-m multi-transformer/model.npz -t train.bpe.{en,xx,de} -v vocab.en.yml vocab.xx.yml vocab.de.yml \
--disp-freq 20 --after-batches 100 \
diff --git a/tests/training/models/nematus/encdec_depth.expected b/tests/training/models/nematus/encdec_depth.expected
index af2a74a..6cf8ebd 100644
--- a/tests/training/models/nematus/encdec_depth.expected
+++ b/tests/training/models/nematus/encdec_depth.expected
@@ -1,5 +1,5 @@
-489.13665771
-462.08361816
-439.01745605
-420.90402222
-404.19827271
+488.88616943
+461.44476318
+437.74578857
+419.76626587
+403.67724609
diff --git a/tests/training/models/nematus/test_encdec_depth.sh b/tests/training/models/nematus/test_encdec_depth.sh
index ed5276e..0de9026 100644
--- a/tests/training/models/nematus/test_encdec_depth.sh
+++ b/tests/training/models/nematus/test_encdec_depth.sh
@@ -17,7 +17,7 @@ $MRT_MARIAN/marian \
--type nematus --enc-cell gru-nematus --dec-cell gru-nematus \
--enc-depth 4 --enc-cell-depth 4 --enc-type bidirectional --dec-depth 4 --dec-cell-base-depth 4 --dec-cell-high-depth 1 \
--layer-normalization \
- --no-shuffle --seed 1111 --dim-emb 64 --dim-rnn 128 --cost-type ce-mean \
+ --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 64 --dim-rnn 128 --cost-type ce-mean \
-m encdec_depth/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{de,en} -v vocab.en.yml vocab.de.yml \
--log encdec_depth.log --disp-freq 2 --after-batches 10
@@ -25,7 +25,7 @@ test -e encdec_depth/model.npz
test -e encdec_depth/model.npz.yml
cat encdec_depth.log | $MRT_TOOLS/extract-costs.sh > encdec_depth.out
-$MRT_TOOLS/diff-nums.py encdec_depth.out encdec_depth.expected -p 3 -o encdec_depth.diff
+$MRT_TOOLS/diff-nums.py encdec_depth.out encdec_depth.expected -p 3.0 -o encdec_depth.diff
# Exit with success code
exit 0
diff --git a/tests/training/models/nematus/test_wmt17_model.sh b/tests/training/models/nematus/test_wmt17_model.sh
index d43ff9c..814630a 100644
--- a/tests/training/models/nematus/test_wmt17_model.sh
+++ b/tests/training/models/nematus/test_wmt17_model.sh
@@ -11,7 +11,7 @@ $MRT_MARIAN/marian \
--type nematus --enc-cell gru-nematus --dec-cell gru-nematus \
--enc-depth 1 --enc-cell-depth 4 --enc-type bidirectional --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \
--layer-normalization \
- --no-shuffle --seed 1111 --dim-emb 64 --dim-rnn 128 --cost-type ce-mean \
+ --no-shuffle --clip-norm 0 --seed 1111 --dim-emb 64 --dim-rnn 128 --cost-type ce-mean \
-m wmt17/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{de,en} -v vocab.en.yml vocab.de.yml \
--log wmt17.log --disp-freq 2 --after-batches 10
@@ -19,7 +19,7 @@ test -e wmt17/model.npz
test -e wmt17/model.npz.yml
cat wmt17.log | $MRT_TOOLS/extract-costs.sh > wmt17.out
-$MRT_TOOLS/diff-nums.py wmt17.out wmt17.expected -p 2 -o wmt17.diff
+$MRT_TOOLS/diff-nums.py wmt17.out wmt17.expected -p 0.9 -o wmt17.diff
# Exit with success code
exit 0
diff --git a/tests/training/models/nematus/wmt17.expected b/tests/training/models/nematus/wmt17.expected
index c7ffbcd..0a14b91 100644
--- a/tests/training/models/nematus/wmt17.expected
+++ b/tests/training/models/nematus/wmt17.expected
@@ -1,5 +1,5 @@
-490.30654907
-466.26824951
-442.79544067
-426.92376709
-411.47766113
+490.18170166
+466.03765869
+442.34454346
+426.33612061
+411.51934814
diff --git a/tests/training/models/transformer/test_transformer.sh b/tests/training/models/transformer/test_transformer.sh
index b1ad881..41870a4 100644
--- a/tests/training/models/transformer/test_transformer.sh
+++ b/tests/training/models/transformer/test_transformer.sh
@@ -7,7 +7,7 @@ set -e
rm -rf transformer transformer*.log
mkdir -p transformer
-opts="--no-shuffle --seed 1111 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --dim-emb 64 --dim-rnn 128 --cost-type ce-mean"
+opts="--no-shuffle --clip-norm 0 --seed 1111 --mini-batch 32 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd --dim-emb 64 --dim-rnn 128 --cost-type ce-mean"
$MRT_MARIAN/marian \
--type transformer -m transformer/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
diff --git a/tests/training/models/transformer/transformer.expected b/tests/training/models/transformer/transformer.expected
index 50d85ee..28778b1 100644
--- a/tests/training/models/transformer/transformer.expected
+++ b/tests/training/models/transformer/transformer.expected
@@ -1,10 +1,10 @@
-236.04219055
-260.96929932
-254.12194824
-257.51263428
-253.00631714
-248.38674927
-245.93569946
-240.78047180
-248.01782227
-252.18527222
+226.79606628
+235.31921387
+219.17929077
+216.75386047
+210.01785278
+203.72079468
+198.40823364
+190.92230225
+195.92117310
+199.06428528
diff --git a/tests/training/multi-gpu/sync_sgd_1gpu.expected b/tests/training/multi-gpu/sync_sgd_1gpu.expected
index 73c9503..b826610 100644
--- a/tests/training/multi-gpu/sync_sgd_1gpu.expected
+++ b/tests/training/multi-gpu/sync_sgd_1gpu.expected
@@ -1,4 +1,4 @@
-236.64883423
-197.38874817
-198.74374390
-183.10134888
+230.86734009
+176.34066772
+178.55038452
+161.01515198
diff --git a/tests/training/multi-gpu/sync_sgd_1gpu_expsmooth.expected b/tests/training/multi-gpu/sync_sgd_1gpu_expsmooth.expected
index 0600ba2..b826610 100644
--- a/tests/training/multi-gpu/sync_sgd_1gpu_expsmooth.expected
+++ b/tests/training/multi-gpu/sync_sgd_1gpu_expsmooth.expected
@@ -1,4 +1,4 @@
-236.64883423
-197.38874817
-198.74374390
-183.10137939
+230.86734009
+176.34066772
+178.55038452
+161.01515198
diff --git a/tests/training/multi-gpu/test_sync_sgd_1gpu.sh b/tests/training/multi-gpu/test_sync_sgd_1gpu.sh
index 2b9c72b..a1a4453 100644
--- a/tests/training/multi-gpu/test_sync_sgd_1gpu.sh
+++ b/tests/training/multi-gpu/test_sync_sgd_1gpu.sh
@@ -8,8 +8,8 @@ rm -rf sync_sgd_1gpu sync_sgd_1gpu.log
mkdir -p sync_sgd_1gpu
$MRT_MARIAN/marian \
- --no-shuffle --seed 888 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none \
- --dim-rnn 64 --dim-emb 32 --learn-rate 0.1 \
+ --no-shuffle --clip-norm 0 --seed 888 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none \
+ --dim-rnn 64 --dim-emb 32 --learn-rate 0.02 \
--devices 0 --sync-sgd --optimizer sgd --cost-type ce-mean \
-m sync_sgd_1gpu/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 5 --save-freq 10 --after-batches 20 \
@@ -19,7 +19,7 @@ test -e sync_sgd_1gpu/model.npz
test -e sync_sgd_1gpu.log
cat sync_sgd_1gpu.log | $MRT_TOOLS/extract-costs.sh > sync_sgd_1gpu.out
-$MRT_TOOLS/diff-nums.py sync_sgd_1gpu.out sync_sgd_1gpu.expected -o sync_sgd_1gpu.diff
+$MRT_TOOLS/diff-nums.py -p 0.02 sync_sgd_1gpu.out sync_sgd_1gpu.expected -o sync_sgd_1gpu.diff
# Exit with success code
exit 0
diff --git a/tests/training/multi-gpu/test_sync_sgd_1gpu_expsmooth.sh b/tests/training/multi-gpu/test_sync_sgd_1gpu_expsmooth.sh
index d1d77d3..586a57a 100644
--- a/tests/training/multi-gpu/test_sync_sgd_1gpu_expsmooth.sh
+++ b/tests/training/multi-gpu/test_sync_sgd_1gpu_expsmooth.sh
@@ -8,8 +8,8 @@ rm -rf sync_sgd_1gpu_expsmooth sync_sgd_1gpu_expsmooth.log
mkdir -p sync_sgd_1gpu_expsmooth
$MRT_MARIAN/marian \
- --no-shuffle --seed 888 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none \
- --dim-rnn 64 --dim-emb 32 --learn-rate 0.1 \
+ --no-shuffle --clip-norm 0 --seed 888 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none \
+ --dim-rnn 64 --dim-emb 32 --learn-rate 0.02 \
--devices 0 --sync-sgd --optimizer sgd --exponential-smoothing --cost-type ce-mean \
-m sync_sgd_1gpu_expsmooth/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 5 --save-freq 10 --after-batches 20 \
@@ -19,7 +19,7 @@ test -e sync_sgd_1gpu_expsmooth/model.npz
test -e sync_sgd_1gpu_expsmooth.log
cat sync_sgd_1gpu_expsmooth.log | $MRT_TOOLS/extract-costs.sh > sync_sgd_1gpu_expsmooth.out
-$MRT_TOOLS/diff-nums.py sync_sgd_1gpu_expsmooth.out sync_sgd_1gpu_expsmooth.expected -o sync_sgd_1gpu_expsmooth.diff
+$MRT_TOOLS/diff-nums.py -p 0.02 sync_sgd_1gpu_expsmooth.out sync_sgd_1gpu_expsmooth.expected -o sync_sgd_1gpu_expsmooth.diff
# Exit with success code
exit 0
diff --git a/tests/training/restarting/sgd_2e.expected b/tests/training/restarting/sgd_2e.expected
index 2a801f2..15bbe18 100644
--- a/tests/training/restarting/sgd_2e.expected
+++ b/tests/training/restarting/sgd_2e.expected
@@ -1,16 +1,16 @@
Ep. 1 : Up. 4 : Sen. 128 : Cost 257.99652100
Ep. 1 : Up. 8 : Sen. 256 : Cost 267.93783569
-Ep. 1 : Up. 12 : Sen. 384 : Cost 243.39039612
-Ep. 1 : Up. 16 : Sen. 512 : Cost 235.87208557
+Ep. 1 : Up. 12 : Sen. 384 : Cost 243.39041138
+Ep. 1 : Up. 16 : Sen. 512 : Cost 235.87210083
Ep. 1 : Up. 20 : Sen. 640 : Cost 204.79017639
-Ep. 1 : Up. 24 : Sen. 768 : Cost 240.11624146
-Ep. 1 : Up. 28 : Sen. 896 : Cost 208.47099304
-Ep. 1 : Up. 32 : Sen. 1,024 : Cost 199.36221313
-Ep. 2 : Up. 36 : Sen. 128 : Cost 213.58728027
-Ep. 2 : Up. 40 : Sen. 256 : Cost 220.51084900
-Ep. 2 : Up. 44 : Sen. 384 : Cost 199.10847473
-Ep. 2 : Up. 48 : Sen. 512 : Cost 194.56945801
-Ep. 2 : Up. 52 : Sen. 640 : Cost 171.77461243
-Ep. 2 : Up. 56 : Sen. 768 : Cost 208.75405884
-Ep. 2 : Up. 60 : Sen. 896 : Cost 186.17434692
-Ep. 2 : Up. 64 : Sen. 1,024 : Cost 179.63316345
+Ep. 1 : Up. 24 : Sen. 768 : Cost 240.11631775
+Ep. 1 : Up. 28 : Sen. 896 : Cost 208.47109985
+Ep. 1 : Up. 32 : Sen. 1,024 : Cost 199.36233521
+Ep. 2 : Up. 36 : Sen. 128 : Cost 213.58744812
+Ep. 2 : Up. 40 : Sen. 256 : Cost 220.51107788
+Ep. 2 : Up. 44 : Sen. 384 : Cost 199.10870361
+Ep. 2 : Up. 48 : Sen. 512 : Cost 194.56954956
+Ep. 2 : Up. 52 : Sen. 640 : Cost 171.77453613
+Ep. 2 : Up. 56 : Sen. 768 : Cost 208.75399780
+Ep. 2 : Up. 60 : Sen. 896 : Cost 186.17416382
+Ep. 2 : Up. 64 : Sen. 1,024 : Cost 179.63301086
diff --git a/tests/training/restarting/sgd_sync_2e.expected b/tests/training/restarting/sgd_sync_2e.expected
index 07061db..a83990d 100644
--- a/tests/training/restarting/sgd_sync_2e.expected
+++ b/tests/training/restarting/sgd_sync_2e.expected
@@ -7,10 +7,10 @@ Ep. 1 : Up. 24 : Sen. 768 : Cost 240.11631775
Ep. 1 : Up. 28 : Sen. 896 : Cost 208.47109985
Ep. 1 : Up. 32 : Sen. 1,024 : Cost 199.36233521
Ep. 2 : Up. 36 : Sen. 128 : Cost 213.58744812
-Ep. 2 : Up. 40 : Sen. 256 : Cost 220.51107788
+Ep. 2 : Up. 40 : Sen. 256 : Cost 220.51104736
Ep. 2 : Up. 44 : Sen. 384 : Cost 199.10870361
Ep. 2 : Up. 48 : Sen. 512 : Cost 194.56954956
Ep. 2 : Up. 52 : Sen. 640 : Cost 171.77453613
-Ep. 2 : Up. 56 : Sen. 768 : Cost 208.75396729
+Ep. 2 : Up. 56 : Sen. 768 : Cost 208.75399780
Ep. 2 : Up. 60 : Sen. 896 : Cost 186.17416382
Ep. 2 : Up. 64 : Sen. 1,024 : Cost 179.63301086
diff --git a/tests/training/restarting/test_sgd_for_two_epochs.sh b/tests/training/restarting/test_sgd_for_two_epochs.sh
index ad92b5f..11bf76e 100644
--- a/tests/training/restarting/test_sgd_for_two_epochs.sh
+++ b/tests/training/restarting/test_sgd_for_two_epochs.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Restarting training after the 1st epoch (async)
+# AUTHOR: snukky
+# TAGS: optimizer clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -7,12 +13,13 @@ set -e
rm -rf sgd_2e sgd_1st_epoch.log sgd_2nd_epoch.log
mkdir -p sgd_2e
-extra_opts="--no-shuffle --seed 1111 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --optimizer sgd"
+extra_opts="--no-shuffle --clip-norm 1 --seed 1111 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --optimizer sgd"
# Added because default options has changes
extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false"
# Uncomment to prepare the expected output
+#rm -f sgd_two_epochs.log
#$MRT_MARIAN/marian \
#-m sgd_2e/model_2e.npz -t $MRT_DATA/train.max50.{en,de} -v vocab.en.yml vocab.de.yml \
#--disp-freq 4 --save-freq 32 --after-epochs 2 -l 0.1 $extra_opts \
diff --git a/tests/training/restarting/test_sgd_for_two_epochs_sync.sh b/tests/training/restarting/test_sgd_for_two_epochs_sync.sh
index d3ee295..8615e80 100644
--- a/tests/training/restarting/test_sgd_for_two_epochs_sync.sh
+++ b/tests/training/restarting/test_sgd_for_two_epochs_sync.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Restarting training after the 1st epoch (sync-sgd)
+# AUTHOR: snukky
+# TAGS: optimizer clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -7,12 +13,13 @@ set -e
rm -rf sgd_sync_2e sgd_sync_*_epoch.log
mkdir -p sgd_sync_2e
-extra_opts="--no-shuffle --seed 1111 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --optimizer sgd --sync-sgd"
+extra_opts="--no-shuffle --clip-norm 1 --seed 1111 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --optimizer sgd --sync-sgd"
# Added because default options has changes
extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false"
# Uncomment to prepare the expected output
+#rm -f sgd_sync_two_epochs.log
#$MRT_MARIAN/marian \
#-m sgd_sync_2e/model_2e.npz -t $MRT_DATA/train.max50.{en,de} -v vocab.en.yml vocab.de.yml \
#--disp-freq 4 --save-freq 32 --after-epochs 2 -l 0.1 $extra_opts \
diff --git a/tests/training/restoring/corpus/finetune.expected b/tests/training/restoring/corpus/finetune.expected
index 6f4bc95..21f1847 100644
--- a/tests/training/restoring/corpus/finetune.expected
+++ b/tests/training/restoring/corpus/finetune.expected
@@ -1,15 +1,15 @@
-Ep. 1 : Up. 4 : Sen. 256 : Cost 239.27255249
-Ep. 1 : Up. 8 : Sen. 512 : Cost 246.85655212
-Ep. 1 : Up. 12 : Sen. 768 : Cost 230.16513062
-Ep. 1 : Up. 16 : Sen. 1,024 : Cost 251.03186035
-Ep. 1 : Up. 20 : Sen. 1,280 : Cost 249.74163818
-Ep. 1 : Up. 24 : Sen. 1,536 : Cost 239.31179810
-Ep. 1 : Up. 28 : Sen. 1,792 : Cost 231.93222046
-Ep. 1 : Up. 32 : Sen. 128 : Cost 255.42749023
-Ep. 1 : Up. 36 : Sen. 384 : Cost 250.27011108
-Ep. 1 : Up. 40 : Sen. 640 : Cost 249.66784668
-Ep. 1 : Up. 44 : Sen. 896 : Cost 254.14111328
-Ep. 2 : Up. 48 : Sen. 128 : Cost 237.40222168
-Ep. 2 : Up. 52 : Sen. 384 : Cost 255.97949219
-Ep. 2 : Up. 56 : Sen. 640 : Cost 252.84860229
-Ep. 2 : Up. 60 : Sen. 896 : Cost 244.12496948
+Ep. 1 : Up. 4 : Sen. 256 : Cost 238.82701111
+Ep. 1 : Up. 8 : Sen. 512 : Cost 245.15895081
+Ep. 1 : Up. 12 : Sen. 768 : Cost 227.24861145
+Ep. 1 : Up. 16 : Sen. 1,024 : Cost 246.25918579
+Ep. 1 : Up. 20 : Sen. 1,280 : Cost 243.25015259
+Ep. 1 : Up. 24 : Sen. 1,536 : Cost 230.48197937
+Ep. 1 : Up. 28 : Sen. 1,792 : Cost 219.80914307
+Ep. 1 : Up. 32 : Sen. 128 : Cost 236.07504272
+Ep. 1 : Up. 36 : Sen. 384 : Cost 225.42373657
+Ep. 1 : Up. 40 : Sen. 640 : Cost 218.38552856
+Ep. 1 : Up. 44 : Sen. 896 : Cost 217.53744507
+Ep. 2 : Up. 48 : Sen. 128 : Cost 201.09486389
+Ep. 2 : Up. 52 : Sen. 384 : Cost 215.17204285
+Ep. 2 : Up. 56 : Sen. 640 : Cost 211.10237122
+Ep. 2 : Up. 60 : Sen. 896 : Cost 200.33345032
diff --git a/tests/training/restoring/corpus/test_finetune.sh b/tests/training/restoring/corpus/test_finetune.sh
index 1e99645..78099d4 100644
--- a/tests/training/restoring/corpus/test_finetune.sh
+++ b/tests/training/restoring/corpus/test_finetune.sh
@@ -17,8 +17,7 @@ test -e vocab.de.yml
test -e vocab.en.yml
extra_opts="--seed 2222 --maxi-batch 1 --maxi-batch-sort none --mini-batch 64 --optimizer sgd --dim-emb 128 --dim-rnn 256 --disp-freq 4"
-# Added because default options has changes
-extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false"
+extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false --clip-norm 0"
# Train a model on a training corpus
diff --git a/tests/training/restoring/exp-smoothing/test_expsmooth.sh b/tests/training/restoring/exp-smoothing/test_expsmooth.sh
index f048018..e7c7b6d 100644
--- a/tests/training/restoring/exp-smoothing/test_expsmooth.sh
+++ b/tests/training/restoring/exp-smoothing/test_expsmooth.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Compare costs from a restarted training with exp-smoothing against a single pass
+# AUTHOR: snukky
+# TAGS: exp-smooth clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -8,7 +14,7 @@ rm -rf expsmooth expsmooth_*.log
mkdir -p expsmooth
-opts="--no-shuffle --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none"
+opts="--no-shuffle --clip-norm 1 --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none"
opts="$opts --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.5"
opts="$opts --valid-sets valid.bpe.en valid.bpe.de --valid-metrics cross-entropy --valid-mini-batch 32"
# Added because default options has changes
diff --git a/tests/training/restoring/exp-smoothing/test_expsmooth_s2s.sh b/tests/training/restoring/exp-smoothing/test_expsmooth_s2s.sh
index 1080546..831ebec 100644
--- a/tests/training/restoring/exp-smoothing/test_expsmooth_s2s.sh
+++ b/tests/training/restoring/exp-smoothing/test_expsmooth_s2s.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Compare costs from a restarted training with exp-smoothing against a single pass
+# AUTHOR: snukky
+# TAGS: exp-smooth clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -8,7 +14,7 @@ rm -rf expsmooth_s2s expsmooth_s2s_*.log
mkdir -p expsmooth_s2s
-opts="--no-shuffle --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none"
+opts="--no-shuffle --clip-norm 1 --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none --sync-sgd"
opts="$opts --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.5"
opts="$opts --valid-sets valid.bpe.en valid.bpe.de --valid-metrics cross-entropy --valid-mini-batch 32 --type s2s"
# Added because default options has changes
diff --git a/tests/training/restoring/exp-smoothing/test_expsmooth_sync.sh b/tests/training/restoring/exp-smoothing/test_expsmooth_sync.sh
index 3e26acf..eafc1cc 100644
--- a/tests/training/restoring/exp-smoothing/test_expsmooth_sync.sh
+++ b/tests/training/restoring/exp-smoothing/test_expsmooth_sync.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Compare costs from a restarted training with exp-smoothing against a single pass on 2 GPUs
+# AUTHOR: snukky
+# TAGS: exp-smooth clip-norm multigpu
+#####################################################################
+
# Exit on error
set -e
diff --git a/tests/training/restoring/multi-gpu/test_adam_sync.sh b/tests/training/restoring/multi-gpu/test_adam_sync.sh
index 84079d3..daf2524 100644
--- a/tests/training/restoring/multi-gpu/test_adam_sync.sh
+++ b/tests/training/restoring/multi-gpu/test_adam_sync.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Training with Adam on 2 GPUs (sync-sgd)
+# AUTHOR: snukky
+# TAGS: optimizer adam multigpu
+#####################################################################
+
# Exit on error
set -e
diff --git a/tests/training/restoring/multi-gpu/test_async.sh b/tests/training/restoring/multi-gpu/test_async.sh
index 8e22f30..ba13ec6 100644
--- a/tests/training/restoring/multi-gpu/test_async.sh
+++ b/tests/training/restoring/multi-gpu/test_async.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Training with SGD on 2 GPUs (async)
+# AUTHOR: snukky
+# TAGS: optimizer multigpu clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -12,7 +18,7 @@ fi
rm -rf async async_*.log async.*out async.*expected
mkdir -p async
-opts="--no-shuffle --seed 777 --mini-batch 1 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.1 --devices 0 1"
+opts="--no-shuffle --clip-norm 0 --seed 777 --mini-batch 1 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.005 --devices 0 1"
# Added because default options has changes
opts="$opts --cost-type ce-mean --disp-label-counts false"
diff --git a/tests/training/restoring/multi-gpu/test_sync.sh b/tests/training/restoring/multi-gpu/test_sync.sh
index ff10d23..57fc76d 100644
--- a/tests/training/restoring/multi-gpu/test_sync.sh
+++ b/tests/training/restoring/multi-gpu/test_sync.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Training with SGD on 2 GPUs (sync-sgd)
+# AUTHOR: snukky
+# TAGS: optimizer multigpu clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -12,7 +18,7 @@ fi
rm -rf sync sync_*.log
mkdir -p sync
-opts="--no-shuffle --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.1 --devices 0 1 --sync-sgd"
+opts="--no-shuffle --clip-norm 0 --seed 777 --mini-batch 4 --maxi-batch 1 --maxi-batch-sort none --dim-rnn 64 --dim-emb 32 --optimizer sgd --learn-rate 0.01 --devices 0 1 --sync-sgd"
# Added because default options has changes
opts="$opts --cost-type ce-mean --disp-label-counts false"
@@ -49,7 +55,7 @@ test -e sync_2.log
cat sync_2.log | $MRT_TOOLS/strip-timestamps.sh | grep "Ep\. " | sed 's/ : Time.*//' >> sync.out
-$MRT_TOOLS/diff-nums.py -p 0.3 sync.out sync.expected -o sync.diff
+$MRT_TOOLS/diff-nums.py -p 0.1 sync.out sync.expected -o sync.diff
# Exit with success code
exit 0
diff --git a/tests/training/restoring/optimizer/adagrad.costs.expected b/tests/training/restoring/optimizer/adagrad.costs.expected
index 7b4f7e1..533d10c 100644
--- a/tests/training/restoring/optimizer/adagrad.costs.expected
+++ b/tests/training/restoring/optimizer/adagrad.costs.expected
@@ -1,10 +1,10 @@
-238.52751160
-245.27938843
-239.83557129
-232.83401489
-238.87149048
-253.74154663
-255.69897461
-243.06086731
-244.85818481
-235.55209351
+238.52250671
+245.26730347
+239.82205200
+232.81472778
+238.84121704
+253.69161987
+255.61422729
+242.94416809
+244.69099426
+235.35519409
diff --git a/tests/training/restoring/optimizer/adagrad.gt.expected b/tests/training/restoring/optimizer/adagrad.gt.expected
index 30a932c..a6c90a5 100644
--- a/tests/training/restoring/optimizer/adagrad.gt.expected
+++ b/tests/training/restoring/optimizer/adagrad.gt.expected
@@ -1,2 +1,2 @@
-[[ 4.38133684e-05 1.40065049e-06 3.63037943e-05 ..., 1.23982169e-02
- 3.66997421e-02 4.11312692e-02]]
+[[8.0574207e+00 1.5418689e-01 4.3262744e+00 ... 4.0905408e+03
+ 1.1550205e+04 1.3359570e+04]]
diff --git a/tests/training/restoring/optimizer/adam.costs.expected b/tests/training/restoring/optimizer/adam.costs.expected
index a6b5f9a..565b1ea 100644
--- a/tests/training/restoring/optimizer/adam.costs.expected
+++ b/tests/training/restoring/optimizer/adam.costs.expected
@@ -1,10 +1,10 @@
-238.40983582
-244.61091614
-238.22981262
-229.24475098
-230.14970398
-234.50399780
-228.12467957
-210.38107300
-206.17379761
-196.83959961
+238.40853882
+244.59863281
+238.15905762
+228.80813599
+227.96830750
+231.00505066
+225.24502563
+207.64001465
+203.54002380
+194.72296143
diff --git a/tests/training/restoring/optimizer/adam.mt.expected b/tests/training/restoring/optimizer/adam.mt.expected
index 3a2de9b..67c9756 100644
--- a/tests/training/restoring/optimizer/adam.mt.expected
+++ b/tests/training/restoring/optimizer/adam.mt.expected
@@ -1,2 +1,2 @@
-[[ 8.0254285e-06 -5.1497386e-07 3.8298724e-05 ... 1.5516396e-03
- 1.5692838e-03 2.0285486e-03]]
+[[-0.00667148 0.00525377 0.0564099 ... 1.5877182 1.6200635
+ 2.2804906 ]]
diff --git a/tests/training/restoring/optimizer/adam.vt.expected b/tests/training/restoring/optimizer/adam.vt.expected
index 6fbbfe5..c54930d 100644
--- a/tests/training/restoring/optimizer/adam.vt.expected
+++ b/tests/training/restoring/optimizer/adam.vt.expected
@@ -1,2 +1,2 @@
-[[ 9.29374124e-08 4.41528991e-09 3.45339437e-08 ..., 2.22943163e-05
- 2.69053471e-05 5.34869505e-05]]
+[[8.1617765e-02 3.0912522e-03 1.2053944e-02 ... 4.4662014e+01
+ 3.7031158e+01 7.2262390e+01]]
diff --git a/tests/training/restoring/optimizer/adam_load.expected b/tests/training/restoring/optimizer/adam_load.expected
index bf5fef4..5dd5164 100644
--- a/tests/training/restoring/optimizer/adam_load.expected
+++ b/tests/training/restoring/optimizer/adam_load.expected
@@ -1,6 +1,6 @@
Ep. 1 : Up. 1 : Sen. 2 : Cost 223.64685059
-Ep. 1 : Up. 2 : Sen. 4 : Cost 258.80792236
-Ep. 1 : Up. 3 : Sen. 6 : Cost 255.67260742
-Ep. 1 : Up. 4 : Sen. 8 : Cost 346.67749023
-Ep. 1 : Up. 5 : Sen. 10 : Cost 278.72695923
-Ep. 1 : Up. 6 : Sen. 12 : Cost 178.23016357
+Ep. 1 : Up. 2 : Sen. 4 : Cost 258.78131104
+Ep. 1 : Up. 3 : Sen. 6 : Cost 256.86120605
+Ep. 1 : Up. 4 : Sen. 8 : Cost 365.52239990
+Ep. 1 : Up. 5 : Sen. 10 : Cost 281.86376953
+Ep. 1 : Up. 6 : Sen. 12 : Cost 203.98873901
diff --git a/tests/training/restoring/optimizer/adam_sync.costs.expected b/tests/training/restoring/optimizer/adam_sync.costs.expected
index d390e92..29a7e61 100644
--- a/tests/training/restoring/optimizer/adam_sync.costs.expected
+++ b/tests/training/restoring/optimizer/adam_sync.costs.expected
@@ -1,10 +1,10 @@
-7245.93652344
-7990.90771484
-7741.82177734
-7778.60302734
-7445.29589844
-7015.16699219
-6661.45312500
-6346.10888672
-6402.09814453
-6369.64550781
+7245.93505859
+7990.90478516
+7741.81542969
+7778.61621094
+7445.38574219
+7015.21337891
+6661.38769531
+6346.22802734
+6402.10009766
+6369.72216797
diff --git a/tests/training/restoring/optimizer/test_adagrad_params.sh b/tests/training/restoring/optimizer/test_adagrad_params.sh
index 8fca356..1372071 100644
--- a/tests/training/restoring/optimizer/test_adagrad_params.sh
+++ b/tests/training/restoring/optimizer/test_adagrad_params.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Training with Adagrad optimizer
+# AUTHOR: snukky
+# TAGS: optimizer adagrad
+#####################################################################
+
# Exit on error
set -e
@@ -8,7 +14,7 @@ rm -rf adagrad adagrad*.log
mkdir -p adagrad
$MRT_MARIAN/marian \
- --no-shuffle --seed 7777 --maxi-batch 1 --maxi-batch-sort none --dim-emb 128 --dim-rnn 256 \
+ --no-shuffle --clip-norm 0 --seed 7777 --maxi-batch 1 --maxi-batch-sort none --dim-emb 128 --dim-rnn 256 \
-m adagrad/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 10 --after-batches 100 --save-freq 60 --optimizer adagrad --cost-type ce-mean \
--log adagrad.log
@@ -24,7 +30,7 @@ python3 $MRT_MARIAN/../scripts/contrib/model_info.py -m adagrad/model.npz.optimi
$MRT_TOOLS/diff.sh adagrad.keys.out adagrad.keys.expected > adagrad.keys.diff
python3 $MRT_MARIAN/../scripts/contrib/model_info.py -m adagrad/model.npz.optimizer.npz -k "adagrad_gt" > adagrad.gt.out
-$MRT_TOOLS/diff-nums.py --numpy -p 0.001 adagrad.gt.out adagrad.gt.expected -o adagrad.gt.diff
+$MRT_TOOLS/diff-nums.py --numpy -p 0.009 adagrad.gt.out adagrad.gt.expected -o adagrad.gt.diff
# Exit with success code
exit 0
diff --git a/tests/training/restoring/optimizer/test_adam_params.sh b/tests/training/restoring/optimizer/test_adam_params.sh
index e0bd76f..4310850 100644
--- a/tests/training/restoring/optimizer/test_adam_params.sh
+++ b/tests/training/restoring/optimizer/test_adam_params.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Training with Adam
+# AUTHOR: snukky
+# TAGS: optimizer adam
+#####################################################################
+
# Exit on error
set -e
@@ -8,7 +14,7 @@ rm -rf adam adam.log
mkdir -p adam
$MRT_MARIAN/marian \
- --no-shuffle --seed 7777 --maxi-batch 1 --maxi-batch-sort none --dim-emb 128 --dim-rnn 256 \
+ --no-shuffle --clip-norm 0 --seed 7777 --maxi-batch 1 --maxi-batch-sort none --dim-emb 128 --dim-rnn 256 \
-m adam/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 10 --after-batches 100 --save-freq 60 --cost-type ce-mean \
--log adam.log
diff --git a/tests/training/restoring/optimizer/test_adam_params_async.sh b/tests/training/restoring/optimizer/test_adam_params_async.sh
index 2b2c869..6dee216 100644
--- a/tests/training/restoring/optimizer/test_adam_params_async.sh
+++ b/tests/training/restoring/optimizer/test_adam_params_async.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Training with Adam on 2 GPUs with asynchronous SGD
+# AUTHOR: snukky
+# TAGS: optimizer adam multigpu async clip-norm
+#####################################################################
+
# Exit on error
set -e
@@ -13,7 +19,7 @@ if (( $MRT_NUM_DEVICES < 2 )); then
fi
$MRT_MARIAN/marian \
- --no-shuffle --seed 7777 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --dim-emb 128 --dim-rnn 256 \
+ --no-shuffle --clip-norm 1 --seed 7777 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --dim-emb 128 --dim-rnn 256 \
-m adam_async/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 10 --after-batches 100 --save-freq 60 --cost-type ce-sum --disp-label-counts false \
--log adam_async.log --devices 0 1
@@ -33,8 +39,8 @@ $MRT_TOOLS/diff.sh adam_async.keys.out adam.keys.expected > adam_async.keys.diff
python3 $MRT_MARIAN/../scripts/contrib/model_info.py -m adam_async/model.npz.optimizer.npz -k "adam_mt" > adam_async.mt.out
python3 $MRT_MARIAN/../scripts/contrib/model_info.py -m adam_async/model.npz.optimizer.npz -k "adam_vt" > adam_async.vt.out
-$MRT_TOOLS/diff-nums.py --numpy -a -p 0.02 adam_async.mt.out adam_async.mt.expected -o adam_async.mt.diff
-$MRT_TOOLS/diff-nums.py --numpy -p 0.001 adam_async.vt.out adam_async.vt.expected -o adam_async.vt.diff
+$MRT_TOOLS/diff-nums.py --numpy -a -p 0.03 adam_async.mt.out adam_async.mt.expected -o adam_async.mt.diff
+$MRT_TOOLS/diff-nums.py --numpy -p 0.03 adam_async.vt.out adam_async.vt.expected -o adam_async.vt.diff
# Exit with success code
exit 0
diff --git a/tests/training/restoring/optimizer/test_adam_params_sync.sh b/tests/training/restoring/optimizer/test_adam_params_sync.sh
index 1e2481d..43dcda1 100644
--- a/tests/training/restoring/optimizer/test_adam_params_sync.sh
+++ b/tests/training/restoring/optimizer/test_adam_params_sync.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Training with Adam on 2 GPUs with sync-sgd
+# AUTHOR: snukky
+# TAGS: optimizer adam multigpu
+#####################################################################
+
# Exit on error
set -e
@@ -13,10 +19,10 @@ if (( $MRT_NUM_DEVICES < 2 )); then
fi
$MRT_MARIAN/marian \
- --no-shuffle --seed 7777 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --dim-emb 128 --dim-rnn 256 \
+ --no-shuffle --clip-norm 0 --seed 7777 --maxi-batch 1 --maxi-batch-sort none --mini-batch 32 --dim-emb 128 --dim-rnn 256 \
-m adam_sync/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 10 --after-batches 100 --save-freq 60 \
- --log adam_sync.log --devices 0 1 --sync-sgd --cost-type ce-sum --disp-label-counts false --clip-norm 0
+ --log adam_sync.log --devices 0 1 --sync-sgd --cost-type ce-sum --disp-label-counts false
test -e adam_sync/model.npz
test -e adam_sync/model.npz.optimizer.npz
diff --git a/tests/training/restoring/optimizer/test_loading_adam_params.sh b/tests/training/restoring/optimizer/test_loading_adam_params.sh
index 95a48c8..2ded056 100644
--- a/tests/training/restoring/optimizer/test_loading_adam_params.sh
+++ b/tests/training/restoring/optimizer/test_loading_adam_params.sh
@@ -1,5 +1,11 @@
#!/bin/bash -x
+#####################################################################
+# SUMMARY: Loading Adam parameters after restarting training
+# AUTHOR: snukky
+# TAGS: optimizer adam
+#####################################################################
+
# Exit on error
set -e
@@ -8,8 +14,7 @@ rm -rf adam_load adam_load_?.log
mkdir -p adam_load
extra_opts="--no-shuffle --seed 7777 --maxi-batch 1 --maxi-batch-sort none --mini-batch 2 --dim-rnn 64 --dim-emb 32"
-# Added because default options has changes
-extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false"
+extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false --clip-norm 0"
$MRT_MARIAN/marian \
-m adam_load/model.npz -t $MRT_DATA/train.max50.{en,de} -v vocab.en.yml vocab.de.yml \
diff --git a/tests/training/restoring/validation/test_adding_validator_after_restart.sh b/tests/training/restoring/validation/test_adding_validator_after_restart.sh
index ff95d90..6a6f2f3 100644
--- a/tests/training/restoring/validation/test_adding_validator_after_restart.sh
+++ b/tests/training/restoring/validation/test_adding_validator_after_restart.sh
@@ -9,7 +9,7 @@ mkdir -p valid_add
extra_opts="--no-shuffle --seed 2222 --maxi-batch 1 --maxi-batch-sort none --optimizer sgd"
extra_opts="$extra_opts --dim-emb 128 --dim-rnn 256 --mini-batch 16"
-extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false"
+extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false --clip-norm 0"
#$MRT_MARIAN/marian $extra_opts \
#-m valid_add/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
diff --git a/tests/training/restoring/validation/test_restoring_newbest_validators.sh b/tests/training/restoring/validation/test_restoring_newbest_validators.sh
index 444599b..fa8b37a 100644
--- a/tests/training/restoring/validation/test_restoring_newbest_validators.sh
+++ b/tests/training/restoring/validation/test_restoring_newbest_validators.sh
@@ -14,7 +14,7 @@ head -n 8 $MRT_DATA/europarl.de-en/toy.bpe.de > valid.mini.bpe.de
# Uncomment to re-generate the expected output
#$MRT_MARIAN/marian \
- #--type s2s --no-shuffle --seed 2222 --maxi-batch 1 --maxi-batch-sort none --quiet-translation \
+ #--type s2s --no-shuffle --seed 2222 --maxi-batch 1 --maxi-batch-sort none --quiet-translation --clip-norm 0 \
#--dim-emb 64 --dim-rnn 128 --mini-batch 16 --optimizer sgd --cost-type ce-mean \
#-m valid_newbest/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
#--disp-freq 5 --valid-freq 10 --after-batches 100 \
@@ -28,7 +28,7 @@ head -n 8 $MRT_DATA/europarl.de-en/toy.bpe.de > valid.mini.bpe.de
$MRT_MARIAN/marian \
- --type s2s --no-shuffle --seed 2222 --maxi-batch 1 --maxi-batch-sort none --quiet-translation \
+ --type s2s --no-shuffle --seed 2222 --maxi-batch 1 --maxi-batch-sort none --quiet-translation --clip-norm 0 \
--dim-emb 64 --dim-rnn 128 --mini-batch 16 --optimizer sgd --cost-type ce-mean \
-m valid_newbest/model.npz -t $MRT_DATA/europarl.de-en/toy.bpe.{en,de} -v vocab.en.yml vocab.de.yml \
--disp-freq 5 --valid-freq 10 --after-batches 50 \
diff --git a/tests/training/restoring/validation/test_restoring_validation_lower_is_better.sh b/tests/training/restoring/validation/test_restoring_validation_lower_is_better.sh
index a29e534..f700e3c 100644
--- a/tests/training/restoring/validation/test_restoring_validation_lower_is_better.sh
+++ b/tests/training/restoring/validation/test_restoring_validation_lower_is_better.sh
@@ -9,7 +9,7 @@ mkdir -p valid_lowisbet
extra_opts="--no-shuffle --seed 1111 --maxi-batch 1 --maxi-batch-sort none"
extra_opts="$extra_opts --dim-emb 64 --dim-rnn 128 --mini-batch 32"
-extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false"
+extra_opts="$extra_opts --cost-type ce-mean --disp-label-counts false --clip-norm 0"
# Files for the validation sets are swapped intentionally
diff --git a/tests/training/restoring/validation/test_valid_reset_stalled.sh b/tests/training/restoring/validation/test_valid_reset_stalled.sh
index e967a1f..c2c7d4b 100644
--- a/tests/training/restoring/validation/test_valid_reset_stalled.sh
+++ b/tests/training/restoring/validation/test_valid_reset_stalled.sh
@@ -27,7 +27,7 @@ $MRT_MARIAN/marian $extra_opts \
--disp-freq 10 --valid-freq 20 --after-batches 140 --early-stopping 5 \
--valid-metrics translation valid-script cross-entropy --valid-script-path ./valid_script_ab.sh \
--valid-sets valid.mini.bpe.{de,en} \
- --overwrite --keep-best \
+ --overwrite --keep-best --clip-norm 0 \
--log valid_reset_stalled_1.log
test -e valid_reset_stalled/model.npz
@@ -43,7 +43,7 @@ $MRT_MARIAN/marian $extra_opts \
--disp-freq 10 --valid-freq 20 --after-batches 200 --early-stopping 5 --valid-reset-stalled \
--valid-metrics translation valid-script cross-entropy --valid-script-path ./valid_script_ab.sh \
--valid-sets valid.mini.bpe.{de,en} \
- --overwrite --keep-best \
+ --overwrite --keep-best --clip-norm 0 \
--log valid_reset_stalled_2.log
test -e valid_reset_stalled/model.npz
diff --git a/tests/training/restoring/validation/update.sh b/tests/training/restoring/validation/update.sh
new file mode 100755
index 0000000..809fbaa
--- /dev/null
+++ b/tests/training/restoring/validation/update.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env sh
+cp valid_reset_stalled.out valid_reset_stalled.expected
+cp valid_add.out valid_add.expected
+cp valid_newbest.out valid_newbest.expected
+cp valid_stalled.out valid_stalled.expected
+cp valid_lowisbet.out valid_lowisbet.expected
diff --git a/tests/training/restoring/validation/valid_add.expected b/tests/training/restoring/validation/valid_add.expected
index fb2d8a5..893a57d 100644
--- a/tests/training/restoring/validation/valid_add.expected
+++ b/tests/training/restoring/validation/valid_add.expected
@@ -1,15 +1,15 @@
-[valid] Ep. 1 : Up. 20 : cross-entropy : 296.282 : new best
-[valid] Ep. 1 : Up. 40 : cross-entropy : 296.269 : new best
-[valid] Ep. 1 : Up. 60 : cross-entropy : 296.255 : new best
-[valid] Ep. 1 : Up. 80 : cross-entropy : 296.242 : new best
-[valid] Ep. 1 : Up. 100 : cross-entropy : 296.229 : new best
-[valid] Ep. 1 : Up. 120 : cross-entropy : 296.216 : new best
-[valid] Ep. 1 : Up. 120 : ce-mean-words : 10.1618 : new best
-[valid] Ep. 1 : Up. 140 : cross-entropy : 296.202 : new best
-[valid] Ep. 1 : Up. 140 : ce-mean-words : 10.1613 : new best
-[valid] Ep. 1 : Up. 160 : cross-entropy : 296.189 : new best
-[valid] Ep. 1 : Up. 160 : ce-mean-words : 10.1609 : new best
-[valid] Ep. 1 : Up. 180 : cross-entropy : 296.176 : new best
-[valid] Ep. 1 : Up. 180 : ce-mean-words : 10.1604 : new best
-[valid] Ep. 1 : Up. 200 : cross-entropy : 296.162 : new best
-[valid] Ep. 1 : Up. 200 : ce-mean-words : 10.1599 : new best
+[valid] Ep. 1 : Up. 20 : cross-entropy : 294.63 : new best
+[valid] Ep. 1 : Up. 40 : cross-entropy : 292.643 : new best
+[valid] Ep. 1 : Up. 60 : cross-entropy : 290.224 : new best
+[valid] Ep. 1 : Up. 80 : cross-entropy : 286.857 : new best
+[valid] Ep. 1 : Up. 100 : cross-entropy : 282.156 : new best
+[valid] Ep. 1 : Up. 120 : cross-entropy : 274.584 : new best
+[valid] Ep. 1 : Up. 120 : ce-mean-words : 9.41969 : new best
+[valid] Ep. 1 : Up. 140 : cross-entropy : 264.996 : new best
+[valid] Ep. 1 : Up. 140 : ce-mean-words : 9.09079 : new best
+[valid] Ep. 1 : Up. 160 : cross-entropy : 258.914 : new best
+[valid] Ep. 1 : Up. 160 : ce-mean-words : 8.88213 : new best
+[valid] Ep. 1 : Up. 180 : cross-entropy : 255.943 : new best
+[valid] Ep. 1 : Up. 180 : ce-mean-words : 8.78019 : new best
+[valid] Ep. 1 : Up. 200 : cross-entropy : 253.146 : new best
+[valid] Ep. 1 : Up. 200 : ce-mean-words : 8.68424 : new best
diff --git a/tests/training/restoring/validation/valid_lowisbet.expected b/tests/training/restoring/validation/valid_lowisbet.expected
index daa223a..8a2ca20 100644
--- a/tests/training/restoring/validation/valid_lowisbet.expected
+++ b/tests/training/restoring/validation/valid_lowisbet.expected
@@ -1,7 +1,7 @@
-[valid] Ep. 1 : Up. 30 : cross-entropy : 299.128 : new best
-[valid] Ep. 2 : Up. 60 : cross-entropy : 298.528 : new best
-[valid] Ep. 3 : Up. 90 : cross-entropy : 296.43 : new best
-[valid] Ep. 4 : Up. 120 : cross-entropy : 297.912 : stalled 1 times (last best: 296.43)
-[valid] Ep. 5 : Up. 150 : cross-entropy : 297.791 : stalled 2 times (last best: 296.43)
-[valid] Ep. 6 : Up. 180 : cross-entropy : 297.654 : stalled 3 times (last best: 296.43)
-[valid] Ep. 7 : Up. 210 : cross-entropy : 297.794 : stalled 4 times (last best: 296.43)
+[valid] Ep. 1 : Up. 30 : cross-entropy : 299.127 : new best
+[valid] Ep. 2 : Up. 60 : cross-entropy : 298.417 : new best
+[valid] Ep. 3 : Up. 90 : cross-entropy : 296.252 : new best
+[valid] Ep. 4 : Up. 120 : cross-entropy : 298.171 : stalled 1 times (last best: 296.252)
+[valid] Ep. 5 : Up. 150 : cross-entropy : 298.057 : stalled 2 times (last best: 296.252)
+[valid] Ep. 6 : Up. 180 : cross-entropy : 298.052 : stalled 3 times (last best: 296.252)
+[valid] Ep. 7 : Up. 210 : cross-entropy : 298.133 : stalled 4 times (last best: 296.252)
diff --git a/tests/training/restoring/validation/valid_newbest.expected b/tests/training/restoring/validation/valid_newbest.expected
index d03d098..22ce219 100644
--- a/tests/training/restoring/validation/valid_newbest.expected
+++ b/tests/training/restoring/validation/valid_newbest.expected
@@ -1,20 +1,20 @@
-[valid] Ep. 1 : Up. 10 : cross-entropy : 250.506 : new best
-[valid] Ep. 1 : Up. 10 : translation : 8 : new best
-[valid] Ep. 1 : Up. 20 : cross-entropy : 250.501 : new best
-[valid] Ep. 1 : Up. 20 : translation : 8 : stalled 1 times (last best: 8)
-[valid] Ep. 1 : Up. 30 : cross-entropy : 250.497 : new best
-[valid] Ep. 1 : Up. 30 : translation : 8 : stalled 2 times (last best: 8)
-[valid] Ep. 1 : Up. 40 : cross-entropy : 250.491 : new best
-[valid] Ep. 1 : Up. 40 : translation : 9 : new best
-[valid] Ep. 1 : Up. 50 : cross-entropy : 250.486 : new best
-[valid] Ep. 1 : Up. 50 : translation : 7 : stalled 1 times (last best: 9)
-[valid] Ep. 1 : Up. 60 : cross-entropy : 250.481 : new best
-[valid] Ep. 1 : Up. 60 : translation : 3 : stalled 2 times (last best: 9)
-[valid] Ep. 1 : Up. 70 : cross-entropy : 250.476 : new best
-[valid] Ep. 1 : Up. 70 : translation : 6 : stalled 3 times (last best: 9)
-[valid] Ep. 1 : Up. 80 : cross-entropy : 250.471 : new best
-[valid] Ep. 1 : Up. 80 : translation : 0 : stalled 4 times (last best: 9)
-[valid] Ep. 1 : Up. 90 : cross-entropy : 250.465 : new best
-[valid] Ep. 1 : Up. 90 : translation : 9 : stalled 5 times (last best: 9)
-[valid] Ep. 1 : Up. 100 : cross-entropy : 250.461 : new best
-[valid] Ep. 1 : Up. 100 : translation : 6 : stalled 6 times (last best: 9)
+[valid] Ep. 1 : Up. 10 : cross-entropy : 249.884 : new best
+[valid] Ep. 1 : Up. 10 : translation : 5 : new best
+[valid] Ep. 1 : Up. 20 : cross-entropy : 249.337 : new best
+[valid] Ep. 1 : Up. 20 : translation : 4 : stalled 1 times (last best: 5)
+[valid] Ep. 1 : Up. 30 : cross-entropy : 248.804 : new best
+[valid] Ep. 1 : Up. 30 : translation : 3 : stalled 2 times (last best: 5)
+[valid] Ep. 1 : Up. 40 : cross-entropy : 248.218 : new best
+[valid] Ep. 1 : Up. 40 : translation : 6 : new best
+[valid] Ep. 1 : Up. 50 : cross-entropy : 247.56 : new best
+[valid] Ep. 1 : Up. 50 : translation : 9 : new best
+[valid] Ep. 1 : Up. 60 : cross-entropy : 246.856 : new best
+[valid] Ep. 1 : Up. 60 : translation : 6 : stalled 1 times (last best: 9)
+[valid] Ep. 1 : Up. 70 : cross-entropy : 246.112 : new best
+[valid] Ep. 1 : Up. 70 : translation : 8 : stalled 2 times (last best: 9)
+[valid] Ep. 1 : Up. 80 : cross-entropy : 245.247 : new best
+[valid] Ep. 1 : Up. 80 : translation : 8 : stalled 3 times (last best: 9)
+[valid] Ep. 1 : Up. 90 : cross-entropy : 244.336 : new best
+[valid] Ep. 1 : Up. 90 : translation : 8 : stalled 4 times (last best: 9)
+[valid] Ep. 1 : Up. 100 : cross-entropy : 243.37 : new best
+[valid] Ep. 1 : Up. 100 : translation : 8 : stalled 5 times (last best: 9)
diff --git a/tests/training/restoring/validation/valid_reset_stalled.expected b/tests/training/restoring/validation/valid_reset_stalled.expected
index eed1393..da5b590 100644
--- a/tests/training/restoring/validation/valid_reset_stalled.expected
+++ b/tests/training/restoring/validation/valid_reset_stalled.expected
@@ -1,30 +1,30 @@
[valid] Ep. 1 : Up. 20 : translation : 333.5 : new best
[valid] Ep. 1 : Up. 20 : valid-script : 222.3 : new best
-[valid] Ep. 1 : Up. 20 : cross-entropy : 250.501 : new best
+[valid] Ep. 1 : Up. 20 : cross-entropy : 249.337 : new best
[valid] Ep. 1 : Up. 40 : translation : 333.4 : stalled 1 times (last best: 333.5)
[valid] Ep. 1 : Up. 40 : valid-script : 222.2 : stalled 1 times (last best: 222.3)
-[valid] Ep. 1 : Up. 40 : cross-entropy : 250.491 : new best
+[valid] Ep. 1 : Up. 40 : cross-entropy : 248.218 : new best
[valid] Ep. 1 : Up. 60 : translation : 333.3 : stalled 2 times (last best: 333.5)
[valid] Ep. 1 : Up. 60 : valid-script : 222.1 : stalled 2 times (last best: 222.3)
-[valid] Ep. 1 : Up. 60 : cross-entropy : 250.481 : new best
+[valid] Ep. 1 : Up. 60 : cross-entropy : 246.856 : new best
[valid] Ep. 1 : Up. 80 : translation : 333.2 : stalled 3 times (last best: 333.5)
[valid] Ep. 1 : Up. 80 : valid-script : 222.6 : new best
-[valid] Ep. 1 : Up. 80 : cross-entropy : 250.471 : new best
+[valid] Ep. 1 : Up. 80 : cross-entropy : 245.247 : new best
[valid] Ep. 1 : Up. 100 : translation : 333.1 : stalled 4 times (last best: 333.5)
[valid] Ep. 1 : Up. 100 : valid-script : 222.5 : stalled 1 times (last best: 222.6)
-[valid] Ep. 1 : Up. 100 : cross-entropy : 250.461 : new best
+[valid] Ep. 1 : Up. 100 : cross-entropy : 243.37 : new best
[valid] Ep. 1 : Up. 120 : translation : 333.9 : new best
[valid] Ep. 1 : Up. 120 : valid-script : 222.4 : stalled 2 times (last best: 222.6)
-[valid] Ep. 1 : Up. 120 : cross-entropy : 250.45 : new best
+[valid] Ep. 1 : Up. 120 : cross-entropy : 240.802 : new best
[valid] Ep. 1 : Up. 140 : translation : 333.8 : stalled 1 times (last best: 333.9)
[valid] Ep. 1 : Up. 140 : valid-script : 222.3 : stalled 3 times (last best: 222.6)
-[valid] Ep. 1 : Up. 140 : cross-entropy : 250.441 : new best
+[valid] Ep. 1 : Up. 140 : cross-entropy : 237.65 : new best
[valid] Ep. 1 : Up. 160 : translation : 333.7 : stalled 1 times (last best: 333.9)
[valid] Ep. 1 : Up. 160 : valid-script : 222.2 : stalled 1 times (last best: 222.6)
-[valid] Ep. 1 : Up. 160 : cross-entropy : 250.43 : new best
+[valid] Ep. 1 : Up. 160 : cross-entropy : 233.833 : new best
[valid] Ep. 2 : Up. 180 : translation : 333.6 : stalled 2 times (last best: 333.9)
[valid] Ep. 2 : Up. 180 : valid-script : 222.1 : stalled 2 times (last best: 222.6)
-[valid] Ep. 2 : Up. 180 : cross-entropy : 250.42 : new best
+[valid] Ep. 2 : Up. 180 : cross-entropy : 230.035 : new best
[valid] Ep. 2 : Up. 200 : translation : 333.5 : stalled 3 times (last best: 333.9)
[valid] Ep. 2 : Up. 200 : valid-script : 222.6 : stalled 3 times (last best: 222.6)
-[valid] Ep. 2 : Up. 200 : cross-entropy : 250.41 : new best
+[valid] Ep. 2 : Up. 200 : cross-entropy : 227.982 : new best
diff --git a/tests/training/validation/final_batch.expected b/tests/training/validation/final_batch.expected
index eac5cdc..2e0a8b3 100644
--- a/tests/training/validation/final_batch.expected
+++ b/tests/training/validation/final_batch.expected
@@ -1,3 +1,3 @@
-[valid] Ep. 1 : Up. 60 : cross-entropy : 240.376 : new best
-[valid] Ep. 1 : Up. 120 : cross-entropy : 240.348 : new best
-[valid] Ep. 1 : Up. 150 : cross-entropy : 240.332 : new best
+[valid] Ep. 1 : Up. 60 : cross-entropy : 198.667 : new best
+[valid] Ep. 1 : Up. 120 : cross-entropy : 186.536 : new best
+[valid] Ep. 1 : Up. 150 : cross-entropy : 181.413 : new best
diff --git a/tests/training/validation/final_epoch.expected b/tests/training/validation/final_epoch.expected
index ebcb25c..f96a0df 100644
--- a/tests/training/validation/final_epoch.expected
+++ b/tests/training/validation/final_epoch.expected
@@ -1,3 +1,3 @@
-[valid] Ep. 1 : Up. 40 : cross-entropy : 240.475 : new best
-[valid] Ep. 1 : Up. 80 : cross-entropy : 240.459 : new best
-[valid] Ep. 2 : Up. 81 : cross-entropy : 240.459 : new best
+[valid] Ep. 1 : Up. 40 : cross-entropy : 234.305 : new best
+[valid] Ep. 1 : Up. 80 : cross-entropy : 227.512 : new best
+[valid] Ep. 2 : Up. 81 : cross-entropy : 227.471 : new best
diff --git a/tests/training/validation/final_match.expected b/tests/training/validation/final_match.expected
index 87c9cc2..d685062 100644
--- a/tests/training/validation/final_match.expected
+++ b/tests/training/validation/final_match.expected
@@ -1,3 +1,3 @@
-[valid] Ep. 1 : Up. 60 : cross-entropy : 240.376 : new best
-[valid] Ep. 1 : Up. 120 : cross-entropy : 240.348 : new best
-[valid] Ep. 1 : Up. 180 : cross-entropy : 240.317 : new best
+[valid] Ep. 1 : Up. 60 : cross-entropy : 198.667 : new best
+[valid] Ep. 1 : Up. 120 : cross-entropy : 186.536 : new best
+[valid] Ep. 1 : Up. 180 : cross-entropy : 179.091 : new best
diff --git a/tests/training/validation/test_final_validation_after_batches.sh b/tests/training/validation/test_final_validation_after_batches.sh
index b4ccc3c..84a3dda 100644
--- a/tests/training/validation/test_final_validation_after_batches.sh
+++ b/tests/training/validation/test_final_validation_after_batches.sh
@@ -8,10 +8,10 @@ rm -rf final_batch final_batch.log vocab.*.yml
mkdir -p final_batch
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --optimizer sgd --dim-emb 64 --dim-rnn 128 \
+ --no-shuffle --clip-norm 0 --seed 1111 --optimizer sgd --dim-emb 64 --dim-rnn 128 \
-m final_batch/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} \
-v vocab.en.yml vocab.de.yml --dim-vocabs 50000 50000 \
- --disp-freq 30 --valid-freq 60 --after-batches 150 \
+ --disp-freq 30 --valid-freq 60 --after 150u \
--valid-metrics cross-entropy --valid-sets valid.bpe.{en,de} \
--valid-log final_batch.log
diff --git a/tests/training/validation/test_final_validation_after_batches_match.sh b/tests/training/validation/test_final_validation_after_batches_match.sh
index fc676cf..dc64991 100644
--- a/tests/training/validation/test_final_validation_after_batches_match.sh
+++ b/tests/training/validation/test_final_validation_after_batches_match.sh
@@ -8,10 +8,10 @@ rm -rf final_match final_match.log vocab.*.yml
mkdir -p final_match
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --optimizer sgd --dim-emb 64 --dim-rnn 128 \
+ --no-shuffle --clip-norm 0 --seed 1111 --optimizer sgd --dim-emb 64 --dim-rnn 128 \
-m final_match/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} \
-v vocab.en.yml vocab.de.yml --dim-vocabs 50000 50000 \
- --disp-freq 30 --valid-freq 60 --after-batches 180 \
+ --disp-freq 30 --valid-freq 60 --after 180u \
--valid-metrics cross-entropy --valid-sets valid.bpe.{en,de} \
--valid-log final_match.log
diff --git a/tests/training/validation/test_final_validation_after_epochs.sh b/tests/training/validation/test_final_validation_after_epochs.sh
index e8e606f..e263259 100644
--- a/tests/training/validation/test_final_validation_after_epochs.sh
+++ b/tests/training/validation/test_final_validation_after_epochs.sh
@@ -11,10 +11,10 @@ test -e train.bpe.en || head -n 3000 $MRT_DATA/europarl.de-en/corpus.bpe.en > tr
test -e train.bpe.de || head -n 3000 $MRT_DATA/europarl.de-en/corpus.bpe.de > train.bpe.de
$MRT_MARIAN/marian \
- --no-shuffle --seed 1111 --optimizer sgd --dim-emb 64 --dim-rnn 128 \
+ --no-shuffle --clip-norm 0 --seed 1111 --optimizer sgd --dim-emb 64 --dim-rnn 128 \
-m final_epoch/model.npz -t train.bpe.{en,de} \
-v vocab.small.en.yml vocab.small.de.yml --dim-vocabs 50000 50000 \
- --mini-batch 32 --disp-freq 20 --valid-freq 40 --after-epochs 1 \
+ --mini-batch 32 --disp-freq 20 --valid-freq 40 --after 1e \
--valid-metrics cross-entropy --valid-sets valid.bpe.{en,de} \
--valid-log final_epoch.log
diff --git a/tests/training/validation/test_translation_metric_with_empty_lines.sh b/tests/training/validation/test_translation_metric_with_empty_lines.sh
index 95333c2..5ff7115 100644
--- a/tests/training/validation/test_translation_metric_with_empty_lines.sh
+++ b/tests/training/validation/test_translation_metric_with_empty_lines.sh
@@ -28,9 +28,9 @@ test -e trans_empty_lines/train.en || cat $MRT_DATA/train.max50.en | sed 's/@@ /
# Train
$MRT_MARIAN/marian \
- --seed 2222 --no-shuffle --mini-batch 32 --maxi-batch 1 --optimizer sgd \
+ --seed 2222 --no-shuffle --clip-norm 0 --mini-batch 32 --maxi-batch 1 --optimizer sgd -l 0.00001 \
-m trans_empty_lines/model.npz -t trans_empty_lines/train.{de,en} -v trans_empty_lines/vocab.{spm,spm} \
- --disp-freq 20 --valid-freq 60 --after-batches 60 \
+ --disp-freq 10 --valid-freq 30 --after 30u \
--valid-metrics cross-entropy translation --valid-translation-output trans_empty_lines.out \
--valid-sets trans_empty_lines.de trans_empty_lines.en \
--valid-log trans_empty_lines.log
diff --git a/tests/training/validation/test_translation_script.sh b/tests/training/validation/test_translation_script.sh
index 2bf8648..11d0883 100644
--- a/tests/training/validation/test_translation_script.sh
+++ b/tests/training/validation/test_translation_script.sh
@@ -14,11 +14,11 @@ rm -rf trans trans.log trans_script.temp
mkdir -p trans
$MRT_MARIAN/marian \
- --seed 2222 --no-shuffle --dim-emb 128 --dim-rnn 256 --maxi-batch 1 --mini-batch 16 \
+ --seed 2222 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --maxi-batch 1 --mini-batch 16 \
-m trans/model.npz \
-t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} -v vocab.50k.en.yml vocab.50k.de.yml \
--dim-vocabs 50000 50000 \
- --disp-freq 30 --valid-freq 60 --after-batches 150 \
+ --disp-freq 30 --valid-freq 60 --after 150u \
--valid-metrics cross-entropy translation --valid-script-path ./trans_script.sh \
--valid-sets trans.bpe.en trans.bpe.de \
--valid-log trans.log
diff --git a/tests/training/validation/test_valid_script.sh b/tests/training/validation/test_valid_script.sh
index 53cfba5..7cd60e9 100644
--- a/tests/training/validation/test_valid_script.sh
+++ b/tests/training/validation/test_valid_script.sh
@@ -14,7 +14,7 @@ rm -rf valid valid.log valid_script.temp
mkdir -p valid
$MRT_MARIAN/marian \
- --seed 2222 --no-shuffle --dim-emb 128 --dim-rnn 256 --maxi-batch 1 --mini-batch 16 \
+ --seed 2222 --no-shuffle --clip-norm 0 --dim-emb 128 --dim-rnn 256 --maxi-batch 1 --mini-batch 16 \
-m valid/model.npz -t $MRT_DATA/europarl.de-en/corpus.bpe.{en,de} \
-v vocab.50k.en.yml vocab.50k.de.yml --dim-vocabs 50000 50000 \
--disp-freq 5 --valid-freq 15 --after-batches 75 \
diff --git a/tests/training/validation/trans.expected b/tests/training/validation/trans.expected
index 68987df..875a980 100644
--- a/tests/training/validation/trans.expected
+++ b/tests/training/validation/trans.expected
@@ -1,4 +1,4 @@
-[valid] Ep. 1 : Up. 60 : cross-entropy : 218.835 : new best
+[valid] Ep. 1 : Up. 60 : cross-entropy : 215.525 : new best
[valid] Ep. 1 : Up. 60 : translation : 1 : new best
-[valid] Ep. 1 : Up. 120 : cross-entropy : 187.91 : new best
+[valid] Ep. 1 : Up. 120 : cross-entropy : 186.551 : new best
[valid] Ep. 1 : Up. 120 : translation : 2 : new best
diff --git a/tests/training/validation/trans_empty_lines.expected b/tests/training/validation/trans_empty_lines.expected
index 45ae20b..46a9bae 100644
--- a/tests/training/validation/trans_empty_lines.expected
+++ b/tests/training/validation/trans_empty_lines.expected
@@ -1,9 +1,9 @@
-That concludes the agenda, ladies and gentlemen.
-The Minutes of this sitting will be submitted to plenary at the beginning of the next part-session.
-Mr Manders has the floor for a point of order.
-Mr President, I would like to take this opportunity to wish you, the Bureau and all Members well for the new year.
+, ladies and gentlemen , Parliament has completed the agenda .
+The Minutes of this sitting will be submitted to plenary at the beginning of the next part-session .
+Mr Manders has the floor for a point of order .
+Mr President , I would like to take this opportunity to wish you , the Bureau and all Members well for the new year .
-I would even like to allow me to name the Commission and the Council, even if they are not present.
+I would even allow me to name the Commission and the Council , even if they are not present .
I declare the session of the European Parliament adjourned.
diff --git a/tests/training/validation/valid.expected b/tests/training/validation/valid.expected
index 66764ec..48227e6 100644
--- a/tests/training/validation/valid.expected
+++ b/tests/training/validation/valid.expected
@@ -1,10 +1,10 @@
[valid] Ep. 1 : Up. 15 : cross-entropy : 307.647 : new best
[valid] Ep. 1 : Up. 15 : valid-script : 1 : new best
-[valid] Ep. 1 : Up. 30 : cross-entropy : 305.551 : new best
+[valid] Ep. 1 : Up. 30 : cross-entropy : 305.336 : new best
[valid] Ep. 1 : Up. 30 : valid-script : 2 : new best
-[valid] Ep. 1 : Up. 45 : cross-entropy : 299.442 : new best
+[valid] Ep. 1 : Up. 45 : cross-entropy : 297.463 : new best
[valid] Ep. 1 : Up. 45 : valid-script : 3 : new best
-[valid] Ep. 1 : Up. 60 : cross-entropy : 281.549 : new best
+[valid] Ep. 1 : Up. 60 : cross-entropy : 277.038 : new best
[valid] Ep. 1 : Up. 60 : valid-script : 4 : new best
-[valid] Ep. 1 : Up. 75 : cross-entropy : 268.403 : new best
+[valid] Ep. 1 : Up. 75 : cross-entropy : 265.286 : new best
[valid] Ep. 1 : Up. 75 : valid-script : 5 : new best
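
The hunks above all follow one pattern: add `--clip-norm 0` (or, for the async test, `--clip-norm 1`) to each `marian` invocation, then regenerate the `.expected` files from the fresh `.out` runs, as the new `update.sh` does with plain `cp`. A minimal, hedged sketch of that refresh loop (file names hypothetical; assumes the tests have already been run so each `*.out` sits next to its `*.expected`):

```shell
#!/usr/bin/env bash
# Sketch: refresh expected outputs after a default-option change.
# Only copies files whose contents actually drifted, so untouched
# expectations keep their git timestamps.
for out in valid_*.out; do
    exp="${out%.out}.expected"   # valid_add.out -> valid_add.expected
    if ! cmp -s "$out" "$exp"; then
        cp "$out" "$exp"
    fi
done
```

Compared with the unconditional `cp` lines in `update.sh`, the `cmp -s` guard keeps `git status` noise down when only a subset of tests changed.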