#!/bin/bash -x

#####################################################################
# SUMMARY: Create a SentencePiece vocabulary with normalization rules
# AUTHOR: snukky
# TAGS: sentencepiece
#####################################################################

# Exit on the first failing command; with pipefail a failure in any stage of
# a pipeline (e.g. spm_export_vocab feeding sort) is not masked by the last one
set -e
set -o pipefail

# The test harness must provide these locations; abort early with a clear
# message instead of running paths like "/marian" when one of them is missing
: "${MRT_MARIAN:?MRT_MARIAN must point to the directory with Marian binaries}"
: "${MRT_DATA:?MRT_DATA must point to the test data directory}"
: "${MRT_TOOLS:?MRT_TOOLS must point to the directory with test tools}"

# Remove old artifacts and create working directory
rm -rf vocab.norm vocab.norm.*{log,out,diff}
mkdir -p vocab.norm

# Run marian command: train for a single batch, which also trains a joint
# SentencePiece vocabulary using the custom normalization rules from norm.tsv
"$MRT_MARIAN"/marian \
    --no-shuffle --seed 2222 --dim-emb 32 --dim-rnn 64 --maxi-batch 1 --maxi-batch-sort none --after-batches 1 \
    -m vocab.norm/model.npz -t "$MRT_DATA"/europarl.de-en/corpus.small.{en,de}.gz \
    --dim-vocabs 4000 -v vocab.norm/vocab.ende.spm vocab.norm/vocab.ende.spm --sentencepiece-options '--normalization_rule_tsv=norm.tsv --num_threads=1' --sentencepiece-max-lines 10000 \
    --log vocab.norm.log

# Check if files exist
test -e vocab.norm/model.npz
test -e vocab.norm/vocab.ende.spm
test -e vocab.norm.log

# Check logging messages
grep -q "Training SentencePiece vocabulary .*vocab.ende.spm" vocab.norm.log

# Extract a textual vocabulary and compare with the expected output
LC_ALL=C "$MRT_MARIAN"/spm_export_vocab --model vocab.norm/vocab.ende.spm | sort > vocab.norm.out
"$MRT_TOOLS"/diff-nums.py vocab.norm.out vocab.norm.expected -o vocab.norm.diff

# Normalization is uppercasing, so no vocabulary piece may contain a lowercase
# ASCII character. Only the first tab-separated column (the piece) is checked,
# and the built-in meta symbols are skipped because they are lowercase by
# definition.
# NOTE(review): assumes the export format is "piece<TAB>score" and that the
# only meta symbols are <unk>, <s> and </s> -- confirm against vocab.norm.expected
lowercase_pieces=$(LC_ALL=C awk -F'\t' \
    '$1 !~ /^<(unk|s|\/s)>$/ && $1 ~ /[a-z]/ { print $1 }' vocab.norm.out)
if [[ -n "$lowercase_pieces" ]]; then
    printf 'Vocabulary pieces with lowercase ASCII characters found:\n%s\n' "$lowercase_pieces" >&2
    exit 1
fi

# Exit with success code
exit 0