diff options
Diffstat (limited to 'transformer-intro/scripts/preprocess-data.sh')
-rwxr-xr-x | transformer-intro/scripts/preprocess-data.sh | 36 |
1 files changed, 36 insertions, 0 deletions
#!/usr/bin/env bash
#
# Clean the parallel corpus in ./data so it can be used for training.
#
# Usage: run from the directory that contains data/. The Moses scripts are
# looked up relative to that directory, at ../tools/moses-scripts/scripts.
#
# Reads:   data/corpus.en, data/corpus.de
# Writes:  data/corpus.clean.en, data/corpus.clean.de
#
# Steps: strip non-printing characters, filter sentence pairs with
# clean-corpus-n.perl (limits 1 and 100), then drop duplicate pairs.
# Does nothing when both output files already exist.
set -euo pipefail

# Resolved before the `cd data` below, so it is relative to the invocation
# directory, not to data/.
readonly MOSES_SCRIPTS="$PWD/../tools/moses-scripts/scripts"

readonly SRC="en"
readonly TRG="de"

cd data
if [[ -e "corpus.clean.$SRC" && -e "corpus.clean.$TRG" ]]; then
  echo "No action needed"
  exit 0
fi

# Remove the intermediate files on every exit path (success or failure) so a
# failed run does not leave stale .corpus.* files behind. Installed only after
# the early exit above, which therefore never deletes anything.
cleanup() {
  rm -f -- .corpus.*
}
trap cleanup EXIT

for lang in "$SRC" "$TRG"; do
  # Remove non-printing characters.
  # Quotes could optionally be normalised as well by additionally piping
  # through:
  #   perl "$MOSES_SCRIPTS/tokenizer/normalize-punctuation.perl" -l "$lang"
  perl "$MOSES_SCRIPTS/tokenizer/remove-non-printing-char.perl" \
    < "corpus.$lang" \
    > ".corpus.norm.$lang"
done

# Constrain sentence length to between 1 and 100.
perl "$MOSES_SCRIPTS/training/clean-corpus-n.perl" \
  .corpus.norm "$SRC" "$TRG" .corpus.trim 1 100

# Deduplicate sentence pairs. Both sides are joined with a tab so a pair is
# compared as one line. `sort -u` under LC_ALL=C does the comparison
# byte-wise; a separate `uniq` would run in the caller's locale, where
# distinct strings can collate as equal and be dropped by mistake.
paste ".corpus.trim.$SRC" ".corpus.trim.$TRG" \
  | LC_ALL=C sort -u -S 50% \
  > ".corpus.uniq.$SRC$TRG.tsv"

# Split the pairs back into one file per language. The results are written
# under temporary names and renamed only once both are complete: the
# "No action needed" check above trusts any existing corpus.clean.* file, so
# a half-written output must never appear under the final name.
cut -f 1 ".corpus.uniq.$SRC$TRG.tsv" > ".corpus.out.$SRC"
cut -f 2 ".corpus.uniq.$SRC$TRG.tsv" > ".corpus.out.$TRG"
mv -- ".corpus.out.$SRC" "corpus.clean.$SRC"
mv -- ".corpus.out.$TRG" "corpus.clean.$TRG"