Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian-examples.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'transformer-intro/scripts/preprocess-data.sh')
-rwxr-xr-x transformer-intro/scripts/preprocess-data.sh 36
1 files changed, 36 insertions, 0 deletions
diff --git a/transformer-intro/scripts/preprocess-data.sh b/transformer-intro/scripts/preprocess-data.sh
new file mode 100755
index 0000000..e6e8c6a
--- /dev/null
+++ b/transformer-intro/scripts/preprocess-data.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -euo pipefail
+ # Moses helper scripts, located relative to the directory this script is launched from ($PWD).
+MOSES_SCRIPTS="$PWD/../tools/moses-scripts/scripts"
+ # Language codes; used as the file-name suffixes of the corpus.* files handled below.
+SRC="en"
+TRG="de"
+ # Work inside data/ and stop early when both cleaned corpora already exist.
+cd data
+if [ -e corpus.clean.$SRC ] && [ -e corpus.clean.$TRG ]; then
+ echo "No action needed"
+ exit 0
+fi
+
+
+for lang in $SRC $TRG; do
+ # Run corpus.$lang through Moses' remove-non-printing-char.perl; the result goes to the hidden intermediate .corpus.norm.$lang
+ cat corpus.$lang \
+ | perl $MOSES_SCRIPTS/tokenizer/remove-non-printing-char.perl \
+ > .corpus.norm.$lang
+ # Disabled optional stage (would belong before the redirect above): | perl $MOSES_SCRIPTS/tokenizer/normalize-punctuation.perl -l $lang  -- could optionally norm quotes
+done
+
+# Constrain length between 1 and 100
+perl $MOSES_SCRIPTS/training/clean-corpus-n.perl .corpus.norm $SRC $TRG .corpus.trim 1 100
+ # Next: paste joins the two trimmed files line by line (tab-separated) so duplicates are removed per sentence pair; sort runs byte-wise (LC_ALL=C).
+# Deduplicate
+paste <(cat .corpus.trim.$SRC) <(cat .corpus.trim.$TRG) \
+ | LC_ALL=C sort -S 50% | uniq \
+ > .corpus.uniq.$SRC$TRG.tsv
+ # Split the TSV back into per-language files: column 1 is the source side, column 2 the target side. NOTE(review): assumes corpus lines contain no tabs - confirm.
+cat .corpus.uniq.$SRC$TRG.tsv | cut -f 1 > corpus.clean.$SRC
+cat .corpus.uniq.$SRC$TRG.tsv | cut -f 2 > corpus.clean.$TRG
+ # The glob below only matches the dot-prefixed intermediates (.corpus.norm.*, .corpus.trim.*, .corpus.uniq.*); the corpus.clean.* outputs are kept.
+# Clean up
+rm .corpus.*