Welcome to mirror list, hosted at ThFree Co, Russian Federation.

preprocess-data.sh « scripts « transformer-intro - github.com/marian-nmt/marian-examples.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: e6e8c6a8b0fa89037aad18cd19951e27505b5eb2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env bash
#
# Clean the parallel corpus found in ./data and write corpus.clean.<lang>
# for the source and target languages configured below.
#
# Run this from the directory that contains data/: both the Moses path and
# the `cd` below are resolved relative to the current working directory.
set -euo pipefail

# Captured before `cd data`, so the absolute path stays valid afterwards.
MOSES_SCRIPTS="$PWD/../tools/moses-scripts/scripts"

SRC="en"
TRG="de"

cd data

# Both cleaned files already exist: treat the step as done.
if [[ -e "corpus.clean.$SRC" && -e "corpus.clean.$TRG" ]]; then
  echo "No action needed"
  exit 0
fi

# From here on hidden intermediates named .corpus.* are created in data/.
# Remove them on every exit path, so a stage that aborts under `set -e`
# does not leave them behind. `-f` keeps this silent when none exist.
trap 'rm -f -- .corpus.*' EXIT


# Remove non-printing characters from each side of the corpus.
# Reads corpus.<lang>, writes the hidden intermediate .corpus.norm.<lang>.
for lang in "$SRC" "$TRG"; do
  # MOSES_SCRIPTS is derived from $PWD and may contain whitespace, so the
  # script path has to be quoted.
  perl "$MOSES_SCRIPTS/tokenizer/remove-non-printing-char.perl" \
    < "corpus.$lang" \
    > ".corpus.norm.$lang"
  # Optional extra stage (disabled): quotes/punctuation could additionally be
  # normalised by piping the output through
  #   perl "$MOSES_SCRIPTS/tokenizer/normalize-punctuation.perl" -l "$lang"
done

# Constrain length to between 1 and 100 (the two trailing arguments).
# .corpus.norm is the input prefix and .corpus.trim the output prefix; the
# language codes are passed separately and form the file suffixes.
# The script path is quoted because MOSES_SCRIPTS may contain whitespace.
perl "$MOSES_SCRIPTS/training/clean-corpus-n.perl" \
  .corpus.norm "$SRC" "$TRG" .corpus.trim 1 100

# Deduplicate sentence pairs.
# The two sides are joined line-by-line with a tab so that a pair is compared
# as a unit. The files are handed to `paste` directly (not via process
# substitution) so that a missing/unreadable input fails the pipeline under
# `pipefail` instead of being silently ignored. `sort -u` performs both the
# sort and the de-duplication under the same byte-wise C locale.
paste ".corpus.trim.$SRC" ".corpus.trim.$TRG" \
  | LC_ALL=C sort -S 50% -u \
  > ".corpus.uniq.$SRC$TRG.tsv"

# Split the pairs back into one file per language. Results are staged under
# hidden names and renamed only once both are fully written: the final
# corpus.clean.* files are what the early-exit check at the top of the script
# looks for, so they must never exist in a half-written state.
cut -f 1 ".corpus.uniq.$SRC$TRG.tsv" > ".corpus.clean.$SRC"
cut -f 2 ".corpus.uniq.$SRC$TRG.tsv" > ".corpus.clean.$TRG"
mv -f -- ".corpus.clean.$SRC" "corpus.clean.$SRC"
mv -f -- ".corpus.clean.$TRG" "corpus.clean.$TRG"

# Clean up the hidden intermediates.
rm -f -- .corpus.*