transformer-intro/scripts/download-files.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

#!/usr/bin/env bash
#
# Download the English-German training, validation and test data and
# concatenate the training sources into corpus.de / corpus.en.
#
# Usage: run from a directory that contains a `data/` subdirectory; every file
# is created inside it.
# Requires on PATH: wget, sacrebleu, gzip, tar, cut.
set -euo pipefail

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

cd data || die "cannot enter ./data; run this script from the directory that contains it"

echo "Downloading data"
# Get en-de for training WMT21.
# -nc leaves files that are already present untouched. stderr is discarded to
# keep the run quiet, which also hides wget's own diagnostics, so a failed
# download is reported explicitly instead of letting `set -e` exit silently.
training_urls=(
  "https://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz"
  "https://data.statmt.org/news-commentary/v16/training/news-commentary-v16.de-en.tsv.gz"
  "https://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
)
for url in "${training_urls[@]}"; do
  wget -nc "$url" 2> /dev/null || die "download failed: $url"
done

# Dev sets: source and reference sides of WMT19 en-de, as echoed by sacrebleu.
sacrebleu -t wmt19 -l en-de --echo src > valid.en
sacrebleu -t wmt19 -l en-de --echo ref > valid.de

# Test sets: source and reference sides of WMT20 en-de.
sacrebleu -t wmt20 -l en-de --echo src > test.en
sacrebleu -t wmt20 -l en-de --echo ref > test.de

# Uncompress the TSV archives, keeping the .gz files (--keep) so that a later
# run's `wget -nc` still finds them. Skipped when the TSV already exists.
for tsv in europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv; do
  if [[ ! -e "$tsv" ]]; then
    gzip --keep -q -d "$tsv.gz"
  fi
done

# The corpus step below reads commoncrawl.de-en.de and commoncrawl.de-en.en
# from the working directory (presumably provided by this archive).
tar xf training-parallel-commoncrawl.tgz

# Corpus: rebuilt whenever either side is missing.
if [[ ! -e corpus.de || ! -e corpus.en ]]; then
  # Assemble under temporary names and rename only once both sides are
  # complete. Writing straight to corpus.* would let an interrupted run leave
  # truncated files that the existence check above treats as finished.
  # Leftover *.tmp files are simply overwritten by the next run.
  {
    # TSVs: column 1 goes to the .de side.
    cat europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv | cut -f 1
    # Plain text
    cat commoncrawl.de-en.de
  } > corpus.de.tmp
  {
    # TSVs: column 2 goes to the .en side.
    cat europarl-v10.de-en.tsv news-commentary-v16.de-en.tsv | cut -f 2
    # Plain text
    cat commoncrawl.de-en.en
  } > corpus.en.tmp

  mv -f corpus.de.tmp corpus.de
  mv -f corpus.en.tmp corpus.en
fi

echo "Corpus prepared"