blob: 069dcbc2f2b52f703936ab87f2c5e87e2af61c7a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
#!/bin/bash
# Get En-Ro training data for WMT16.
#
# Downloads the Europarl v7 ro-en archive and the SETIMES2 en-ro archive into
# data/ (skipping any archive that is already present), unpacks both there,
# and concatenates the extracted files into data/corpus.en and data/corpus.ro.
#
# Usage: run from the directory that should contain data/.
# Requires: wget, tar, unzip.

# -v        echo each script line as it is read (formerly passed on the
#           shebang, where it is lost when invoked as `bash script.sh`).
# -e -u     abort on the first failing command / unset variable so a failed
#           download or `cd` cannot lead to truncated corpus files.
# pipefail  a pipeline fails if any stage fails.
set -veuo pipefail

# wget -O does not create missing parent directories.
mkdir -p data

#######################################
# Download a URL to a local file unless that file already exists.
# Arguments: $1 - URL to fetch
#            $2 - destination path
# Outputs:   wget progress; an error message on stderr on failure
# Returns:   0 if the file is present afterwards, 1 if the download failed
#######################################
fetch() {
  local url=$1
  local dest=$2
  local partial="${dest}.part"

  if [[ -f "$dest" ]]; then
    return 0
  fi

  # wget -O creates/truncates its output file even when the transfer fails.
  # Writing to a temporary name and renaming only on success keeps a broken
  # download from satisfying the "already present" check on the next run.
  if ! wget "$url" -O "$partial"; then
    rm -f -- "$partial"
    printf 'error: failed to download %s\n' "$url" >&2
    return 1
  fi
  mv -- "$partial" "$dest"
}

# URLs are single-quoted: the second one contains `?`, a glob character.
fetch 'http://www.statmt.org/europarl/v7/ro-en.tgz' data/ro-en.tgz
fetch 'http://opus.lingfil.uu.se/download.php?f=SETIMES2/en-ro.txt.zip' data/SETIMES2.ro-en.txt.zip

cd data/
tar -xf ro-en.tgz
# -o: overwrite without prompting so re-runs stay non-interactive
# (tar above already overwrites silently).
unzip -o SETIMES2.ro-en.txt.zip
# The archives are expected to provide these four files
# (NOTE(review): names taken from the original script; not verifiable here).
cat europarl-v7.ro-en.en SETIMES2.en-ro.en > corpus.en
cat europarl-v7.ro-en.ro SETIMES2.en-ro.ro > corpus.ro
cd ..
|