blob: 960f57efdbe8d781417487a55bafff9d6cbe08ff (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
#!/bin/bash
#
# Download word vector files for all supported languages. Run as:
# ./download_vectors.sh WORDVEC_DIR
# where WORDVEC_DIR is the target directory to store the word vector data.
# check arguments
: ${1?"Usage: $0 WORDVEC_DIR"}
WORDVEC_DIR=$1
# constants and functions
CONLL17_URL="https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1989/word-embeddings-conll17.tar"
CONLL17_TAR="word-embeddings-conll17.tar"
FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"
# TODO: some fasttext vectors are now at
# https://fasttext.cc/docs/en/pretrained-vectors.html
# there are also vectors for
# Welsh, Icelandic, Thai, Sanskrit
# https://fasttext.cc/docs/en/crawl-vectors.html
declare -a FASTTEXT_LANG=("Afrikaans" "Armenian" "Breton" "Buryat" "Chinese" "Faroese" "Gothic" "Kurmanji" "North_Sami" "Serbian" "Upper_Sorbian")
declare -a FASTTEXT_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "ku" "se" "sr" "hsb")
declare -a LOCAL_CODE=("af" "hy" "br" "bxr" "zh" "fo" "got" "kmr" "sme" "sr" "hsb")
color_green='\033[32;1m'
color_clear='\033[0m' # No Color
function msg() {
echo -e "${color_green}$@${color_clear}"
}
function prepare_fasttext_vec() {
lang=$1
ftcode=$2
code=$3
cwd=$(pwd)
mkdir -p $lang
cd $lang
msg "=== Downloading fasttext vector file for ${lang}..."
url="${FASTTEXT_BASE_URL}/wiki.${ftcode}.vec"
fname="${code}.vectors"
wget $url -O $fname
msg "=== Compressing file ${fname}..."
xz $fname
cd $cwd
}
# do the actual work
mkdir -p $WORDVEC_DIR
cd $WORDVEC_DIR
msg "Downloading CONLL17 word vectors. This may take a while..."
wget $CONLL17_URL -O $CONLL17_TAR
msg "Extracting CONLL17 word vector files..."
tar -xvf $CONLL17_TAR
rm $CONLL17_TAR
msg "Preparing fasttext vectors for the rest of the languages."
for (( i=0; i<${#FASTTEXT_LANG[*]}; ++i)); do
prepare_fasttext_vec ${FASTTEXT_LANG[$i]} ${FASTTEXT_CODE[$i]} ${LOCAL_CODE[$i]}
done
# handle old french
mkdir Old_French
ln -s French/fr.vectors.xz Old_French/fro.vectors.xz
msg "All done."
|