Welcome to mirror list, hosted at ThFree Co, Russian Federation.

download_vectors.sh « scripts - github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 960f57efdbe8d781417487a55bafff9d6cbe08ff (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
#
# Download word vector files for all supported languages. Run as:
#   ./download_vectors.sh WORDVEC_DIR
# where WORDVEC_DIR is the target directory to store the word vector data.

# check arguments
# The target directory is mandatory. ":?" (rather than "?") also rejects an
# explicitly empty argument, which would otherwise make the script work in
# the caller's current directory.
: "${1:?Usage: $0 WORDVEC_DIR}"
WORDVEC_DIR=$1

# constants and functions

# CONLL17 word vectors: local name of the tarball and the URL it is fetched from.
CONLL17_TAR="word-embeddings-conll17.tar"
CONLL17_URL="https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-1989/word-embeddings-conll17.tar"

# Base URL of the fasttext vectors; each language's file is wiki.<code>.vec.
FASTTEXT_BASE_URL="https://dl.fbaipublicfiles.com/fasttext/vectors-wiki"

# TODO: some fasttext vectors are now listed at
#   https://fasttext.cc/docs/en/pretrained-vectors.html
# and vectors for Welsh, Icelandic, Thai and Sanskrit are available at
#   https://fasttext.cc/docs/en/crawl-vectors.html

# Languages whose vectors come from fasttext. The three arrays are parallel:
# the entries at the same index describe one language.

# Directory name used for each language.
FASTTEXT_LANG=(
    "Afrikaans"
    "Armenian"
    "Breton"
    "Buryat"
    "Chinese"
    "Faroese"
    "Gothic"
    "Kurmanji"
    "North_Sami"
    "Serbian"
    "Upper_Sorbian"
)

# Language code used in the fasttext download URL.
FASTTEXT_CODE=(
    "af"
    "hy"
    "br"
    "bxr"
    "zh"
    "fo"
    "got"
    "ku"
    "se"
    "sr"
    "hsb"
)

# Language code used to name the stored vector file.
LOCAL_CODE=(
    "af"
    "hy"
    "br"
    "bxr"
    "zh"
    "fo"
    "got"
    "kmr"
    "sme"
    "sr"
    "hsb"
)

# Escape sequences used to highlight status messages.
color_green='\033[32;1m'
color_clear='\033[0m' # No Color

# Print a highlighted status message on stdout.
# Arguments:
#   $@ - words of the message; they are joined with the first character of
#        IFS (a space by default) into a single line.
# Backslash escapes in the message are expanded, as with `echo -e`.
function msg() {
    # "$*" instead of "$@": the message is embedded in one larger string, and
    # "$@" inside such a string splits it into several words.
    printf '%b\n' "${color_green}$*${color_clear}"
}

# Download the fasttext vector file of one language and compress it with xz.
#
# Arguments:
#   $1 - directory the file is stored in (created if missing)
#   $2 - fasttext language code, used to build the download URL
#   $3 - local language code, used to name the stored file (<code>.vectors)
# Globals:
#   FASTTEXT_BASE_URL (read)
# Returns:
#   0 on success; non-zero if the directory cannot be created or entered, or
#   if the download or the compression fails.
function prepare_fasttext_vec() {
    local lang=$1
    local ftcode=$2
    local code=$3
    local url="${FASTTEXT_BASE_URL}/wiki.${ftcode}.vec"
    local fname="${code}.vectors"

    mkdir -p -- "$lang" || return 1

    # Work in a subshell so the caller's working directory is left untouched
    # on every exit path.
    (
        cd -- "$lang" || exit 1

        msg "=== Downloading fasttext vector file for ${lang}..."
        if ! wget "$url" -O "$fname"; then
            # wget -O creates the output file even when the transfer fails;
            # remove it so no empty or partial file gets compressed.
            rm -f -- "$fname"
            echo "Failed to download ${url}" >&2
            exit 1
        fi

        msg "=== Compressing file ${fname}..."
        xz -- "$fname"
    )
}

# do the actual work
# Everything below runs inside WORDVEC_DIR, so stop if it cannot be created
# or entered instead of downloading into the caller's current directory.
mkdir -p -- "$WORDVEC_DIR" || exit 1
cd -- "$WORDVEC_DIR" || exit 1

msg "Downloading CONLL17 word vectors. This may take a while..."
if ! wget "$CONLL17_URL" -O "$CONLL17_TAR"; then
    # wget -O leaves an empty or partial file behind on failure.
    rm -f -- "$CONLL17_TAR"
    echo "Failed to download ${CONLL17_URL}" >&2
    exit 1
fi

msg "Extracting CONLL17 word vector files..."
# Keep the tarball if extraction fails so the download is not lost.
tar -xvf "$CONLL17_TAR" || exit 1
rm -- "$CONLL17_TAR"

msg "Preparing fasttext vectors for the rest of the languages."
for (( i = 0; i < ${#FASTTEXT_LANG[@]}; ++i )); do
    # A failure for one language does not stop the remaining downloads.
    prepare_fasttext_vec "${FASTTEXT_LANG[$i]}" "${FASTTEXT_CODE[$i]}" "${LOCAL_CODE[$i]}" \
        || echo "Warning: could not prepare fasttext vectors for ${FASTTEXT_LANG[$i]}" >&2
done

# handle old french
# Old French reuses the French vectors through a symlink. A relative symlink
# target is resolved from the directory containing the link, hence the "../".
# Assumes the CONLL17 tarball extracted French/ directly into WORDVEC_DIR -- TODO confirm.
# -p / -f keep a second run of the script from failing on existing entries.
mkdir -p Old_French
ln -sf ../French/fr.vectors.xz Old_French/fro.vectors.xz

msg "All done."