stanza/models/classifiers/data.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

import logging
import stanza.models.classifiers.classifier_args as classifier_args

logger = logging.getLogger('stanza')

def update_text(sentence, wordvec_type):
    """
    Process a line of text (with tokenization provided as whitespace)
    into a list of strings.
    """
    # stanford sentiment dataset has a lot of random - and /
    sentence = sentence.replace("-", " ")
    sentence = sentence.replace("/", " ")
    sentence = sentence.strip()
    if sentence == "":
        # removed too much
        sentence = "-"
    sentence = sentence.split()
    # our current word vectors are all entirely lowercased
    sentence = [word.lower() for word in sentence]
    if wordvec_type == classifier_args.WVType.WORD2VEC:
        return sentence
    elif wordvec_type == classifier_args.WVType.GOOGLE:
        new_sentence = []
        for word in sentence:
            if word != '0' and word != '1':
                word = re.sub('[0-9]', '#', word)
            new_sentence.append(word)
        return new_sentence
    elif wordvec_type == classifier_args.WVType.FASTTEXT:
        return sentence
    elif wordvec_type == classifier_args.WVType.OTHER:
        return sentence
    else:
        raise ValueError("Unknown wordvec_type {}".format(wordvec_type))


def read_dataset(dataset, wordvec_type, min_len):
    """
    returns a list where the values of the list are
      label, [token...]
    """
    lines = []
    for filename in dataset.split(","):
        try:
            new_lines = open(filename, encoding="utf-8").readlines()
        except UnicodeDecodeError:
            logger.error("Could not read {}".format(filename))
            raise
        lines.extend(new_lines)
    lines = [x.strip() for x in lines]
    lines = [x.split(maxsplit=1) for x in lines if x]
    lines = [x for x in lines if len(x) > 1]
    # TODO: maybe do this processing later, once the model is built.
    # then move the processing into the model so we can use
    # overloading to potentially make future model types
    lines = [(x[0], update_text(x[1], wordvec_type)) for x in lines]
    if min_len:
        lines = [x for x in lines if len(x[1]) >= min_len]
    return lines