stanza/pipeline/external/spacy.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

"""
Processors related to spaCy in the pipeline.
"""

from stanza.models.common import doc
from stanza.pipeline._constants import TOKENIZE
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant

def check_spacy():
    """
    Import necessary components from spaCy to perform tokenization.
    """
    try:
        import spacy
    except ImportError:
        raise ImportError(
            "spaCy is used but not installed on your machine. Go to https://spacy.io/usage for installation instructions."
        )
    return True

@register_processor_variant(TOKENIZE, 'spacy')
class SpacyTokenizer(ProcessorVariant):
    def __init__(self, config):
        """ Construct a spaCy-based tokenizer by loading the spaCy pipeline.
        """
        if config['lang'] != 'en':
            raise Exception("spaCy tokenizer is currently only allowed in English pipeline.")

        try:
            import spacy
            from spacy.lang.en import English
        except ImportError:
            raise ImportError(
                "spaCy 2.0+ is used but not installed on your machine. Go to https://spacy.io/usage for installation instructions."
            )

        # Create a Tokenizer with the default settings for English
        # including punctuation rules and exceptions
        self.nlp = English()
        # by default spacy uses dependency parser to do ssplit
        # we need to add a sentencizer for fast rule-based ssplit
        if spacy.__version__.startswith("2."):
            self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
        else:
            self.nlp.add_pipe("sentencizer")
        self.no_ssplit = config.get('no_ssplit', False)

    def process(self, text):
        """ Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object.
        """
        if not isinstance(text, str):
            raise Exception("Must supply a string to the spaCy tokenizer.")
        spacy_doc = self.nlp(text)

        sentences = []
        for sent in spacy_doc.sents:
            tokens = []
            for tok in sent:
                token_entry = {
                    doc.TEXT: tok.text,
                    doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx+len(tok.text)}"
                }
                tokens.append(token_entry)
            sentences.append(tokens)

        # if no_ssplit is set, flatten all the sentences into one sentence
        if self.no_ssplit:
            sentences = [[t for s in sentences for t in s]]

        return doc.Document(sentences, text)