1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
"""
Basic testing of multi-word-token expansion
"""
import pytest
import stanza
from stanza.tests import *
# Module-level pytest mark: tags every test in this file with `pipeline`.
pytestmark = pytest.mark.pipeline
# MWT test data: a French sentence in which the token "du" occurs twice and
# is expected to expand into the two words "de" + "le" (see the gold below).
FR_MWT_SENTENCE = (
    "Alors encore inconnu du grand public, Emmanuel Macron devient en 2014 ministre de l'Économie, de "
    "l'Industrie et du Numérique."
)
# Expected token -> words rendering of FR_MWT_SENTENCE, one line per token.
# Every line has to match the format string used in test_mwt exactly:
#   "token: " + token text padded with spaces to 9 columns (str.ljust(9))
#   + two tab characters + "words: [<pretty-printed words>]"
# The padding and tabs are significant for the equality assertion, so the tabs
# are spelled as \t escapes to keep them visible and safe from editors or
# copy/paste that rewrite whitespace.
FR_MWT_TOKEN_TO_WORDS_GOLD = """
token: Alors    \t\twords: [<Word id=1;text=Alors>]
token: encore   \t\twords: [<Word id=2;text=encore>]
token: inconnu  \t\twords: [<Word id=3;text=inconnu>]
token: du       \t\twords: [<Word id=4;text=de>, <Word id=5;text=le>]
token: grand    \t\twords: [<Word id=6;text=grand>]
token: public   \t\twords: [<Word id=7;text=public>]
token: ,        \t\twords: [<Word id=8;text=,>]
token: Emmanuel \t\twords: [<Word id=9;text=Emmanuel>]
token: Macron   \t\twords: [<Word id=10;text=Macron>]
token: devient  \t\twords: [<Word id=11;text=devient>]
token: en       \t\twords: [<Word id=12;text=en>]
token: 2014     \t\twords: [<Word id=13;text=2014>]
token: ministre \t\twords: [<Word id=14;text=ministre>]
token: de       \t\twords: [<Word id=15;text=de>]
token: l'       \t\twords: [<Word id=16;text=l'>]
token: Économie \t\twords: [<Word id=17;text=Économie>]
token: ,        \t\twords: [<Word id=18;text=,>]
token: de       \t\twords: [<Word id=19;text=de>]
token: l'       \t\twords: [<Word id=20;text=l'>]
token: Industrie\t\twords: [<Word id=21;text=Industrie>]
token: et       \t\twords: [<Word id=22;text=et>]
token: du       \t\twords: [<Word id=23;text=de>, <Word id=24;text=le>]
token: Numérique\t\twords: [<Word id=25;text=Numérique>]
token: .        \t\twords: [<Word id=26;text=.>]
""".strip()
# Expected word -> parent-token rendering of FR_MWT_SENTENCE, one line per word.
# Every line has to match the format string used in test_mwt exactly:
#   "word: " + word text padded with spaces to 9 columns (str.ljust(9))
#   + two tab characters + "token parent:" + parent id parts joined by "-"
#   + "-" + parent token text
# Words produced by expanding "du" share a parent whose id has two parts
# (e.g. 4-5). The padding and tabs are significant for the equality
# assertion, so the tabs are spelled as \t escapes to keep them visible.
FR_MWT_WORD_TO_TOKEN_GOLD = """
word: Alors    \t\ttoken parent:1-Alors
word: encore   \t\ttoken parent:2-encore
word: inconnu  \t\ttoken parent:3-inconnu
word: de       \t\ttoken parent:4-5-du
word: le       \t\ttoken parent:4-5-du
word: grand    \t\ttoken parent:6-grand
word: public   \t\ttoken parent:7-public
word: ,        \t\ttoken parent:8-,
word: Emmanuel \t\ttoken parent:9-Emmanuel
word: Macron   \t\ttoken parent:10-Macron
word: devient  \t\ttoken parent:11-devient
word: en       \t\ttoken parent:12-en
word: 2014     \t\ttoken parent:13-2014
word: ministre \t\ttoken parent:14-ministre
word: de       \t\ttoken parent:15-de
word: l'       \t\ttoken parent:16-l'
word: Économie \t\ttoken parent:17-Économie
word: ,        \t\ttoken parent:18-,
word: de       \t\ttoken parent:19-de
word: l'       \t\ttoken parent:20-l'
word: Industrie\t\ttoken parent:21-Industrie
word: et       \t\ttoken parent:22-et
word: de       \t\ttoken parent:23-24-du
word: le       \t\ttoken parent:23-24-du
word: Numérique\t\ttoken parent:25-Numérique
word: .        \t\ttoken parent:26-.
""".strip()
def test_mwt():
    """Check MWT expansion of FR_MWT_SENTENCE in both directions.

    Builds a French `tokenize,mwt` pipeline from TEST_MODELS_DIR, runs it on
    FR_MWT_SENTENCE, and renders the result two ways:

    * one line per token, listing the words the token expands to;
    * one line per word, naming the parent token (id parts joined by "-").

    Both renderings must equal the corresponding gold strings exactly,
    including the 9-column padding and the two tabs in each line.
    """
    nlp = stanza.Pipeline(processors='tokenize,mwt', dir=TEST_MODELS_DIR, lang='fr')
    doc = nlp(FR_MWT_SENTENCE)

    def render_token(token):
        # Comma-separated pretty-printed words that this token expands to.
        expansion = ", ".join(word.pretty_print() for word in token.words)
        return f'token: {token.text.ljust(9)}\t\twords: [{expansion}]'

    def render_word(word):
        # The parent id is iterable (two parts for an expanded token).
        parent = word.parent
        parent_id = "-".join(str(part) for part in parent.id)
        return f'word: {word.text.ljust(9)}\t\ttoken parent:{parent_id}-{parent.text}'

    token_lines = (
        render_token(token)
        for sentence in doc.sentences
        for token in sentence.tokens
    )
    token_to_words = "\n".join(token_lines).strip()

    word_lines = (
        render_word(word)
        for sentence in doc.sentences
        for word in sentence.words
    )
    word_to_token = "\n".join(word_lines).strip()

    assert token_to_words == FR_MWT_TOKEN_TO_WORDS_GOLD
    assert word_to_token == FR_MWT_WORD_TO_TOKEN_GOLD
|