Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_server_request.py')
-rw-r--r-- tests/test_server_request.py 180
1 file changed, 106 insertions, 74 deletions
diff --git a/tests/test_server_request.py b/tests/test_server_request.py
index 4fbaa7f5..4f9d63d4 100644
--- a/tests/test_server_request.py
+++ b/tests/test_server_request.py
@@ -7,7 +7,7 @@ import pytest
import stanza.server as corenlp
from stanza.protobuf import Document
-from tests import TEST_WORKING_DIR
+from tests import TEST_WORKING_DIR, compare_ignoring_whitespace
pytestmark = pytest.mark.client
@@ -34,39 +34,51 @@ Sentence #1 (10 tokens):
Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
Tokens:
-[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON]
-[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON]
-[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O]
-[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O]
-[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O]
-[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O]
-[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O]
-[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION]
-[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION]
-[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O]
-
-Constituency parse:
-(ROOT
- (S
- (MPN (NE Angela) (NE Merkel))
- (VAFIN ist)
- (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin)
- (NP (ART der) (NN Bundesrepublik) (NE Deutschland)))
- ($. .)))
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O]
+Dependency Parse (enhanced plus plus dependencies):
+root(ROOT-0, Bundeskanzlerin-6)
+nsubj(Bundeskanzlerin-6, Angela-1)
+flat(Angela-1, Merkel-2)
+cop(Bundeskanzlerin-6, ist-3)
+case(2005-5, seit-4)
+nmod:seit(Bundeskanzlerin-6, 2005-5)
+det(Bundesrepublik-8, der-7)
+nmod(Bundeskanzlerin-6, Bundesrepublik-8)
+appos(Bundesrepublik-8, Deutschland-9)
+punct(Bundeskanzlerin-6, .-10)
Extracted the following NER entity mentions:
-Angela Merkel PERSON
-Bundesrepublik Deutschland LOCATION
+Angela Merkel PERSON PERSON:0.9999981583355767
+Bundesrepublik Deutschland LOCATION LOCATION:0.968290232887181
"""
-FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,pos,parse', 'tokenize.language': 'fr',
- 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french.tagger',
- 'parse.model': 'edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz',
+FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,parse',
+ 'tokenize.language': 'fr',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+ 'parse.model': 'edu/stanford/nlp/models/srparser/frenchSR.ser.gz',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+ 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+ 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+ 'mwt.preserveCasing': 'false',
'outputFormat': 'text'}
-FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse',
- 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french-ud.tagger',
+FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse',
+ 'tokenize.language': 'fr',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+ 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+ 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+ 'mwt.preserveCasing': 'false',
'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_French.gz'}
FRENCH_DOC = "Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt."
@@ -77,37 +89,59 @@ Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire qu
Tokens:
[Text=Cette CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=DET]
-[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NC]
+[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NOUN]
[Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
-[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=V]
-[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=N]
-[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=P]
-[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET]
-[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NC]
-[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=P]
-[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET]
-[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NC]
+[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
+[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
+[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
[Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
-[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NC]
+[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
[Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
[Text=tôt CharacterOffsetBegin=92 CharacterOffsetEnd=95 PartOfSpeech=ADV]
-[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNC]
+[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNCT]
Constituency parse:
(ROOT
(SENT
- (NP (DET Cette) (NC enquête)
- (AP (ADJ préliminaire)))
+ (NP (DET Cette)
+ (MWN (NOUN enquête) (ADJ préliminaire)))
(VN
- (MWV (V fait) (N suite)))
- (PP (P à)
- (NP (DET les) (NC révélations)
- (PP (P de)
- (NP (DET l') (NC hebdomadaire)
- (AdP
- (NP (DET quelques) (NC jours))
- (ADV plus) (ADV tôt))))))
- (PUNC .)))
+ (MWV (VERB fait) (NOUN suite)))
+ (PP (ADP à)
+ (NP (DET les) (NOUN révélations)
+ (PP (ADP de)
+ (NP (NOUN l’)
+ (AP (ADJ hebdomadaire))))))
+ (NP (DET quelques) (NOUN jours))
+ (AdP (ADV plus) (ADV tôt))
+ (PUNCT .)))
+
+
+Binary Constituency parse:
+(ROOT
+ (SENT
+ (NP (DET Cette)
+ (MWN (NOUN enquête) (ADJ préliminaire)))
+ (@SENT
+ (@SENT
+ (@SENT
+ (@SENT
+ (VN
+ (MWV (VERB fait) (NOUN suite)))
+ (PP (ADP à)
+ (NP
+ (@NP (DET les) (NOUN révélations))
+ (PP (ADP de)
+ (NP (NOUN l’)
+ (AP (ADJ hebdomadaire)))))))
+ (NP (DET quelques) (NOUN jours)))
+ (AdP (ADV plus) (ADV tôt)))
+ (PUNCT .))))
"""
FRENCH_EXTRA_GOLD = """
@@ -120,12 +154,12 @@ Tokens:
[Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
-[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=ADP]
-[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
-[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET]
-[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NOUN]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
[Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
[Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
@@ -137,15 +171,15 @@ root(ROOT-0, fait-4)
det(enquête-2, Cette-1)
nsubj(fait-4, enquête-2)
amod(enquête-2, préliminaire-3)
-dobj(fait-4, suite-5)
+obj(fait-4, suite-5)
case(révélations-8, à-6)
det(révélations-8, les-7)
-nmod:à(suite-5, révélations-8)
-case(hebdomadaire-11, de-9)
-det(hebdomadaire-11, l'-10)
-nmod:de(révélations-8, hebdomadaire-11)
+obl:à(fait-4, révélations-8)
+case(l’-10, de-9)
+nmod:de(révélations-8, l’-10)
+amod(révélations-8, hebdomadaire-11)
det(jours-13, quelques-12)
-nmod(fait-4, jours-13)
+obl(fait-4, jours-13)
advmod(tôt-15, plus-14)
advmod(jours-13, tôt-15)
punct(fait-4, .-16)
@@ -155,8 +189,9 @@ FRENCH_JSON_GOLD = json.loads(open(f'{TEST_WORKING_DIR}/out/example_french.json'
ES_DOC = 'Andrés Manuel López Obrador es el presidente de México.'
-ES_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse', 'tokenize.language': 'es',
- 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish/spanish-ud.tagger',
+ES_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', 'tokenize.language': 'es',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv',
'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz'}
ES_PROPS_GOLD = """
@@ -168,7 +203,7 @@ Tokens:
[Text=Manuel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN]
[Text=López CharacterOffsetBegin=14 CharacterOffsetEnd=19 PartOfSpeech=PROPN]
[Text=Obrador CharacterOffsetBegin=20 CharacterOffsetEnd=27 PartOfSpeech=PROPN]
-[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=VERB]
+[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=AUX]
[Text=el CharacterOffsetBegin=31 CharacterOffsetEnd=33 PartOfSpeech=DET]
[Text=presidente CharacterOffsetBegin=34 CharacterOffsetEnd=44 PartOfSpeech=NOUN]
[Text=de CharacterOffsetBegin=45 CharacterOffsetEnd=47 PartOfSpeech=ADP]
@@ -176,16 +211,16 @@ Tokens:
[Text=. CharacterOffsetBegin=54 CharacterOffsetEnd=55 PartOfSpeech=PUNCT]
Dependency Parse (enhanced plus plus dependencies):
-root(ROOT-0, es-5)
-nsubj(es-5, Andrés-1)
-name(Andrés-1, Manuel-2)
-name(Andrés-1, López-3)
-name(Andrés-1, Obrador-4)
+root(ROOT-0, presidente-7)
+nsubj(presidente-7, Andrés-1)
+flat(Andrés-1, Manuel-2)
+flat(Andrés-1, López-3)
+flat(Andrés-1, Obrador-4)
+cop(presidente-7, es-5)
det(presidente-7, el-6)
-nsubj(es-5, presidente-7)
case(México-9, de-8)
nmod:de(presidente-7, México-9)
-punct(es-5, .-10)
+punct(presidente-7, .-10)
"""
@@ -237,14 +272,11 @@ def test_switching_back_and_forth(corenlp_client):
def test_lang_setting(corenlp_client):
""" Test using a Stanford CoreNLP supported languages as a properties key """
ann = corenlp_client.annotate(GERMAN_DOC, properties_key="german", output_format="text")
- assert ann.strip() == GERMAN_DOC_GOLD.strip()
+ compare_ignoring_whitespace(ann, GERMAN_DOC_GOLD)
def test_annotators_and_output_format(corenlp_client):
""" Test setting the annotators and output_format """
ann = corenlp_client.annotate(FRENCH_DOC, properties=FRENCH_EXTRA_PROPS,
- annotators="tokenize,ssplit,pos", output_format="json")
+ annotators="tokenize,ssplit,mwt,pos", output_format="json")
assert FRENCH_JSON_GOLD == ann
-
-
-