diff options
Diffstat (limited to 'tests/test_server_request.py')
-rw-r--r-- | tests/test_server_request.py | 180 |
1 files changed, 106 insertions, 74 deletions
diff --git a/tests/test_server_request.py b/tests/test_server_request.py index 4fbaa7f5..4f9d63d4 100644 --- a/tests/test_server_request.py +++ b/tests/test_server_request.py @@ -7,7 +7,7 @@ import pytest import stanza.server as corenlp from stanza.protobuf import Document -from tests import TEST_WORKING_DIR +from tests import TEST_WORKING_DIR, compare_ignoring_whitespace pytestmark = pytest.mark.client @@ -34,39 +34,51 @@ Sentence #1 (10 tokens): Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland. Tokens: -[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=NE Lemma=angela NamedEntityTag=PERSON] -[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=NE Lemma=merkel NamedEntityTag=PERSON] -[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=VAFIN Lemma=ist NamedEntityTag=O] -[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=APPR Lemma=seit NamedEntityTag=O] -[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=CARD Lemma=2005 NamedEntityTag=O] -[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NN Lemma=bundeskanzlerin NamedEntityTag=O] -[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=ART Lemma=der NamedEntityTag=O] -[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=NN Lemma=bundesrepublik NamedEntityTag=LOCATION] -[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=NE Lemma=deutschland NamedEntityTag=LOCATION] -[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=$. Lemma=. NamedEntityTag=O] - -Constituency parse: -(ROOT - (S - (MPN (NE Angela) (NE Merkel)) - (VAFIN ist) - (PP (APPR seit) (CARD 2005) (NN Bundeskanzlerin) - (NP (ART der) (NN Bundesrepublik) (NE Deutschland))) - ($. .))) +[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN Lemma=angela NamedEntityTag=PERSON] +[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN Lemma=merkel NamedEntityTag=PERSON] +[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX Lemma=ist NamedEntityTag=O] +[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP Lemma=seit NamedEntityTag=O] +[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM Lemma=2005 NamedEntityTag=O] +[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN Lemma=bundeskanzlerin NamedEntityTag=O] +[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET Lemma=der NamedEntityTag=O] +[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN Lemma=bundesrepublik NamedEntityTag=LOCATION] +[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN Lemma=deutschland NamedEntityTag=LOCATION] +[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT Lemma=. NamedEntityTag=O] +Dependency Parse (enhanced plus plus dependencies): +root(ROOT-0, Bundeskanzlerin-6) +nsubj(Bundeskanzlerin-6, Angela-1) +flat(Angela-1, Merkel-2) +cop(Bundeskanzlerin-6, ist-3) +case(2005-5, seit-4) +nmod:seit(Bundeskanzlerin-6, 2005-5) +det(Bundesrepublik-8, der-7) +nmod(Bundeskanzlerin-6, Bundesrepublik-8) +appos(Bundesrepublik-8, Deutschland-9) +punct(Bundeskanzlerin-6, .-10) Extracted the following NER entity mentions: -Angela Merkel PERSON -Bundesrepublik Deutschland LOCATION +Angela Merkel PERSON PERSON:0.9999981583355767 +Bundesrepublik Deutschland LOCATION LOCATION:0.968290232887181 """ -FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,pos,parse', 'tokenize.language': 'fr', - 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french.tagger', - 'parse.model': 'edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz', +FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,parse', + 'tokenize.language': 'fr', + 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger', + 'parse.model': 'edu/stanford/nlp/models/srparser/frenchSR.ser.gz', + 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv', + 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger', + 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv', + 'mwt.preserveCasing': 'false', 'outputFormat': 'text'} -FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse', - 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french/french-ud.tagger', +FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', + 'tokenize.language': 'fr', + 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger', + 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv', + 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger', + 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv', + 'mwt.preserveCasing': 'false', 'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_French.gz'} FRENCH_DOC = "Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt." @@ -77,37 +89,59 @@ Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire qu Tokens: [Text=Cette CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=DET] -[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NC] +[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NOUN] [Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ] -[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=V] -[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=N] -[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=P] -[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET] -[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NC] -[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=P] -[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET] -[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NC] +[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB] +[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN] +[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP] +[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET] +[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN] +[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP] +[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN] +[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ] [Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET] -[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NC] +[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN] [Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV] [Text=tôt CharacterOffsetBegin=92 CharacterOffsetEnd=95 PartOfSpeech=ADV] -[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNC] +[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNCT] Constituency parse: (ROOT (SENT - (NP (DET Cette) (NC enquête) - (AP (ADJ préliminaire))) + (NP (DET Cette) + (MWN (NOUN enquête) (ADJ préliminaire))) (VN - (MWV (V fait) (N suite))) - (PP (P à) - (NP (DET les) (NC révélations) - (PP (P de) - (NP (DET l') (NC hebdomadaire) - (AdP - (NP (DET quelques) (NC jours)) - (ADV plus) (ADV tôt)))))) - (PUNC .))) + (MWV (VERB fait) (NOUN suite))) + (PP (ADP à) + (NP (DET les) (NOUN révélations) + (PP (ADP de) + (NP (NOUN l’) + (AP (ADJ hebdomadaire)))))) + (NP (DET quelques) (NOUN jours)) + (AdP (ADV plus) (ADV tôt)) + (PUNCT .))) + + +Binary Constituency parse: +(ROOT + (SENT + (NP (DET Cette) + (MWN (NOUN enquête) (ADJ préliminaire))) + (@SENT + (@SENT + (@SENT + (@SENT + (VN + (MWV (VERB fait) (NOUN suite))) + (PP (ADP à) + (NP + (@NP (DET les) (NOUN révélations)) + (PP (ADP de) + (NP (NOUN l’) + (AP (ADJ hebdomadaire))))))) + (NP (DET quelques) (NOUN jours))) + (AdP (ADV plus) (ADV tôt))) + (PUNCT .)))) """ FRENCH_EXTRA_GOLD = """ @@ -120,12 +154,12 @@ Tokens: [Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ] [Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB] [Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN] -[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=39 PartOfSpeech=ADP] -[Text=les CharacterOffsetBegin=39 CharacterOffsetEnd=41 PartOfSpeech=DET] +[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP] +[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET] [Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN] [Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP] -[Text=l' CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=DET] -[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=NOUN] +[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN] +[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ] [Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET] [Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN] [Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV] @@ -137,15 +171,15 @@ root(ROOT-0, fait-4) det(enquête-2, Cette-1) nsubj(fait-4, enquête-2) amod(enquête-2, préliminaire-3) -dobj(fait-4, suite-5) +obj(fait-4, suite-5) case(révélations-8, à-6) det(révélations-8, les-7) -nmod:à(suite-5, révélations-8) -case(hebdomadaire-11, de-9) -det(hebdomadaire-11, l'-10) -nmod:de(révélations-8, hebdomadaire-11) +obl:à(fait-4, révélations-8) +case(l’-10, de-9) +nmod:de(révélations-8, l’-10) +amod(révélations-8, hebdomadaire-11) det(jours-13, quelques-12) -nmod(fait-4, jours-13) +obl(fait-4, jours-13) advmod(tôt-15, plus-14) advmod(jours-13, tôt-15) punct(fait-4, .-16) @@ -155,8 +189,9 @@ FRENCH_JSON_GOLD = json.loads(open(f'{TEST_WORKING_DIR}/out/example_french.json' ES_DOC = 'Andrés Manuel López Obrador es el presidente de México.' -ES_PROPS = {'annotators': 'tokenize,ssplit,pos,depparse', 'tokenize.language': 'es', - 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish/spanish-ud.tagger', +ES_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', 'tokenize.language': 'es', + 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger', + 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv', 'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz'} ES_PROPS_GOLD = """ @@ -168,7 +203,7 @@ Tokens: [Text=Manuel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN] [Text=López CharacterOffsetBegin=14 CharacterOffsetEnd=19 PartOfSpeech=PROPN] [Text=Obrador CharacterOffsetBegin=20 CharacterOffsetEnd=27 PartOfSpeech=PROPN] -[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=VERB] +[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=AUX] [Text=el CharacterOffsetBegin=31 CharacterOffsetEnd=33 PartOfSpeech=DET] [Text=presidente CharacterOffsetBegin=34 CharacterOffsetEnd=44 PartOfSpeech=NOUN] [Text=de CharacterOffsetBegin=45 CharacterOffsetEnd=47 PartOfSpeech=ADP] @@ -176,16 +211,16 @@ Tokens: [Text=. CharacterOffsetBegin=54 CharacterOffsetEnd=55 PartOfSpeech=PUNCT] Dependency Parse (enhanced plus plus dependencies): -root(ROOT-0, es-5) -nsubj(es-5, Andrés-1) -name(Andrés-1, Manuel-2) -name(Andrés-1, López-3) -name(Andrés-1, Obrador-4) +root(ROOT-0, presidente-7) +nsubj(presidente-7, Andrés-1) +flat(Andrés-1, Manuel-2) +flat(Andrés-1, López-3) +flat(Andrés-1, Obrador-4) +cop(presidente-7, es-5) det(presidente-7, el-6) -nsubj(es-5, presidente-7) case(México-9, de-8) nmod:de(presidente-7, México-9) -punct(es-5, .-10) +punct(presidente-7, .-10) """ @@ -237,14 +272,11 @@ def test_switching_back_and_forth(corenlp_client): def test_lang_setting(corenlp_client): """ Test using a Stanford CoreNLP supported languages as a properties key """ ann = corenlp_client.annotate(GERMAN_DOC, properties_key="german", output_format="text") - assert ann.strip() == GERMAN_DOC_GOLD.strip() + compare_ignoring_whitespace(ann, GERMAN_DOC_GOLD) def test_annotators_and_output_format(corenlp_client): """ Test setting the annotators and output_format """ ann = corenlp_client.annotate(FRENCH_DOC, properties=FRENCH_EXTRA_PROPS, - annotators="tokenize,ssplit,pos", output_format="json") + annotators="tokenize,ssplit,mwt,pos", output_format="json") assert FRENCH_JSON_GOLD == ann - - - |