Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'stanza/tests/test_server_request.py')
-rw-r--r-- stanza/tests/test_server_request.py 223
1 files changed, 223 insertions, 0 deletions
diff --git a/stanza/tests/test_server_request.py b/stanza/tests/test_server_request.py
new file mode 100644
index 00000000..6edf670a
--- /dev/null
+++ b/stanza/tests/test_server_request.py
@@ -0,0 +1,223 @@
+"""
+Tests for setting request properties of servers
+"""
+
+import json
+import pytest
+import stanza.server as corenlp
+
+from stanza.protobuf import Document
+from stanza.tests import TEST_WORKING_DIR, compare_ignoring_whitespace
+
+# Module-level pytest marker: applies the "client" mark to every test in this file.
+pytestmark = pytest.mark.client
+
+# English sentence that test_basic sends to the server.
+EN_DOC = "Joe Smith lives in California."
+
+# results with an example properties file
+# test_basic compares this (after strip()) with the output_format="text"
+# response for EN_DOC.
+EN_DOC_GOLD = """
+Sentence #1 (6 tokens):
+Joe Smith lives in California.
+
+Tokens:
+[Text=Joe CharacterOffsetBegin=0 CharacterOffsetEnd=3 PartOfSpeech=NNP]
+[Text=Smith CharacterOffsetBegin=4 CharacterOffsetEnd=9 PartOfSpeech=NNP]
+[Text=lives CharacterOffsetBegin=10 CharacterOffsetEnd=15 PartOfSpeech=VBZ]
+[Text=in CharacterOffsetBegin=16 CharacterOffsetEnd=18 PartOfSpeech=IN]
+[Text=California CharacterOffsetBegin=19 CharacterOffsetEnd=29 PartOfSpeech=NNP]
+[Text=. CharacterOffsetBegin=29 CharacterOffsetEnd=30 PartOfSpeech=.]
+"""
+
+# German sentence that test_lang_setting sends with properties="german".
+GERMAN_DOC = "Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland."
+
+# Expected text-format response for GERMAN_DOC; test_lang_setting checks it
+# with compare_ignoring_whitespace rather than strict string equality.
+GERMAN_DOC_GOLD = """
+Sentence #1 (10 tokens):
+Angela Merkel ist seit 2005 Bundeskanzlerin der Bundesrepublik Deutschland.
+
+Tokens:
+[Text=Angela CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN]
+[Text=Merkel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN]
+[Text=ist CharacterOffsetBegin=14 CharacterOffsetEnd=17 PartOfSpeech=AUX]
+[Text=seit CharacterOffsetBegin=18 CharacterOffsetEnd=22 PartOfSpeech=ADP]
+[Text=2005 CharacterOffsetBegin=23 CharacterOffsetEnd=27 PartOfSpeech=NUM]
+[Text=Bundeskanzlerin CharacterOffsetBegin=28 CharacterOffsetEnd=43 PartOfSpeech=NOUN]
+[Text=der CharacterOffsetBegin=44 CharacterOffsetEnd=47 PartOfSpeech=DET]
+[Text=Bundesrepublik CharacterOffsetBegin=48 CharacterOffsetEnd=62 PartOfSpeech=PROPN]
+[Text=Deutschland CharacterOffsetBegin=63 CharacterOffsetEnd=74 PartOfSpeech=PROPN]
+[Text=. CharacterOffsetBegin=74 CharacterOffsetEnd=75 PartOfSpeech=PUNCT]
+"""
+
+# Request properties that test_python_dict passes for FRENCH_DOC.
+# The 'outputFormat' entry is part of the properties themselves, so that test
+# does not pass a separate output_format argument for the French request.
+FRENCH_CUSTOM_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,parse',
+ 'tokenize.language': 'fr',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+ 'parse.model': 'edu/stanford/nlp/models/srparser/frenchSR.ser.gz',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+ 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+ 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+ 'mwt.preserveCasing': 'false',
+ 'outputFormat': 'text'}
+
+# Request properties that test_annotators_and_output_format passes for
+# FRENCH_DOC; that test additionally supplies its own annotators= and
+# output_format= arguments alongside this dictionary.
+FRENCH_EXTRA_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse',
+ 'tokenize.language': 'fr',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/french-ud.tagger',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt.tsv',
+ 'mwt.pos.model': 'edu/stanford/nlp/models/mwt/french/french-mwt.tagger',
+ 'mwt.statisticalMappingFile': 'edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv',
+ 'mwt.preserveCasing': 'false',
+ 'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_French.gz'}
+
+# French sentence used by test_python_dict and test_annotators_and_output_format.
+FRENCH_DOC = "Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt."
+
+# Expected text-format response for FRENCH_DOC with FRENCH_CUSTOM_PROPS;
+# test_python_dict compares it with strict equality after strip().
+# NOTE(review): the constituency tree indentation looks collapsed in this copy,
+# and the strict comparison depends on inner whitespace - confirm against upstream.
+FRENCH_CUSTOM_GOLD = """
+Sentence #1 (16 tokens):
+Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt.
+
+Tokens:
+[Text=Cette CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=DET]
+[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NOUN]
+[Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
+[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
+[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
+[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
+[Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
+[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
+[Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
+[Text=tôt CharacterOffsetBegin=92 CharacterOffsetEnd=95 PartOfSpeech=ADV]
+[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNCT]
+
+Constituency parse:
+(ROOT
+ (SENT
+ (NP (DET Cette)
+ (MWN (NOUN enquête) (ADJ préliminaire)))
+ (VN
+ (MWV (VERB fait) (NOUN suite)))
+ (PP (ADP à)
+ (NP (DET les) (NOUN révélations)
+ (PP (ADP de)
+ (NP (NOUN l’)
+ (AP (ADJ hebdomadaire))))))
+ (NP (DET quelques) (NOUN jours))
+ (AdP (ADV plus) (ADV tôt))
+ (PUNCT .)))
+"""
+
+# Text-format gold for the French sentence with a dependency parse section.
+# NOTE(review): no test in this file references this constant; presumably it
+# pairs with FRENCH_EXTRA_PROPS - confirm whether it is still needed.
+FRENCH_EXTRA_GOLD = """
+Sentence #1 (16 tokens):
+Cette enquête préliminaire fait suite aux révélations de l’hebdomadaire quelques jours plus tôt.
+
+Tokens:
+[Text=Cette CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=DET]
+[Text=enquête CharacterOffsetBegin=6 CharacterOffsetEnd=13 PartOfSpeech=NOUN]
+[Text=préliminaire CharacterOffsetBegin=14 CharacterOffsetEnd=26 PartOfSpeech=ADJ]
+[Text=fait CharacterOffsetBegin=27 CharacterOffsetEnd=31 PartOfSpeech=VERB]
+[Text=suite CharacterOffsetBegin=32 CharacterOffsetEnd=37 PartOfSpeech=NOUN]
+[Text=à CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=ADP]
+[Text=les CharacterOffsetBegin=38 CharacterOffsetEnd=41 PartOfSpeech=DET]
+[Text=révélations CharacterOffsetBegin=42 CharacterOffsetEnd=53 PartOfSpeech=NOUN]
+[Text=de CharacterOffsetBegin=54 CharacterOffsetEnd=56 PartOfSpeech=ADP]
+[Text=l’ CharacterOffsetBegin=57 CharacterOffsetEnd=59 PartOfSpeech=NOUN]
+[Text=hebdomadaire CharacterOffsetBegin=59 CharacterOffsetEnd=71 PartOfSpeech=ADJ]
+[Text=quelques CharacterOffsetBegin=72 CharacterOffsetEnd=80 PartOfSpeech=DET]
+[Text=jours CharacterOffsetBegin=81 CharacterOffsetEnd=86 PartOfSpeech=NOUN]
+[Text=plus CharacterOffsetBegin=87 CharacterOffsetEnd=91 PartOfSpeech=ADV]
+[Text=tôt CharacterOffsetBegin=92 CharacterOffsetEnd=95 PartOfSpeech=ADV]
+[Text=. CharacterOffsetBegin=95 CharacterOffsetEnd=96 PartOfSpeech=PUNCT]
+
+Dependency Parse (enhanced plus plus dependencies):
+root(ROOT-0, fait-4)
+det(enquête-2, Cette-1)
+nsubj(fait-4, enquête-2)
+amod(enquête-2, préliminaire-3)
+obj(fait-4, suite-5)
+case(révélations-8, à-6)
+det(révélations-8, les-7)
+obl:à(fait-4, révélations-8)
+case(l’-10, de-9)
+nmod:de(révélations-8, l’-10)
+amod(révélations-8, hebdomadaire-11)
+det(jours-13, quelques-12)
+obl(fait-4, jours-13)
+advmod(tôt-15, plus-14)
+advmod(jours-13, tôt-15)
+punct(fait-4, .-16)
+"""
+
+# Expected JSON response for FRENCH_DOC, compared in test_annotators_and_output_format.
+# Loaded once at import time. The file is opened in a `with` block so the handle
+# is closed deterministically, and decoded explicitly as UTF-8 so the accented
+# French text does not depend on the platform's default locale encoding.
+with open(f'{TEST_WORKING_DIR}/out/example_french.json', encoding='utf-8') as fin:
+    FRENCH_JSON_GOLD = json.load(fin)
+
+# Spanish sentence that test_python_dict sends together with ES_PROPS.
+ES_DOC = 'Andrés Manuel López Obrador es el presidente de México.'
+
+# Request properties for the Spanish request in test_python_dict. No output
+# format is set here; the test passes output_format="text" separately.
+ES_PROPS = {'annotators': 'tokenize,ssplit,mwt,pos,depparse', 'tokenize.language': 'es',
+ 'pos.model': 'edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger',
+ 'mwt.mappingFile': 'edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv',
+ 'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz'}
+
+# Expected text-format response for ES_DOC with ES_PROPS; compared with strict
+# equality after strip() in test_python_dict.
+ES_PROPS_GOLD = """
+Sentence #1 (10 tokens):
+Andrés Manuel López Obrador es el presidente de México.
+
+Tokens:
+[Text=Andrés CharacterOffsetBegin=0 CharacterOffsetEnd=6 PartOfSpeech=PROPN]
+[Text=Manuel CharacterOffsetBegin=7 CharacterOffsetEnd=13 PartOfSpeech=PROPN]
+[Text=López CharacterOffsetBegin=14 CharacterOffsetEnd=19 PartOfSpeech=PROPN]
+[Text=Obrador CharacterOffsetBegin=20 CharacterOffsetEnd=27 PartOfSpeech=PROPN]
+[Text=es CharacterOffsetBegin=28 CharacterOffsetEnd=30 PartOfSpeech=AUX]
+[Text=el CharacterOffsetBegin=31 CharacterOffsetEnd=33 PartOfSpeech=DET]
+[Text=presidente CharacterOffsetBegin=34 CharacterOffsetEnd=44 PartOfSpeech=NOUN]
+[Text=de CharacterOffsetBegin=45 CharacterOffsetEnd=47 PartOfSpeech=ADP]
+[Text=México CharacterOffsetBegin=48 CharacterOffsetEnd=54 PartOfSpeech=PROPN]
+[Text=. CharacterOffsetBegin=54 CharacterOffsetEnd=55 PartOfSpeech=PUNCT]
+
+Dependency Parse (enhanced plus plus dependencies):
+root(ROOT-0, presidente-7)
+nsubj(presidente-7, Andrés-1)
+flat(Andrés-1, Manuel-2)
+flat(Andrés-1, López-3)
+flat(Andrés-1, Obrador-4)
+cop(presidente-7, es-5)
+det(presidente-7, el-6)
+case(México-9, de-8)
+nmod:de(presidente-7, México-9)
+punct(presidente-7, .-10)
+"""
+
+
+@pytest.fixture(scope="module")
+def corenlp_client():
+    """
+    Module-scoped CoreNLP client shared by the tests in this file.
+
+    The client is built with the tokenize, ssplit and pos annotators, handed
+    to the tests, and stopped when the fixture is torn down.
+    """
+    shared_client = corenlp.CoreNLPClient(
+        annotators='tokenize,ssplit,pos',
+        server_id='stanza_request_tests_server',
+    )
+    yield shared_client
+    # Teardown: runs after the tests using this fixture are done.
+    shared_client.stop()
+
+
+def test_basic(corenlp_client):
+    """
+    Basic request test.
+
+    With output_format="text" the response must match EN_DOC_GOLD once
+    surrounding whitespace is stripped; with no output_format given, the
+    response must be a protobuf Document.
+    """
+    text_response = corenlp_client.annotate(EN_DOC, output_format="text")
+    assert text_response.strip() == EN_DOC_GOLD.strip()
+
+    default_response = corenlp_client.annotate(EN_DOC)
+    assert isinstance(default_response, Document)
+
+
+def test_python_dict(corenlp_client):
+    """
+    Test using a Python dictionary to specify all request properties.
+
+    Runs a Spanish request and then a French request, each configured by a
+    properties dictionary, and compares the text output with its gold string.
+    """
+    spanish_text = corenlp_client.annotate(ES_DOC, properties=ES_PROPS, output_format="text")
+    assert spanish_text.strip() == ES_PROPS_GOLD.strip()
+
+    # FRENCH_CUSTOM_PROPS carries its own 'outputFormat': 'text' entry, so no
+    # output_format argument is passed for this request.
+    french_text = corenlp_client.annotate(FRENCH_DOC, properties=FRENCH_CUSTOM_PROPS)
+    assert french_text.strip() == FRENCH_CUSTOM_GOLD.strip()
+
+
+def test_lang_setting(corenlp_client):
+    """
+    Test passing a language name as the properties argument.
+
+    Sends GERMAN_DOC with properties="german" and checks the text output
+    against GERMAN_DOC_GOLD, ignoring whitespace differences.
+    """
+    german_text = corenlp_client.annotate(
+        GERMAN_DOC,
+        properties="german",
+        output_format="text",
+    )
+    compare_ignoring_whitespace(german_text, GERMAN_DOC_GOLD)
+
+
+def test_annotators_and_output_format(corenlp_client):
+    """
+    Test setting the annotators and output_format alongside a properties dict.
+
+    The request passes FRENCH_EXTRA_PROPS plus explicit annotators= and
+    output_format="json" arguments; the response must equal FRENCH_JSON_GOLD.
+    """
+    request_options = {
+        'properties': FRENCH_EXTRA_PROPS,
+        'annotators': "tokenize,ssplit,mwt,pos",
+        'output_format': "json",
+    }
+    json_response = corenlp_client.annotate(FRENCH_DOC, **request_options)
+    assert FRENCH_JSON_GOLD == json_response