Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'stanza/tests/test_data_conversion.py')
-rw-r--r-- stanza/tests/test_data_conversion.py 118
1 files changed, 118 insertions, 0 deletions
diff --git a/stanza/tests/test_data_conversion.py b/stanza/tests/test_data_conversion.py
new file mode 100644
index 00000000..a498143f
--- /dev/null
+++ b/stanza/tests/test_data_conversion.py
@@ -0,0 +1,118 @@
+"""
+Basic tests of the data conversion
+"""
+import pytest
+
+import stanza
+from stanza.utils.conll import CoNLL
+from stanza.models.common.doc import Document
+from stanza.tests import *
+
+pytestmark = pytest.mark.pipeline
+
+# data for testing
+# CONLL holds one sentence as a list of rows; each row is a list of 10 string fields.
+# The '6-7' row is a multi-word token ('du') followed by its two words, 'de' (6) and 'le' (7).
+# Character offsets are stored in the last field as 'start_char=..|end_char=..';
+# in this fixture the two words of the multi-word token have '_' there instead.
+CONLL = [[['1', 'Nous', 'il', 'PRON', '_', 'Number=Plur|Person=1|PronType=Prs', '3', 'nsubj', '_', 'start_char=0|end_char=4'],
+ ['2', 'avons', 'avoir', 'AUX', '_', 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', '3', 'aux:tense', '_', 'start_char=5|end_char=10'],
+ ['3', 'atteint', 'atteindre', 'VERB', '_', 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', '0', 'root', '_', 'start_char=11|end_char=18'],
+ ['4', 'la', 'le', 'DET', '_', 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', '5', 'det', '_', 'start_char=19|end_char=21'],
+ ['5', 'fin', 'fin', 'NOUN', '_', 'Gender=Fem|Number=Sing', '3', 'obj', '_', 'start_char=22|end_char=25'],
+ ['6-7', 'du', '_', '_', '_', '_', '_', '_', '_', 'start_char=26|end_char=28'],
+ ['6', 'de', 'de', 'ADP', '_', '_', '8', 'case', '_', '_'],
+ ['7', 'le', 'le', 'DET', '_', 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', '8', 'det', '_', '_'],
+ ['8', 'sentier', 'sentier', 'NOUN', '_', 'Gender=Masc|Number=Sing', '5', 'nmod', '_', 'start_char=29|end_char=36'],
+ ['9', '.', '.', 'PUNCT', '_', '_', '3', 'punct', '_', 'start_char=36|end_char=37']]]
+
+
+# DICT is the dict form of the same sentence as CONLL: one dict per row.
+# 'id' is a tuple (two elements for the multi-word token, one otherwise), 'head' is an int,
+# and fields whose CONLL value is '_' are simply absent from the dict.
+DICT = [[{'id': (1,), 'text': 'Nous', 'lemma': 'il', 'upos': 'PRON', 'feats': 'Number=Plur|Person=1|PronType=Prs', 'head': 3, 'deprel': 'nsubj', 'misc': 'start_char=0|end_char=4'},
+ {'id': (2,), 'text': 'avons', 'lemma': 'avoir', 'upos': 'AUX', 'feats': 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', 'head': 3, 'deprel': 'aux:tense', 'misc': 'start_char=5|end_char=10'},
+ {'id': (3,), 'text': 'atteint', 'lemma': 'atteindre', 'upos': 'VERB', 'feats': 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', 'head': 0, 'deprel': 'root', 'misc': 'start_char=11|end_char=18'},
+ {'id': (4,), 'text': 'la', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', 'head': 5, 'deprel': 'det', 'misc': 'start_char=19|end_char=21'},
+ {'id': (5,), 'text': 'fin', 'lemma': 'fin', 'upos': 'NOUN', 'feats': 'Gender=Fem|Number=Sing', 'head': 3, 'deprel': 'obj', 'misc': 'start_char=22|end_char=25'},
+ {'id': (6, 7), 'text': 'du', 'misc': 'start_char=26|end_char=28'},
+ {'id': (6,), 'text': 'de', 'lemma': 'de', 'upos': 'ADP', 'head': 8, 'deprel': 'case'},
+ {'id': (7,), 'text': 'le', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', 'head': 8, 'deprel': 'det'},
+ {'id': (8,), 'text': 'sentier', 'lemma': 'sentier', 'upos': 'NOUN', 'feats': 'Gender=Masc|Number=Sing', 'head': 5, 'deprel': 'nmod', 'misc': 'start_char=29|end_char=36'},
+ {'id': (9,), 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'head': 3, 'deprel': 'punct', 'misc': 'start_char=36|end_char=37'}]]
+
+def test_conll_to_dict():
+    """
+    Converting the CONLL rows with CoNLL.convert_conll should yield DICT
+    """
+    assert CoNLL.convert_conll(CONLL) == DICT
+
+def test_dict_to_conll():
+    """
+    Converting DICT with CoNLL.convert_dict should yield the CONLL rows
+    """
+    assert CoNLL.convert_dict(DICT) == CONLL
+
+def test_dict_to_doc_and_doc_to_dict():
+    """
+    Test the conversion from raw dict to Document and back
+    This code path will first turn start_char|end_char into start_char & end_char fields in the Document
+    That version to a dict will have separate fields for each of those
+    Finally, the conversion from that dict to a list of conll entries should convert that back to misc
+    """
+    doc = Document(DICT)
+    dicts = doc.to_dict()
+    # Normalize every id to the tuple form used by DICT: an id which is not
+    # already a tuple is wrapped in a one-element tuple.
+    dicts_tupleid = [
+        [{**item, 'id': item['id'] if isinstance(item['id'], tuple) else (item['id'],)}
+         for item in sentence]
+        for sentence in dicts
+    ]
+    # Convert the round-tripped dicts, not the original DICT fixture; otherwise
+    # the Document -> dict leg of the round trip is never actually checked.
+    conll = CoNLL.convert_dict(dicts_tupleid)
+    assert conll == CONLL
+
+# A single Russian sentence in CoNLL text form: three '#' comment lines followed
+# by eleven token lines, i.e. 14 lines once the surrounding whitespace is stripped.
+# NOTE(review): test_unusual_misc splits each line on "\t" and expects 10 fields,
+# so the columns are presumably tab-separated in the real file; they show up as
+# single spaces in this rendering -- verify before copying this text.
+RUSSIAN_SAMPLE="""
+# sent_id = yandex.reviews-f-8xh5zqnmwak3t6p68y4rhwd4e0-1969-9253
+# genre = review
+# text = Как- то слишком мало цветов получают актёры после спектакля.
+1 Как как-то ADV _ Degree=Pos|PronType=Ind 7 advmod _ SpaceAfter=No
+2 - - PUNCT _ _ 3 punct _ _
+3 то то PART _ _ 1 list _ deprel=list:goeswith
+4 слишком слишком ADV _ Degree=Pos 5 advmod _ _
+5 мало мало ADV _ Degree=Pos 6 advmod _ _
+6 цветов цветок NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur 7 obj _ _
+7 получают получать VERB _ Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ _
+8 актёры актер NOUN _ Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur 7 nsubj _ _
+9 после после ADP _ _ 10 case _ _
+10 спектакля спектакль NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 7 obl _ SpaceAfter=No
+11 . . PUNCT _ _ 7 punct _ _
+""".strip()
+
+
+def test_doc_with_comments():
+    """
+    Check that the comment lines of a document survive the trip
+    from conll text to Document and back to conll lines
+    """
+    source_lines = RUSSIAN_SAMPLE.split("\n")
+
+    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
+    assert len(doc.sentences) == 1
+    comments = doc.sentences[0].comments
+    assert len(comments) == 3
+    # the first three lines of the sample are its comments
+    for idx in range(3):
+        assert source_lines[idx] == comments[idx]
+
+    converted = CoNLL.doc2conll(doc)
+    assert len(converted) == 1
+
+    # 3 comment lines + 11 token lines
+    first_sentence = converted[0]
+    assert len(first_sentence) == 14
+    for idx in range(3):
+        assert source_lines[idx] == first_sentence[idx]
+
+def test_unusual_misc():
+    """
+    Regression check for the misc column of RUSSIAN_SAMPLE
+
+    One particular implementation of the conll code produced a blank misc field
+    for this sample, which would make the check below fail
+    """
+    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
+    converted = CoNLL.doc2conll(doc)
+    assert len(converted) == 1
+    assert len(converted[0]) == 14
+
+    for line in converted[0]:
+        fields = line.split("\t")
+        # a line is either a single untabbed piece or a full 10 column row
+        assert len(fields) in (1, 10)
+        if len(fields) == 1:
+            continue
+        # no column of a 10 field row may be the empty string
+        assert all(fields)