diff options
Diffstat (limited to 'stanza/tests/test_data_conversion.py')
-rw-r--r-- | stanza/tests/test_data_conversion.py | 118 |
1 files changed, 118 insertions, 0 deletions
"""
Basic tests of the data conversion

Covers the conversions between the three representations of an annotated
document: lists of CoNLL-U fields, lists of per-token dicts, and Document.
"""
import pytest

import stanza
from stanza.utils.conll import CoNLL
from stanza.models.common.doc import Document
from stanza.tests import *

pytestmark = pytest.mark.pipeline

# data for testing
# One French sentence as a list of 10-column CoNLL-U rows.  Row '6-7' is a
# multi-word token ("du") which expands to the words 6 ("de") and 7 ("le").
CONLL = [[['1', 'Nous', 'il', 'PRON', '_', 'Number=Plur|Person=1|PronType=Prs', '3', 'nsubj', '_', 'start_char=0|end_char=4'],
          ['2', 'avons', 'avoir', 'AUX', '_', 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', '3', 'aux:tense', '_', 'start_char=5|end_char=10'],
          ['3', 'atteint', 'atteindre', 'VERB', '_', 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', '0', 'root', '_', 'start_char=11|end_char=18'],
          ['4', 'la', 'le', 'DET', '_', 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', '5', 'det', '_', 'start_char=19|end_char=21'],
          ['5', 'fin', 'fin', 'NOUN', '_', 'Gender=Fem|Number=Sing', '3', 'obj', '_', 'start_char=22|end_char=25'],
          ['6-7', 'du', '_', '_', '_', '_', '_', '_', '_', 'start_char=26|end_char=28'],
          ['6', 'de', 'de', 'ADP', '_', '_', '8', 'case', '_', '_'],
          ['7', 'le', 'le', 'DET', '_', 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', '8', 'det', '_', '_'],
          ['8', 'sentier', 'sentier', 'NOUN', '_', 'Gender=Masc|Number=Sing', '5', 'nmod', '_', 'start_char=29|end_char=36'],
          ['9', '.', '.', 'PUNCT', '_', '_', '3', 'punct', '_', 'start_char=36|end_char=37']]]


# The same sentence as per-token dicts.  Ids are tuples (two elements for the
# multi-word token) and fields which are '_' in CONLL are simply absent.
DICT = [[{'id': (1,), 'text': 'Nous', 'lemma': 'il', 'upos': 'PRON', 'feats': 'Number=Plur|Person=1|PronType=Prs', 'head': 3, 'deprel': 'nsubj', 'misc': 'start_char=0|end_char=4'},
         {'id': (2,), 'text': 'avons', 'lemma': 'avoir', 'upos': 'AUX', 'feats': 'Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin', 'head': 3, 'deprel': 'aux:tense', 'misc': 'start_char=5|end_char=10'},
         {'id': (3,), 'text': 'atteint', 'lemma': 'atteindre', 'upos': 'VERB', 'feats': 'Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part', 'head': 0, 'deprel': 'root', 'misc': 'start_char=11|end_char=18'},
         {'id': (4,), 'text': 'la', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Fem|Number=Sing|PronType=Art', 'head': 5, 'deprel': 'det', 'misc': 'start_char=19|end_char=21'},
         {'id': (5,), 'text': 'fin', 'lemma': 'fin', 'upos': 'NOUN', 'feats': 'Gender=Fem|Number=Sing', 'head': 3, 'deprel': 'obj', 'misc': 'start_char=22|end_char=25'},
         {'id': (6, 7), 'text': 'du', 'misc': 'start_char=26|end_char=28'},
         {'id': (6,), 'text': 'de', 'lemma': 'de', 'upos': 'ADP', 'head': 8, 'deprel': 'case'},
         {'id': (7,), 'text': 'le', 'lemma': 'le', 'upos': 'DET', 'feats': 'Definite=Def|Gender=Masc|Number=Sing|PronType=Art', 'head': 8, 'deprel': 'det'},
         {'id': (8,), 'text': 'sentier', 'lemma': 'sentier', 'upos': 'NOUN', 'feats': 'Gender=Masc|Number=Sing', 'head': 5, 'deprel': 'nmod', 'misc': 'start_char=29|end_char=36'},
         {'id': (9,), 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'head': 3, 'deprel': 'punct', 'misc': 'start_char=36|end_char=37'}]]


def test_conll_to_dict():
    """
    Test that the CoNLL field lists are converted to the expected dicts
    """
    dicts = CoNLL.convert_conll(CONLL)
    assert dicts == DICT


def test_dict_to_conll():
    """
    Test that the dicts are converted back to the expected CoNLL field lists
    """
    conll = CoNLL.convert_dict(DICT)
    assert conll == CONLL


def test_dict_to_doc_and_doc_to_dict():
    """
    Test the conversion from raw dict to Document and back
    This code path will first turn start_char|end_char into start_char & end_char fields in the Document
    That version to a dict will have separate fields for each of those
    Finally, the conversion from that dict to a list of conll entries should convert that back to misc
    """
    doc = Document(DICT)
    dicts_tupleid = []
    for sentence in doc.to_dict():
        items = []
        for item in sentence:
            # DICT uses tuple ids throughout, so wrap any id that came
            # back from the Document as a bare value
            if not isinstance(item['id'], tuple):
                item['id'] = (item['id'], )
            items.append(item)
        dicts_tupleid.append(items)
    # Convert the dicts which went through the Document, not DICT itself:
    # otherwise this would only repeat test_dict_to_conll
    conll = CoNLL.convert_dict(dicts_tupleid)
    assert conll == CONLL


# The columns of the token rows are written with explicit \t escapes:
# CoNLL-U fields must be separated by tabs, and literal tabs are easily
# turned into spaces by editors and other tools.
RUSSIAN_SAMPLE = """
# sent_id = yandex.reviews-f-8xh5zqnmwak3t6p68y4rhwd4e0-1969-9253
# genre = review
# text = Как- то слишком мало цветов получают актёры после спектакля.
1\tКак\tкак-то\tADV\t_\tDegree=Pos|PronType=Ind\t7\tadvmod\t_\tSpaceAfter=No
2\t-\t-\tPUNCT\t_\t_\t3\tpunct\t_\t_
3\tто\tто\tPART\t_\t_\t1\tlist\t_\tdeprel=list:goeswith
4\tслишком\tслишком\tADV\t_\tDegree=Pos\t5\tadvmod\t_\t_
5\tмало\tмало\tADV\t_\tDegree=Pos\t6\tadvmod\t_\t_
6\tцветов\tцветок\tNOUN\t_\tAnimacy=Inan|Case=Gen|Gender=Masc|Number=Plur\t7\tobj\t_\t_
7\tполучают\tполучать\tVERB\t_\tAspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act\t0\troot\t_\t_
8\tактёры\tактер\tNOUN\t_\tAnimacy=Anim|Case=Nom|Gender=Masc|Number=Plur\t7\tnsubj\t_\t_
9\tпосле\tпосле\tADP\t_\t_\t10\tcase\t_\t_
10\tспектакля\tспектакль\tNOUN\t_\tAnimacy=Inan|Case=Gen|Gender=Masc|Number=Sing\t7\tobl\t_\tSpaceAfter=No
11\t.\t.\tPUNCT\t_\t_\t7\tpunct\t_\t_
""".strip()


def test_doc_with_comments():
    """
    Test that a doc with comments gets converted back with comments
    """
    lines = RUSSIAN_SAMPLE.split("\n")
    expected_comments = lines[:3]

    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    assert len(doc.sentences) == 1
    comments = doc.sentences[0].comments
    assert len(comments) == 3
    for expected, comment in zip(expected_comments, comments):
        assert expected == comment

    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1

    # 3 comment lines followed by 11 token lines
    sentence = sentences[0]
    assert len(sentence) == 14
    for expected, line in zip(expected_comments, sentence):
        assert expected == line


def test_unusual_misc():
    """
    The above RUSSIAN_SAMPLE resulted in a blank misc field in one particular implementation of the conll code
    (the below test would fail)
    """
    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1
    assert len(sentences[0]) == 14

    for word in sentences[0]:
        pieces = word.split("\t")
        # comment lines have no tabs; token lines have exactly 10 columns
        assert len(pieces) in (1, 10)
        if len(pieces) == 10:
            # no column may be blank: an empty field is written as '_'
            assert all(pieces)