# stanza/tests/constituency/test_tree_reader.py

import pytest
from stanza.models.constituency import tree_reader

from stanza.tests import *

# pytest applies the markers listed in a module-level `pytestmark` to every
# test in this module, so each test below is tagged `pipeline` and `travis`.
pytestmark = [pytest.mark.pipeline, pytest.mark.travis]

def test_simple():
    """
    Two preterminal trees in one string should both be read

    Each tree is checked for its tag and for the word underneath it.
    """
    parsed = tree_reader.read_trees("(VB Unban) (NNP Opal)")
    assert len(parsed) == 2

    # (tag, word) expected for each tree, in the order they appear in the text
    expected = [('VB', 'Unban'), ('NNP', 'Opal')]
    for tree, (tag, word) in zip(parsed, expected):
        assert tree.is_preterminal()
        assert tree.label == tag
        assert tree.children[0].label == word

def test_newlines():
    """
    The same trees should be read when newlines separate them

    Uses the text from test_simple with a blank line between the two
    trees, and checks the contents of each tree rather than only the
    count, so a reader that split or merged tokens around the newlines
    would be caught.
    """
    text = "(VB Unban)\n\n(NNP Opal)"
    trees = tree_reader.read_trees(text)
    assert len(trees) == 2

    # same expectations as the single-line version of this text
    expected = [('VB', 'Unban'), ('NNP', 'Opal')]
    for tree, (tag, word) in zip(trees, expected):
        assert tree.is_preterminal()
        assert tree.label == tag
        assert tree.children[0].label == word

def test_complicated():
    """
    A deeper, nested tree should be read as a single tree

    The text wraps the SBARQ in an outer bracket with no label; the
    resulting top node is expected to be labeled ROOT.
    """
    bracketed = "( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
    parsed = tree_reader.read_trees(bracketed)
    assert len(parsed) == 1

    root = parsed[0]
    assert not root.is_leaf()
    assert not root.is_preterminal()
    assert root.label == 'ROOT'

    # the top node should hold exactly one child, the SBARQ
    assert len(root.children) == 1
    sbarq = root.children[0]
    assert sbarq.label == 'SBARQ'

    assert len(sbarq.children) == 3
    assert [child.label for child in sbarq.children] == ['WHNP', 'SQ', '.']
    # the deeper levels of the tree are not checked here

def test_one_word():
    """
    Trees made of a single bare node should be read as leaves

    probably not super relevant for the parsing use case
    """
    parsed = tree_reader.read_trees("(FOO) (BAR)")
    assert len(parsed) == 2

    # each tree should be a leaf carrying the bracketed text as its label
    for tree, expected_label in zip(parsed, ('FOO', 'BAR')):
        assert tree.is_leaf()
        assert tree.label == expected_label