scripts/training/wrappers/senna2brackets.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

#!/usr/bin/env python

# Read SENNA output (from stdin), extract the parse trees, and write them in
# PTB-style bracketed format (to stdout).
#
# The SENNA output is assumed to contain tokens in the first column, POS tags
# in the second column, and PSG fragments in the final column.
#
# It is also assumed that SENNA was run through the parse-en-senna.perl wrapper,
# which:
#
#   - Substitutes the special "SENTENCE_TOO_LONG" token for sentences that
#     exceed SENNA's hardcoded limit.
#
#   - Replaces the bracket-like tokens "-LRB-", "-RRB-", etc. with "(", ")",
#     etc.

import optparse
import os
import sys

# Map the literal bracket characters (as substituted by parse-en-senna.perl)
# back to their PTB escape tokens.
_PTB_BRACKETS = {
    "(": "-LRB-",
    ")": "-RRB-",
    "[": "-LSB-",
    "]": "-RSB-",
    "{": "-LCB-",
    "}": "-RCB-",
}


def _write_tree(tree, line_num, berkeley):
    """Validate, format and write a single sentence's tree to stdout.

    tree     -- the concatenated bracket fragments for one sentence (may be
                empty, e.g. for a SENTENCE_TOO_LONG placeholder).
    line_num -- input line number at which the sentence ended; only used in
                the warning message.
    berkeley -- if true, post-process the tree with berkelify().

    A tree with unbalanced parentheses is reported via warn() and replaced by
    the empty tree, so exactly one output line is written per sentence.
    """
    if not balanced(tree):
        warn("unbalanced parentheses in tree ending at line %d: "
             "discarding tree" % line_num)
        tree = ""
    tree = beautify(tree)
    if berkeley:
        tree = berkelify(tree)
    # sys.stdout.write (rather than the print statement) keeps the script
    # runnable under both Python 2 and Python 3.
    sys.stdout.write(tree + "\n")


def main():
    """Read SENNA output from stdin and write one bracketed tree per sentence.

    Sentences are delimited by blank lines.  For every other line the first
    column is taken as the token, the second as the POS tag and the last as
    the parse fragment, in which "*" marks where the "(POS token)" leaf goes.
    Lines whose token is SENTENCE_TOO_LONG contribute nothing, so such a
    sentence yields an empty tree.
    """
    usage = "usage: %prog [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("--berkeley-style", action="store_true", default=False,
                      dest="berkeley",
                      help="mimic the Berkeley Parser's output format")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("incorrect number of arguments")

    fragments = []
    # True once any non-blank line has been seen since the last delimiter.
    pending = False
    line_num = 0
    for line in sys.stdin:
        line_num += 1
        tokens = line.split()
        # A blank line is the sentence delimiter.
        if not tokens:
            _write_tree("".join(fragments), line_num, options.berkeley)
            fragments = []
            pending = False
            continue
        pending = True
        word = tokens[0]
        # Check for the special "SENTENCE_TOO_LONG" token (see
        # parse-en-senna.perl) before touching the other columns, so that a
        # bare marker line cannot trigger an IndexError.
        if word == "SENTENCE_TOO_LONG":
            continue
        pos, frag = tokens[1], tokens[-1]
        # Restore -LRB-, -RRB-, etc.
        word = _PTB_BRACKETS.get(word, word)
        fragments.append(frag.replace("*", "(%s %s)" % (pos, word)))

    # Do not silently drop a final sentence that is not followed by a blank
    # line.
    if pending:
        _write_tree("".join(fragments), line_num, options.berkeley)

def balanced(s):
    """Return True if the parentheses in s are properly balanced.

    Every ")" must close an earlier, still-open "(" and no "(" may be left
    open at the end.  Characters other than "(" and ")" are ignored.  Merely
    having equal counts is not enough: ")(" is not balanced.
    """
    depth = 0
    for char in s:
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            # A closing parenthesis with nothing open can never be repaired
            # by later characters.
            if depth < 0:
                return False
    return depth == 0

def beautify(tree):
    """Put a space before every "(" in tree and trim surrounding whitespace."""
    # Splitting on "(" and re-joining with " (" prefixes each opening
    # parenthesis with a single space.
    pieces = tree.split("(")
    spaced = " (".join(pieces)
    return spaced.strip()

def berkelify(tree):
    """Rewrite tree with its root label replaced by "TOP".

    An empty tree is returned as "(())".  Otherwise tree must start with "("
    and contain a " (" after the root label (AssertionError if not); the root
    label is the text between the opening "(" and that first " (".

    Only the root label is replaced: tokens or inner labels that happen to
    contain the same text are left untouched.
    """
    if tree == "":
        return "(())"
    assert tree[0] == "("
    pos = tree.find(" (", 1)
    assert pos != -1
    # Splice in the new label by position instead of using str.replace, which
    # would also rewrite every later occurrence of the old root label.
    return "(TOP" + tree[pos:]

def warn(msg):
    """Write msg to stderr as a warning prefixed with the script's file name."""
    # os.path.split()[1] is the final path component of the invoked script.
    script = os.path.split(sys.argv[0])[1]
    text = "%s: warning: %s\n" % (script, msg)
    sys.stderr.write(text)

# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()