1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
|
from collections import deque
import subprocess
from stanza.models.constituency.parse_tree import Tree
from stanza.protobuf import FlattenedParseTree
from stanza.server.client import resolve_classpath
def send_request(request, response_type, java_main, classpath=None):
    """
    Run a Java protobuf processor on the given request in a subprocess.

    The request proto is serialized and piped to the Java program's stdin;
    the program's stdout is parsed into a fresh instance of response_type,
    which is returned.  Raises CalledProcessError if java exits nonzero.
    """
    command = ["java", "-cp", resolve_classpath(classpath), java_main]
    serialized = request.SerializeToString()
    result = subprocess.run(command,
                            input=serialized,
                            stdout=subprocess.PIPE,
                            check=True)
    response = response_type()
    response.ParseFromString(result.stdout)
    return response
def add_tree_nodes(proto_tree, tree, score):
    """
    Recursively append the flattened encoding of `tree` to proto_tree.nodes.

    The encoding for one constituent is: an open marker (carrying `score`
    when it is not None), a value node holding the label, the encodings of
    the children in order (leaves become a single value node, subtrees
    recurse), and finally a close marker.
    """
    open_marker = proto_tree.nodes.add()
    open_marker.openNode = True
    if score is not None:
        open_marker.score = score

    label_node = proto_tree.nodes.add()
    label_node.value = tree.label

    for child in tree.children:
        if not child.is_leaf():
            # only the root carries a score, so children get None
            add_tree_nodes(proto_tree, child, None)
        else:
            leaf_node = proto_tree.nodes.add()
            leaf_node.value = child.label

    close_marker = proto_tree.nodes.add()
    close_marker.closeNode = True
def build_tree(tree, score):
    """
    Build a FlattenedParseTree (from CoreNLP.proto) out of a tree object.

    Reads the value field from tree.label and walks tree.children, so any
    tree type following that layout will work.  When `score` is not None
    it is attached to the top node.  The actual traversal is done by
    recursively calling add_tree_nodes.
    """
    flattened = FlattenedParseTree()
    add_tree_nodes(flattened, tree, score)
    return flattened
def from_tree(proto_tree):
    """
    Convert a FlattenedParseTree back into a Tree

    returns Tree, score
    (score might be None if it is missing)

    Raises ValueError for malformed protos: an open marker with no label,
    too many close markers, an open immediately followed by a close,
    unclosed nodes, or a proto containing only an open marker.
    """
    score = None
    # stack holds a mix of raw proto Nodes (open markers) and finished Tree
    # objects (labels, leaves, and completed constituents)
    stack = deque()
    for node in proto_tree.nodes:
        # keep the first score encountered as the tree's overall score
        if node.HasField("score") and score is None:
            score = node.score
        if node.openNode:
            # two open markers in a row means the earlier one never got
            # its label node
            if len(stack) > 0 and isinstance(stack[-1], FlattenedParseTree.Node) and stack[-1].openNode:
                raise ValueError("Got a proto with no label on a node: {}".format(proto_tree))
            stack.append(node)
            continue
        if not node.closeNode:
            # a plain value node: either a constituent label or a leaf word
            child = Tree(label=node.value)
            # TODO: do something with the score
            stack.append(child)
            continue
        # must be a close operation...
        if len(stack) <= 1:
            raise ValueError("Got a proto with too many close operations: {}".format(proto_tree))
        # on a close operation, pop until we hit the open
        # then turn everything in that span into a new node
        children = []
        nextNode = stack.pop()
        # raw proto Nodes only appear on the stack as open markers, so the
        # isinstance test finds the matching open
        while not isinstance(nextNode, FlattenedParseTree.Node):
            children.append(nextNode)
            nextNode = stack.pop()
        if len(children) == 0:
            raise ValueError("Got a proto with an open immediately followed by a close: {}".format(proto_tree))
        # popping reversed the order; after reversing back, the first item
        # is the label node for this constituent
        children.reverse()
        label = children[0]
        children = children[1:]
        subtree = Tree(label=label.label, children=children)
        stack.append(subtree)
    # a well-formed proto leaves exactly one completed Tree on the stack
    if len(stack) > 1:
        raise ValueError("Got a proto which does not close all of the nodes: {}".format(proto_tree))
    tree = stack.pop()
    if not isinstance(tree, Tree):
        raise ValueError("Got a proto which was just one Open operation: {}".format(proto_tree))
    return tree, score
def add_token(token_list, word, token):
    """
    Append one token to a proto token list.

    CoreNLP tokens combine information from both the stanza word and its
    enclosing token, so both are consulted.  Optional fields (lemma, xpos,
    upos, ner) are set only when the corresponding value is not None.
    """
    proto_token = token_list.add()
    proto_token.word = word.text
    proto_token.value = word.text
    optional_fields = (("lemma", word.lemma),
                       ("pos", word.xpos),
                       ("coarseTag", word.upos),
                       ("ner", token.ner))
    for field_name, field_value in optional_fields:
        if field_value is not None:
            setattr(proto_token, field_name, field_value)
def add_sentence(request_sentences, sentence, num_tokens):
    """
    Append the tokens of one stanza sentence to a list of protobuf sentences.

    num_tokens is the running token count before this sentence; it anchors
    tokenOffsetBegin/tokenOffsetEnd.  Returns the newly added proto sentence.
    """
    proto_sentence = request_sentences.add()
    sentence_length = sum(len(token.words) for token in sentence.tokens)
    proto_sentence.tokenOffsetBegin = num_tokens
    proto_sentence.tokenOffsetEnd = num_tokens + sentence_length
    for token in sentence.tokens:
        for word in token.words:
            add_token(proto_sentence.token, word, token)
    return proto_sentence
def add_word_to_graph(graph, word, sent_idx, word_idx):
    """
    Add a node (and an edge, when the word has a head) to a basic
    dependency graph proto.

    Indices are converted from stanza's 0-based positions to the proto's
    1-based positions.  A head of 0 marks the root, which gets no
    incoming edge.
    """
    node = graph.node.add()
    node.sentenceIndex = sent_idx + 1
    node.index = word_idx + 1
    if word.head == 0:
        return
    edge = graph.edge.add()
    edge.source = word.head
    edge.target = word_idx + 1
    edge.dep = word.deprel
class JavaProtobufContext(object):
    """
    A generic context manager for exchanging protobufs with a long-lived
    Java subprocess.

    The wire format is length-prefixed: every message in either direction
    is preceded by its byte length as a 4-byte big-endian integer.  A
    length of zero signals the Java side to shut down.
    """
    def __init__(self, classpath, build_response, java_main):
        # build_response: zero-arg factory producing an empty response proto
        self.classpath = resolve_classpath(classpath)
        self.build_response = build_response
        self.java_main = java_main

    def __enter__(self):
        command = ["java", "-cp", self.classpath, self.java_main, "-multiple"]
        self.pipe = subprocess.Popen(command,
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE)
        return self

    def __exit__(self, type, value, traceback):
        # a zero-length message asks the java process to exit cleanly;
        # skip it if the process has already terminated
        if self.pipe.poll() is None:
            self.pipe.stdin.write((0).to_bytes(4, 'big'))
            self.pipe.stdin.flush()

    def process_request(self, request):
        """Send one request proto to the java process and return its response."""
        payload = request.SerializeToString()
        self.pipe.stdin.write(len(payload).to_bytes(4, 'big'))
        self.pipe.stdin.write(payload)
        self.pipe.stdin.flush()
        header = self.pipe.stdout.read(4)
        if len(header) < 4:
            # short read on the length prefix means the process died
            raise RuntimeError("Could not communicate with java process!")
        expected_length = int.from_bytes(header, "big")
        raw_response = self.pipe.stdout.read(expected_length)
        response = self.build_response()
        response.ParseFromString(raw_response)
        return response
|