1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
"""Invokes the Java semgrex on a document
The server client has a method "semgrex" which sends text to Java
CoreNLP for processing with a semgrex (SEMantic GRaph regEX) query:
https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html
However, this operates on text using the CoreNLP tools, which means
the dependency graphs may not align with stanza's depparse module, and
this also limits the languages for which it can be used. This module
allows for running semgrex commands on the graphs produced by
depparse.
To use, first process text into a doc using stanza.Pipeline.
Next, pass the processed doc and a list of semgrex patterns to
process_doc in this module. It will run the java semgrex module as a
subprocess and return the result in the form of a SemgrexResponse,
whose description is in the proto file included with stanza.
A minimal example is the main method of this module.
Note that launching the subprocess is potentially quite expensive
relative to the search if used many times on small documents. Ideally
larger texts would be processed, and all of the desired semgrex
patterns would be run at once. The worst thing to do would be to call
this multiple times on a large document, one invocation per semgrex
pattern, as that would serialize the document each time. There are of
course multiple ways of making this more efficient, such as including
it as a separate call in the server or keeping the subprocess alive
for multiple queries, but we didn't do any of those. We do, however,
accept pull requests...
"""
import subprocess
import stanza
from stanza.protobuf import SemgrexRequest, SemgrexResponse
from stanza.server.client import resolve_classpath
def send_request(request, response_type, java_main):
    """
    Use subprocess to run a Java protobuf processor on the given request

    request: a protobuf message; its serialized bytes are fed to the
      subprocess on stdin
    response_type: called with no arguments to create the message object
      which the subprocess's stdout is parsed into
    java_main: name of the Java main class to launch

    Returns the protobuf response

    Raises subprocess.CalledProcessError if the java process exits with a
    non-zero return code.
    """
    command = ["java", "-cp", resolve_classpath(), java_main]
    # check=True so that a failed launch (bad classpath, missing class,
    # exception inside the JVM, ...) is reported to the caller.  Without
    # it the return code is ignored and whatever the process wrote to
    # stdout - possibly nothing - is parsed as if it were a valid response.
    completed = subprocess.run(command,
                               input=request.SerializeToString(),
                               stdout=subprocess.PIPE,
                               check=True)
    response = response_type()
    response.ParseFromString(completed.stdout)
    return response
def send_semgrex_request(request):
    """
    Send the request to the Java semgrex processor

    Delegates to send_request and returns its result, parsed as a SemgrexResponse
    """
    semgrex_main = "edu.stanford.nlp.semgraph.semgrex.ProcessSemgrexRequest"
    return send_request(request, SemgrexResponse, semgrex_main)
def process_doc(doc, *semgrex_patterns):
    """
    Returns the result of running the given semgrex patterns on the stanza doc.

    doc: a processed document; each word in doc.sentences is read for its
      text, lemma, xpos, upos, head and deprel, and each token for its ner
    semgrex_patterns: the patterns to search for, all sent in one request

    One query is built per sentence.  Each word becomes both a token and
    a graph node, and words are numbered from 1 within their sentence.

    Currently the return is a SemgrexResponse from CoreNLP.proto
    """
    request = SemgrexRequest()
    request.semgrex.extend(semgrex_patterns)

    for sentence_number, sentence in enumerate(doc.sentences, start=1):
        query = request.query.add()
        # flatten tokens into their words, keeping the token around for its ner
        token_word_pairs = ((token, word)
                            for token in sentence.tokens
                            for word in token.words)
        for position, (token, word) in enumerate(token_word_pairs, start=1):
            proto_token = query.token.add()
            proto_token.word = word.text
            proto_token.value = word.text

            # annotations which may not have been produced are only
            # copied over when they are present
            optional_fields = (("lemma", word.lemma),
                               ("pos", word.xpos),
                               ("coarseTag", word.upos),
                               ("ner", token.ner))
            for field_name, field_value in optional_fields:
                if field_value is not None:
                    setattr(proto_token, field_name, field_value)

            node = query.graph.node.add()
            node.sentenceIndex = sentence_number
            node.index = position

            # a word whose head is 0 gets a node but no incoming edge
            if word.head == 0:
                continue
            edge = query.graph.edge.add()
            edge.source = word.head
            edge.target = position
            edge.dep = word.deprel

    return send_semgrex_request(request)
def main():
    """
    Minimal example: parse a short English text and run one sample semgrex pattern on it

    Prints the processed doc, then the response returned by process_doc
    """
    pipeline = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
    parsed = pipeline("Unban Mox Opal! Unban Mox Opal!")
    # when debugging, parsed.sentences[0].dependencies can be printed as well
    print(parsed)

    response = process_doc(parsed, "{}=source >obj=zzz {}=target")
    print(response)
if __name__ == "__main__":
    # run the example only when executed as a script, not when imported
    main()
|