Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Bauer <horatio@gmail.com>2022-11-05 20:12:54 +0300
committerJohn Bauer <horatio@gmail.com>2022-11-11 12:53:53 +0300
commite9fe09f7716c3275d93ee9d5238d3de3029f1776 (patch)
tree4e54a07de44265a36436f17cd4d73e3ffc84ae34
parentff0cb0f6a4a2ac9ed4e0768fd0655d35c7626898 (diff)
Add an in_order_compound transition scheme
Unary chains are collapsed into the top transition for open constituents. Preterminals which get unaried have their own transition as well. To make the ROOT logic simpler, we also add a Finalize transition which does the root (we could theoretically use this elsewhere).
-rw-r--r--stanza/models/constituency/base_model.py4
-rw-r--r--stanza/models/constituency/parse_transitions.py127
-rw-r--r--stanza/models/constituency/parse_tree.py9
-rw-r--r--stanza/models/constituency/trainer.py2
-rw-r--r--stanza/models/constituency/transition_sequence.py47
-rw-r--r--stanza/tests/constituency/test_transition_sequence.py3
6 files changed, 187 insertions, 5 deletions
diff --git a/stanza/models/constituency/base_model.py b/stanza/models/constituency/base_model.py
index 23781bce..dadb99cb 100644
--- a/stanza/models/constituency/base_model.py
+++ b/stanza/models/constituency/base_model.py
@@ -157,7 +157,9 @@ class BaseModel(ABC):
"""
Whether or not this model is TOP_DOWN
"""
- return not self._transition_scheme is TransitionScheme.IN_ORDER
+ return (self._transition_scheme is TransitionScheme.TOP_DOWN or
+ self._transition_scheme is TransitionScheme.TOP_DOWN_UNARY or
+ self._transition_scheme is TransitionScheme.TOP_DOWN_COMPOUND)
def predict(self, states, is_legal=True):
raise NotImplementedError("LSTMModel can predict, but SimpleModel cannot")
diff --git a/stanza/models/constituency/parse_transitions.py b/stanza/models/constituency/parse_transitions.py
index e87fcaeb..01c22c97 100644
--- a/stanza/models/constituency/parse_transitions.py
+++ b/stanza/models/constituency/parse_transitions.py
@@ -38,6 +38,11 @@ class TransitionScheme(Enum):
# 0.8205
IN_ORDER = 4
+ # in order, with unaries after preterminals represented as a single
+ # transition after the preterminal
+ # and unaries elsewhere tied to the rest of the constituent
+ IN_ORDER_COMPOUND = 5
+
class State(namedtuple('State', ['word_queue', 'transitions', 'constituents', 'gold_tree', 'gold_sequence',
'sentence_length', 'num_opens', 'word_position', 'score'])):
"""
@@ -331,6 +336,9 @@ class Dummy():
def __init__(self, label):
self.label = label
+ def is_preterminal(self):
+ return False
+
def __str__(self):
return "Dummy({})".format(self.label)
@@ -407,6 +415,11 @@ class OpenConstituent(Transition):
if isinstance(model.get_top_transition(state.transitions), OpenConstituent):
# consecutive Opens don't make sense in the context of in-order
return False
+ if model.transition_scheme() == TransitionScheme.IN_ORDER_COMPOUND:
+ # if compound unary opens are used
+ # can always open as long as the word queue isn't empty
+ # if the word queue is empty, only close is allowed
+ return not state.empty_word_queue()
# one other restriction - we assume all parse trees
# start with (ROOT (first_real_con ...))
# therefore ROOT can only occur via Open after everything
@@ -455,6 +468,110 @@ class OpenConstituent(Transition):
def __hash__(self):
return hash(self.label)
+class PreterminalUnary(Transition):
+ """
+ Can only be applied to a preterminal
+
+ Turns the preterminal into a unary chain
+ """
+ def __init__(self, *label):
+ self.label = tuple(label)
+ self.top_label = self.label[0]
+
+ def update_state(self, state, model):
+ """
+ Apply potentially multiple unary transitions to the same preterminal
+
+ Only applies to preterminals
+ It reuses the CloseConstituent machinery
+ """
+ # only the top constituent is meaningful here
+ constituents = state.constituents
+ children = [constituents.value]
+ constituents = constituents.pop()
+ # unlike with CloseConstituent, our label is not on the stack.
+ # it is just our label
+ label = self.label
+
+ # ... but we do reuse CloseConstituent's update
+ return state.word_position, constituents, (label, children), CloseConstituent
+
+ def is_legal(self, state, model):
+ """
+ Legal if & only if the previous item is a preterminal
+ """
+ tree = model.get_top_constituent(state.constituents)
+ if tree is None:
+ return False
+ return tree.is_preterminal()
+
+ def short_name(self):
+ return "PTUnary"
+
+ def __repr__(self):
+ return "PreterminalUnary(%s)" % ",".join(self.label)
+
+ def __eq__(self, other):
+ if self is other:
+ return True
+ if not isinstance(other, PreterminalUnary):
+ return False
+ return other.label == self.label
+
+ def __hash__(self):
+ return hash((28, self.label))
+
+class Finalize(Transition):
+ """
+ Specifically applies at the end of a parse sequence to add a ROOT
+
+ Seemed like the simplest way to remove ROOT from the
+ in_order_compound transitions while still using the mechanism of
+ the transitions to build the parse tree
+ """
+ def __init__(self, *label):
+ self.label = tuple(label)
+
+ def update_state(self, state, model):
+ """
+ Apply potentially multiple unary transitions to the same preterminal
+
+ Only applies to preterminals
+ It reuses the CloseConstituent machinery
+ """
+ # only the top constituent is meaningful here
+ constituents = state.constituents
+ children = [constituents.value]
+ constituents = constituents.pop()
+ # unlike with CloseConstituent, our label is not on the stack.
+ # it is just our label
+ label = self.label
+
+ # ... but we do reuse CloseConstituent's update
+ return state.word_position, constituents, (label, children), CloseConstituent
+
+ def is_legal(self, state, model):
+ """
+ Legal if & only if there is one tree, no more words, and no ROOT yet
+ """
+ return state.empty_word_queue() and state.has_one_constituent() and not state.finished(model)
+
+ def short_name(self):
+ return "Finalize"
+
+ def __repr__(self):
+ return "Finalize(%s)" % ",".join(self.label)
+
+ def __eq__(self, other):
+ if self is other:
+ return True
+ if not isinstance(other, Finalize):
+ return False
+ return other.label == self.label
+
+ def __hash__(self):
+ return hash((53, self.label))
+
class CloseConstituent(Transition):
def delta_opens(self):
return -1
@@ -499,6 +616,7 @@ class CloseConstituent(Transition):
def is_legal(self, state, model):
"""
Disallow if there is no Open on the stack yet
+
in TOP_DOWN, if the previous transition was the Open (nothing built yet)
in IN_ORDER, previous transition does not matter, except for one small corner case
"""
@@ -521,7 +639,7 @@ class CloseConstituent(Transition):
# under the ROOT open if unary transitions are not possible
if state.num_opens == 2 and not state.empty_word_queue():
return False
- else:
+ elif model.transition_scheme() == TransitionScheme.IN_ORDER:
if not isinstance(model.get_top_transition(state.transitions), OpenConstituent):
# we're not stuck in a loop of unaries
return True
@@ -539,6 +657,13 @@ class CloseConstituent(Transition):
# this means we'll be stuck having to open again if we do close
# this node, so instead we make the Close illegal
return False
+ elif model.transition_scheme() == TransitionScheme.IN_ORDER_COMPOUND:
+ # the only restriction here is that we can't close
+ # immediately after an open
+ # internal unaries are handled by the opens being compound
+ # preterminal unaries are handled with PreterminalUnary
+ if isinstance(model.get_top_transition(state.transitions), OpenConstituent):
+ return False
return True
def short_name(self):
diff --git a/stanza/models/constituency/parse_tree.py b/stanza/models/constituency/parse_tree.py
index ce729acf..a576b2ab 100644
--- a/stanza/models/constituency/parse_tree.py
+++ b/stanza/models/constituency/parse_tree.py
@@ -375,11 +375,16 @@ class Tree(StanzaObject):
return sorted(set(x.label for x in trees))
@staticmethod
- def get_compound_constituents(trees):
+ def get_compound_constituents(trees, separate_root=False):
constituents = set()
stack = deque()
for tree in trees:
- stack.append(tree)
+ if separate_root:
+ constituents.add((tree.label,))
+ for child in tree.children:
+ stack.append(child)
+ else:
+ stack.append(tree)
while len(stack) > 0:
node = stack.pop()
if node.is_leaf() or node.is_preterminal():
diff --git a/stanza/models/constituency/trainer.py b/stanza/models/constituency/trainer.py
index 09ef534c..a8d8e4bd 100644
--- a/stanza/models/constituency/trainer.py
+++ b/stanza/models/constituency/trainer.py
@@ -319,6 +319,8 @@ def get_open_nodes(trees, args):
"""
if args['transition_scheme'] is TransitionScheme.TOP_DOWN_COMPOUND:
return parse_tree.Tree.get_compound_constituents(trees)
+ elif args['transition_scheme'] is TransitionScheme.IN_ORDER_COMPOUND:
+ return parse_tree.Tree.get_compound_constituents(trees, separate_root=True)
else:
return [(x,) for x in parse_tree.Tree.get_unique_constituent_labels(trees)]
diff --git a/stanza/models/constituency/transition_sequence.py b/stanza/models/constituency/transition_sequence.py
index 1e2a71d9..2f2aec78 100644
--- a/stanza/models/constituency/transition_sequence.py
+++ b/stanza/models/constituency/transition_sequence.py
@@ -7,7 +7,7 @@ Supports multiple transition schemes - TOP_DOWN and variants, IN_ORDER
import logging
from stanza.models.common import utils
-from stanza.models.constituency.parse_transitions import Shift, CompoundUnary, OpenConstituent, CloseConstituent, TransitionScheme
+from stanza.models.constituency.parse_transitions import Shift, CompoundUnary, OpenConstituent, CloseConstituent, TransitionScheme, Finalize, PreterminalUnary
from stanza.models.constituency.tree_reader import read_trees
tqdm = utils.get_tqdm()
@@ -77,12 +77,57 @@ def yield_in_order_sequence(tree):
yield CloseConstituent()
+
+
+def yield_in_order_compound_sequence(tree):
+ def helper(tree):
+ if tree.is_preterminal():
+ yield Shift()
+ return
+
+ if tree.is_leaf():
+ return
+
+ labels = []
+ while len(tree.children) == 1 and not tree.is_preterminal():
+ labels.append(tree.label)
+ tree = tree.children[0]
+
+ if tree.is_preterminal():
+ yield Shift()
+ yield PreterminalUnary(*labels)
+ return
+
+ for transition in helper(tree.children[0]):
+ yield transition
+
+ labels.append(tree.label)
+ yield OpenConstituent(*labels)
+
+ for child in tree.children[1:]:
+ for transition in helper(child):
+ yield transition
+
+ yield CloseConstituent()
+
+ if len(tree.children) == 0:
+ raise ValueError("Cannot build IN_ORDER_COMPOUND on an empty tree")
+ if len(tree.children) != 1:
+ raise ValueError("Cannot build IN_ORDER_COMPOUND with a tree that has two top level nodes: %s" % tree)
+
+ for t in helper(tree.children[0]):
+ yield t
+
+ yield Finalize(tree.label)
+
def build_sequence(tree, transition_scheme=TransitionScheme.TOP_DOWN_UNARY):
"""
Turn a single tree into a list of transitions based on the TransitionScheme
"""
if transition_scheme is TransitionScheme.IN_ORDER:
return list(yield_in_order_sequence(tree))
+ elif transition_scheme is TransitionScheme.IN_ORDER_COMPOUND:
+ return list(yield_in_order_compound_sequence(tree))
else:
return list(yield_top_down_sequence(tree, transition_scheme))
diff --git a/stanza/tests/constituency/test_transition_sequence.py b/stanza/tests/constituency/test_transition_sequence.py
index ce34c0e7..b820803e 100644
--- a/stanza/tests/constituency/test_transition_sequence.py
+++ b/stanza/tests/constituency/test_transition_sequence.py
@@ -64,6 +64,9 @@ def test_top_down_no_unary():
def test_in_order():
check_reproduce_tree(transition_scheme=TransitionScheme.IN_ORDER)
+def test_in_order_compound():
+ check_reproduce_tree(transition_scheme=TransitionScheme.IN_ORDER_COMPOUND)
+
def test_all_transitions():
text="((SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))"
trees = tree_reader.read_trees(text)