extract-target-trees.py: support for new-style trace files

author: Phil Williams <philip.williams@mac.com> 2015-08-14 18:53:24 +0300
committer: Phil Williams <philip.williams@mac.com> 2015-08-14 18:53:24 +0300
commit: 01a9dd2305fd16372a4c222b39e22b86c73fb00b (patch)
tree: 2f6689e7994c33988b84dc070f7f7389dba35ed6 /scripts/analysis
parent: 8af06a6f0dae14c36e30b052aae24279dac92c6a (diff)
1 file changed, 50 insertions, 1 deletion
diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py
index 7166211d9..5dd097ff0 100755
--- a/scripts/analysis/extract-target-trees.py
+++ b/scripts/analysis/extract-target-trees.py
@@ -110,8 +110,15 @@ def read_derivations(input):
         yield derivation, start_line_num
 
 
-# Extract the hypothesis components and return a Hypothesis object.
 def parse_line(s):
+    if s.startswith("Trans Opt"):
+        return parse_line_old_format(s)
+    else:
+        return parse_line_new_format(s)
+
+
+# Parse an old-format ("Trans Opt ...") line into a Hypothesis object.
+def parse_line_old_format(s):
     pattern = r"Trans Opt (\d+) " + \
               r"\[(\d+)\.\.(\d+)\]:" + \
               r"((?: \[\d+\.\.\d+\]=\S+  )+):" + \
@@ -147,6 +154,48 @@ def parse_line(s):
     return hypothesis
 
 
+# Parse a new-format ("|||"-delimited) line into a Hypothesis object.
+def parse_line_new_format(s):
+    pattern = r"(\d+) \|\|\|" + \
+              r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|" + \
+              r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|" + \
+              r" ((?:\d+-\d+ )*)\|\|\|" + \
+              r"((?: \d+\.\.\d+)*)"
+    regexp = re.compile(pattern)
+    match = regexp.match(s)
+    if not match:
+        sys.stderr.write("%s\n" % s)
+    assert match
+    group = match.groups()
+    hypothesis = Hypothesis()
+    hypothesis.sentence_num = int(group[0]) + 1
+    spans = []
+    for pair in group[6].split():
+        match = re.match(r'(\d+)\.\.(\d+)', pair)
+        assert match
+        span = (int(match.group(1)), int(match.group(2)))
+        spans.append(span)
+    hypothesis.span = (spans[0][0], spans[-1][1])
+    hypothesis.source_symbol_info = []
+    for i, symbol in enumerate(group[2].split()):
+        hypothesis.source_symbol_info.append((spans[i], strip_brackets(symbol)))
+    hypothesis.target_lhs = strip_brackets(group[3])
+    hypothesis.target_rhs = group[4].split()
+    hypothesis.nt_alignments = []
+    for pair in group[5].split():
+        match = re.match(r'(\d+)-(\d+)', pair)
+        assert match
+        ai = (int(match.group(1)), int(match.group(2)))
+        hypothesis.nt_alignments.append(ai)
+    return hypothesis
+
+
+def strip_brackets(symbol):
+    if symbol[0] == '[' and symbol[-1] == ']':
+        return symbol[1:-1]
+    return symbol
+
+
 def tree_to_xml(tree):
     if tree.is_leaf():
         return tree.label
author	Phil Williams <philip.williams@mac.com>	2015-08-14 18:53:24 +0300
committer	Phil Williams <philip.williams@mac.com>	2015-08-14 18:53:24 +0300
commit	01a9dd2305fd16372a4c222b39e22b86c73fb00b (patch)
tree	2f6689e7994c33988b84dc070f7f7389dba35ed6 /scripts/analysis
parent	8af06a6f0dae14c36e30b052aae24279dac92c6a (diff)