Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Phil Williams <philip.williams@mac.com> 2015-08-14 18:53:24 +0300
committer: Phil Williams <philip.williams@mac.com> 2015-08-14 18:53:24 +0300
commit: 01a9dd2305fd16372a4c222b39e22b86c73fb00b (patch)
tree: 2f6689e7994c33988b84dc070f7f7389dba35ed6 /scripts/analysis
parent: 8af06a6f0dae14c36e30b052aae24279dac92c6a (diff)
extract-target-trees.py: support for new-style trace files
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x scripts/analysis/extract-target-trees.py 51
1 files changed, 50 insertions, 1 deletions
diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py
index 7166211d9..5dd097ff0 100755
--- a/scripts/analysis/extract-target-trees.py
+++ b/scripts/analysis/extract-target-trees.py
@@ -110,8 +110,15 @@ def read_derivations(input):
yield derivation, start_line_num
-# Extract the hypothesis components and return a Hypothesis object.
def parse_line(s):
    """Parse one trace-file line, choosing the parser from the line's format.

    Lines that begin with "Trans Opt" are handed to parse_line_old_format;
    every other line is handed to parse_line_new_format.  The chosen
    parser's result is returned unchanged.
    """
    is_old_style = s.startswith("Trans Opt")
    parser = parse_line_old_format if is_old_style else parse_line_new_format
    return parser(s)
+
+
+# Extract the hypothesis components and return a Hypothesis object.
+def parse_line_old_format(s):
pattern = r"Trans Opt (\d+) " + \
r"\[(\d+)\.\.(\d+)\]:" + \
r"((?: \[\d+\.\.\d+\]=\S+ )+):" + \
@@ -147,6 +154,48 @@ def parse_line(s):
return hypothesis
+# Extract the hypothesis components and return a Hypothesis object.
def parse_line_new_format(s):
    """Parse one new-style trace line and return a Hypothesis object.

    The line must start with text of the form

        N ||| [SRC_LHS] -> SRC_RHS ||| [TGT_LHS] -> TGT_RHS ||| ALIGN ||| SPANS

    where SRC_RHS and TGT_RHS are space-terminated symbol lists, ALIGN is a
    (possibly empty) list of "i-j" pairs and SPANS is a list of "a..b"
    pairs.  The i-th span is paired with the i-th source right-hand-side
    symbol.

    The returned Hypothesis has these attributes set:
      sentence_num        N + 1
      span                (start of the first span, end of the last span)
      source_symbol_info  list of (span, symbol) with brackets stripped
      target_lhs          target left-hand side with brackets stripped
      target_rhs          list of target right-hand-side symbols
      nt_alignments       list of (int, int) pairs taken from ALIGN

    Raises AssertionError (after echoing the offending line to stderr) when
    the line does not match the expected format or lists fewer spans than
    source symbols.  The error is raised explicitly rather than via an
    `assert` statement so that the check is still performed under
    `python -O`.
    """
    pattern = (r"(\d+) \|\|\|"
               r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|"
               r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|"
               r" ((?:\d+-\d+ )*)\|\|\|"
               r"((?: \d+\.\.\d+)*)")
    match = re.match(pattern, s)
    if not match:
        sys.stderr.write("%s\n" % s)
        raise AssertionError("unrecognized new-style trace line")

    # The source left-hand side is captured by the pattern but not used.
    (sentence_index, _source_lhs, source_rhs, target_lhs, target_rhs,
     alignments, span_list) = match.groups()

    # The outer pattern already guarantees that every whitespace-separated
    # item in these fields is well formed, so findall recovers exactly the
    # pairs that splitting and re-matching each item would.
    spans = [(int(start), int(end))
             for start, end in re.findall(r"(\d+)\.\.(\d+)", span_list)]
    source_symbols = source_rhs.split()
    if len(spans) < len(source_symbols):
        # Covers the "no spans at all" case too, since the pattern requires
        # at least one source symbol.
        sys.stderr.write("%s\n" % s)
        raise AssertionError(
            "trace line has %d span(s) for %d source symbol(s)"
            % (len(spans), len(source_symbols)))

    hypothesis = Hypothesis()
    hypothesis.sentence_num = int(sentence_index) + 1
    hypothesis.span = (spans[0][0], spans[-1][1])
    hypothesis.source_symbol_info = [
        (span, strip_brackets(symbol))
        for span, symbol in zip(spans, source_symbols)]
    hypothesis.target_lhs = strip_brackets(target_lhs)
    hypothesis.target_rhs = target_rhs.split()
    hypothesis.nt_alignments = [
        (int(left), int(right))
        for left, right in re.findall(r"(\d+)-(\d+)", alignments)]
    return hypothesis
+
+
def strip_brackets(symbol):
    """Return symbol without its enclosing square brackets.

    The brackets are removed only when the string both starts with '[' and
    ends with ']'; any other string, including the empty string, is
    returned unchanged.
    """
    # startswith/endswith are safe on an empty string, unlike indexing
    # symbol[0] / symbol[-1].
    if symbol.startswith('[') and symbol.endswith(']'):
        return symbol[1:-1]
    return symbol
+
+
def tree_to_xml(tree):
if tree.is_leaf():
return tree.label