diff options
author | Phil Williams <philip.williams@mac.com> | 2015-08-14 18:53:24 +0300 |
---|---|---|
committer | Phil Williams <philip.williams@mac.com> | 2015-08-14 18:53:24 +0300 |
commit | 01a9dd2305fd16372a4c222b39e22b86c73fb00b (patch) | |
tree | 2f6689e7994c33988b84dc070f7f7389dba35ed6 /scripts/analysis | |
parent | 8af06a6f0dae14c36e30b052aae24279dac92c6a (diff) |
extract-target-trees.py: support for new-style trace files
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x | scripts/analysis/extract-target-trees.py | 51 |
1 file changed, 50 insertions, 1 deletion
diff --git a/scripts/analysis/extract-target-trees.py b/scripts/analysis/extract-target-trees.py index 7166211d9..5dd097ff0 100755 --- a/scripts/analysis/extract-target-trees.py +++ b/scripts/analysis/extract-target-trees.py @@ -110,8 +110,15 @@ def read_derivations(input): yield derivation, start_line_num -# Extract the hypothesis components and return a Hypothesis object. def parse_line(s): + if s.startswith("Trans Opt"): + return parse_line_old_format(s) + else: + return parse_line_new_format(s) + + +# Extract the hypothesis components and return a Hypothesis object. +def parse_line_old_format(s): pattern = r"Trans Opt (\d+) " + \ r"\[(\d+)\.\.(\d+)\]:" + \ r"((?: \[\d+\.\.\d+\]=\S+ )+):" + \ @@ -147,6 +154,48 @@ def parse_line(s): return hypothesis +# Extract the hypothesis components and return a Hypothesis object. +def parse_line_new_format(s): + pattern = r"(\d+) \|\|\|" + \ + r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|" + \ + r" (\[\S+\]) -> ((?:\S+ )+)\|\|\|" + \ + r" ((?:\d+-\d+ )*)\|\|\|" + \ + r"((?: \d+\.\.\d+)*)" + regexp = re.compile(pattern) + match = regexp.match(s) + if not match: + sys.stderr.write("%s\n" % s) + assert match + group = match.groups() + hypothesis = Hypothesis() + hypothesis.sentence_num = int(group[0]) + 1 + spans = [] + for pair in group[6].split(): + match = re.match(r'(\d+)\.\.(\d+)', pair) + assert match + span = (int(match.group(1)), int(match.group(2))) + spans.append(span) + hypothesis.span = (spans[0][0], spans[-1][1]) + hypothesis.source_symbol_info = [] + for i, symbol in enumerate(group[2].split()): + hypothesis.source_symbol_info.append((spans[i], strip_brackets(symbol))) + hypothesis.target_lhs = strip_brackets(group[3]) + hypothesis.target_rhs = group[4].split() + hypothesis.nt_alignments = [] + for pair in group[5].split(): + match = re.match(r'(\d+)-(\d+)', pair) + assert match + ai = (int(match.group(1)), int(match.group(2))) + hypothesis.nt_alignments.append(ai) + return hypothesis + + +def strip_brackets(symbol): + if 
symbol[0] == '[' and symbol[-1] == ']': + return symbol[1:-1] + return symbol + + def tree_to_xml(tree): if tree.is_leaf(): return tree.label |