Merge s2t branch

This adds a new string-to-tree decoder, which can be enabled with the -s2t option. It's intended to be faster and simpler than the generic chart decoder, and is designed to support lattice input (still WIP). For an en-de system trained on WMT14 data, it's approximately 40% faster in practice. For background information, see the decoding section of the EMNLP tutorial on syntax-based MT: http://www.emnlp2014.org/tutorials/5_notes.pdf. Some features are not implemented yet, including support for internal tree structure and soft source-syntactic constraints.
author: Phil Williams <philip.williams@mac.com> 2014-11-04 16:13:56 +0300
committer: Phil Williams <philip.williams@mac.com> 2014-11-04 16:13:56 +0300
commit: 5240c430cec0f78b4f14abd8b86da3764fea842c (patch)
tree: 3f37b70417a920a25c425ad025348b6115204122 /scripts/ems/support/analysis.perl
parent: e0b3105fc055982b8d38783d7d016535ff861718 (diff)
1 files changed, 87 insertions, 31 deletions
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index ba92e6026..be5b76a5e 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -745,37 +745,15 @@ sub hierarchical_segmentation {
     open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
     open(NODE,">$dir/node") or die "Cannot open: $!";
     while(<TRACE>) {
-	/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+)  : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
-	/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+)  : (\S+) \-\>\S+  \-\> (.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_");
-	my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
-	if ($last_sentence >= 0 && $sentence != $last_sentence) {
-	    &hs_process($last_sentence,\@DERIVATION,\%STATS);
-	    @DERIVATION = ();
-	}
-	my %ITEM;
-	$ITEM{'start'} = $start;
-	$ITEM{'end'} = $end;
-	$ITEM{'rule_lhs'} = $rule_lhs;
-	
-	$rule_rhs =~ s/</&lt;/g;
-	$rule_rhs =~ s/>/&gt;/g;
-	@{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs);
-	
-	foreach (split(/ /,$alignment)) {
-		/(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
-		$ITEM{'alignment'}{$2} = $1; # target non-terminal to source span
-		$ITEM{'alignedSpan'}{$1} = 1;
-	}
-
-	@{$ITEM{'spans'}} = ();
-	foreach my $span (reverse split(/\s+/,$spans)) {
-		$span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
-		my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
-		push @{$ITEM{'spans'}}, \%SPAN;
-	}
-
-	push @DERIVATION,\%ITEM;
-	$last_sentence = $sentence;
+        my $sentence;
+        my %ITEM;
+        &hs_scan_line($_, \$sentence, \%ITEM) || die("cannot scan line $_");
+        if ($last_sentence >= 0 && $sentence != $last_sentence) {
+            &hs_process($last_sentence,\@DERIVATION,\%STATS);
+            @DERIVATION = ();
+        }
+        push @DERIVATION,\%ITEM;
+        $last_sentence = $sentence;
     }
     &hs_process($last_sentence,\@DERIVATION,\%STATS);
     close(TRACE);
@@ -793,6 +771,84 @@ sub hierarchical_segmentation {
     close(SUMMARY);
 }
 
+# Scan a single line of the trace file (old "Trans Opt" or new "|||" format).
+# Stores the sentence number in ${$ref_sentence} and the rule ('start', 'end',
+# 'rule_lhs', 'rule_rhs', 'spans', 'alignment', 'alignedSpan') in %{$ref_item}.
+# Returns 1, or 0 if the line matches neither format; dies on a malformed span
+# or alignment entry.
+sub hs_scan_line {
+    my ($line,$ref_sentence,$ref_item) = @_;
+
+    if ($line =~ /^Trans Opt/) {
+        # Old format
+        $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+)  : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
+        $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+)  : (\S+) \-\>\S+  \-\> (.+) :([\(\),\d\- ]*): c=/ || return 0;
+        my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
+        ${$ref_sentence} = $sentence;
+        $ref_item->{'start'} = $start;
+        $ref_item->{'end'} = $end;
+        $ref_item->{'rule_lhs'} = $rule_lhs;
+
+        $rule_rhs =~ s/</&lt;/g;
+        $rule_rhs =~ s/>/&gt;/g;
+        @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);
+
+        foreach (split(/ /,$alignment)) {
+            /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+            $ref_item->{'alignedSpan'}{$1} = 1;
+        }
+
+        @{$ref_item->{'spans'}} = ();
+        foreach my $span (reverse split(/\s+/,$spans)) {
+            $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
+            my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
+            push @{$ref_item->{'spans'}}, \%SPAN;
+        }
+    } else {
+        # New format
+        $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0;
+        my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6);
+        ${$ref_sentence} = $sentence;
+
+        # Strip [ ] from non-terminals; keep unmatched tokens as is (no stale $1).
+        @{$ref_item->{'spans'}} = ();
+        foreach (split(/ /,$source_rhs)) {
+            my $word = /^\[?([^\]]+)\]?$/ ? $1 : $_;
+            my %SPAN = ( 'word' => $word );
+            push @{$ref_item->{'spans'}}, \%SPAN;
+        }
+
+        my $i = 0;
+        foreach my $span (split(/ /,$source_spans)) {
+            $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n");
+            $ref_item->{'spans'}[$i]{'from'} = $1;
+            $ref_item->{'spans'}[$i]{'to'} = $2;
+            if ($i == 0) {
+                $ref_item->{'start'} = $1;
+            }
+            $ref_item->{'end'} = $2;
+            $i++;
+        }
+
+        $ref_item->{'rule_lhs'} = $target_lhs;
+        $target_rhs =~ s/</&lt;/g;
+        $target_rhs =~ s/>/&gt;/g;
+        @{$ref_item->{'rule_rhs'}} = ();
+        foreach (split(/ /,$target_rhs)) {
+            # Same fallback as for the source side: never push a stale $1.
+            push @{$ref_item->{'rule_rhs'}}, (/^\[?([^\]]+)\]?$/ ? $1 : $_);
+        }
+
+        foreach (split(/ /,$alignment)) {
+            /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+            $ref_item->{'alignedSpan'}{$1} = 1;
+        }
+    }
+    return 1;
+}
+
 # process a single sentence for hierarchical segmentation
 sub hs_process {
     my ($sentence,$DERIVATION,$STATS) = @_;
author	Phil Williams <philip.williams@mac.com>	2014-11-04 16:13:56 +0300
committer	Phil Williams <philip.williams@mac.com>	2014-11-04 16:13:56 +0300
commit	5240c430cec0f78b4f14abd8b86da3764fea842c (patch)
tree	3f37b70417a920a25c425ad025348b6115204122 /scripts/ems/support/analysis.perl
parent	e0b3105fc055982b8d38783d7d016535ff861718 (diff)