diff options
author | Phil Williams <philip.williams@mac.com> | 2014-11-04 16:13:56 +0300 |
---|---|---|
committer | Phil Williams <philip.williams@mac.com> | 2014-11-04 16:13:56 +0300 |
commit | 5240c430cec0f78b4f14abd8b86da3764fea842c (patch) | |
tree | 3f37b70417a920a25c425ad025348b6115204122 /scripts/ems/support/analysis.perl | |
parent | e0b3105fc055982b8d38783d7d016535ff861718 (diff) |
Merge s2t branch
This adds a new string-to-tree decoder, which can be enabled with the -s2t
option. It's intended to be faster and simpler than the generic chart
decoder, and is designed to support lattice input (still WIP). For an en-de
system trained on WMT14 data, it's approximately 40% faster in practice.
For background information, see the decoding section of the EMNLP tutorial
on syntax-based MT:
http://www.emnlp2014.org/tutorials/5_notes.pdf
Some features are not implemented yet, including support for internal tree
structure and soft source-syntactic constraints.
Diffstat (limited to 'scripts/ems/support/analysis.perl')
-rwxr-xr-x | scripts/ems/support/analysis.perl | 118 |
1 files changed, 87 insertions, 31 deletions
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index ba92e6026..be5b76a5e 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -745,37 +745,15 @@ sub hierarchical_segmentation { open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!"; open(NODE,">$dir/node") or die "Cannot open: $!"; while(<TRACE>) { - /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || - /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_"); - my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7); - if ($last_sentence >= 0 && $sentence != $last_sentence) { - &hs_process($last_sentence,\@DERIVATION,\%STATS); - @DERIVATION = (); - } - my %ITEM; - $ITEM{'start'} = $start; - $ITEM{'end'} = $end; - $ITEM{'rule_lhs'} = $rule_lhs; - - $rule_rhs =~ s/</</g; - $rule_rhs =~ s/>/>/g; - @{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs); - - foreach (split(/ /,$alignment)) { - /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n"); - $ITEM{'alignment'}{$2} = $1; # target non-terminal to source span - $ITEM{'alignedSpan'}{$1} = 1; - } - - @{$ITEM{'spans'}} = (); - foreach my $span (reverse split(/\s+/,$spans)) { - $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n"); - my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 ); - push @{$ITEM{'spans'}}, \%SPAN; - } - - push @DERIVATION,\%ITEM; - $last_sentence = $sentence; + my $sentence; + my %ITEM; + &hs_scan_line($_, \$sentence, \%ITEM) || die("cannot scan line $_"); + if ($last_sentence >= 0 && $sentence != $last_sentence) { + &hs_process($last_sentence,\@DERIVATION,\%STATS); + @DERIVATION = (); + } + push @DERIVATION,\%ITEM; + $last_sentence = $sentence; } &hs_process($last_sentence,\@DERIVATION,\%STATS); close(TRACE); @@ -793,6 +771,84 @@ sub hierarchical_segmentation { close(SUMMARY); } +# scan a single line of the trace file 
+sub hs_scan_line { + my ($line,$ref_sentence,$ref_item) = @_; + + if ($line =~ /^Trans Opt/) { + # Old format + $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || + $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || return 0; + my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7); + + ${$ref_sentence} = $sentence; + + $ref_item->{'start'} = $start; + $ref_item->{'end'} = $end; + $ref_item->{'rule_lhs'} = $rule_lhs; + + $rule_rhs =~ s/</</g; + $rule_rhs =~ s/>/>/g; + @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs); + + foreach (split(/ /,$alignment)) { + /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n"); + $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span + $ref_item->{'alignedSpan'}{$1} = 1; + } + + @{$ref_item->{'spans'}} = (); + foreach my $span (reverse split(/\s+/,$spans)) { + $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n"); + my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 ); + push @{$ref_item->{'spans'}}, \%SPAN; + } + } else { + # New format + $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0; + my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6); + + ${$ref_sentence} = $sentence; + + @{$ref_item->{'spans'}} = (); + foreach (split(/ /,$source_rhs)) { + /^\[?([^\]]+)\]?$/; + my %SPAN = ( 'word' => $1 ); + push @{$ref_item->{'spans'}}, \%SPAN; + } + + my $i = 0; + foreach my $span (split(/ /,$source_spans)) { + $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n"); + $ref_item->{'spans'}[$i]{'from'} = $1; + $ref_item->{'spans'}[$i]{'to'} = $2; + if ($i == 0) { + $ref_item->{'start'} = $1; + } + $ref_item->{'end'} = $2; + $i++; + } + + $ref_item->{'rule_lhs'} = $target_lhs; + + $target_rhs =~ s/</</g; + $target_rhs =~ s/>/>/g; + @{$ref_item->{'rule_rhs'}} = (); + foreach 
(split(/ /,$target_rhs)) { + /^\[?([^\]]+)\]?$/; + push @{$ref_item->{'rule_rhs'}}, $1; + } + + foreach (split(/ /,$alignment)) { + /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n"); + $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span + $ref_item->{'alignedSpan'}{$1} = 1; + } + } + + return 1; +} + # process a single sentence for hierarchical segmentation sub hs_process { my ($sentence,$DERIVATION,$STATS) = @_; |