Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPhil Williams <philip.williams@mac.com>2014-11-04 16:13:56 +0300
committerPhil Williams <philip.williams@mac.com>2014-11-04 16:13:56 +0300
commit5240c430cec0f78b4f14abd8b86da3764fea842c (patch)
tree3f37b70417a920a25c425ad025348b6115204122 /scripts/ems/support/analysis.perl
parente0b3105fc055982b8d38783d7d016535ff861718 (diff)
Merge s2t branch
This adds a new string-to-tree decoder, which can be enabled with the -s2t option. It's intended to be faster and simpler than the generic chart decoder, and is designed to support lattice input (still WIP). For an en-de system trained on WMT14 data, it's approximately 40% faster in practice. For background information, see the decoding section of the EMNLP tutorial on syntax-based MT: http://www.emnlp2014.org/tutorials/5_notes.pdf Some features are not implemented yet, including support for internal tree structure and soft source-syntactic constraints.
Diffstat (limited to 'scripts/ems/support/analysis.perl')
-rwxr-xr-xscripts/ems/support/analysis.perl118
1 files changed, 87 insertions, 31 deletions
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index ba92e6026..be5b76a5e 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -745,37 +745,15 @@ sub hierarchical_segmentation {
open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
open(NODE,">$dir/node") or die "Cannot open: $!";
while(<TRACE>) {
- /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
- /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_");
- my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
- if ($last_sentence >= 0 && $sentence != $last_sentence) {
- &hs_process($last_sentence,\@DERIVATION,\%STATS);
- @DERIVATION = ();
- }
- my %ITEM;
- $ITEM{'start'} = $start;
- $ITEM{'end'} = $end;
- $ITEM{'rule_lhs'} = $rule_lhs;
-
- $rule_rhs =~ s/</&lt;/g;
- $rule_rhs =~ s/>/&gt;/g;
- @{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs);
-
- foreach (split(/ /,$alignment)) {
- /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
- $ITEM{'alignment'}{$2} = $1; # target non-terminal to source span
- $ITEM{'alignedSpan'}{$1} = 1;
- }
-
- @{$ITEM{'spans'}} = ();
- foreach my $span (reverse split(/\s+/,$spans)) {
- $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
- my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
- push @{$ITEM{'spans'}}, \%SPAN;
- }
-
- push @DERIVATION,\%ITEM;
- $last_sentence = $sentence;
+ my $sentence;
+ my %ITEM;
+ &hs_scan_line($_, \$sentence, \%ITEM) || die("cannot scan line $_");
+ if ($last_sentence >= 0 && $sentence != $last_sentence) {
+ &hs_process($last_sentence,\@DERIVATION,\%STATS);
+ @DERIVATION = ();
+ }
+ push @DERIVATION,\%ITEM;
+ $last_sentence = $sentence;
}
&hs_process($last_sentence,\@DERIVATION,\%STATS);
close(TRACE);
@@ -793,6 +771,84 @@ sub hierarchical_segmentation {
close(SUMMARY);
}
+# Scan a single line of the trace file (old "Trans Opt ..." or new "N ||| ..." format): stores the sentence number in $$ref_sentence, fills %$ref_item, and returns 1 on success or 0 if the line matches neither format.
+sub hs_scan_line {
+ my ($line,$ref_sentence,$ref_item) = @_; # $ref_sentence: scalar ref (output); $ref_item: hash ref (output)
+
+ if ($line =~ /^Trans Opt/) {
+ # Old format: "Trans Opt <sentence> [<start>..<end>]: <spans> : <lhs> -><rhs> :<alignment>: ..."; two variants of the tail are tried in turn
+ $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
+ $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/ || return 0;
+ my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
+
+ ${$ref_sentence} = $sentence;
+
+ $ref_item->{'start'} = $start;
+ $ref_item->{'end'} = $end;
+ $ref_item->{'rule_lhs'} = $rule_lhs;
+
+ $rule_rhs =~ s/</&lt;/g; # replace angle brackets with HTML entities (presumably for HTML display; confirm against consumer)
+ $rule_rhs =~ s/>/&gt;/g;
+ @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);
+
+ foreach (split(/ /,$alignment)) {
+ /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+ $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+ $ref_item->{'alignedSpan'}{$1} = 1; # marks the first index of the pair as aligned
+ }
+
+ @{$ref_item->{'spans'}} = ();
+ foreach my $span (reverse split(/\s+/,$spans)) { # spans are stored in the reverse of their order on the line
+ $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
+ my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
+ push @{$ref_item->{'spans'}}, \%SPAN;
+ }
+ } else {
+ # New format: "<sentence> ||| [<src lhs>] -> <src rhs> ||| [<tgt lhs>] -> <tgt rhs> ||| <alignment>||| <source spans>"
+ $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0;
+ my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6);
+
+ ${$ref_sentence} = $sentence;
+
+ @{$ref_item->{'spans'}} = ();
+ foreach (split(/ /,$source_rhs)) {
+ /^\[?([^\]]+)\]?$/; # strips optional [ ] around the symbol; NOTE(review): match is unchecked, so on failure $1 is left over from an earlier successful match
+ my %SPAN = ( 'word' => $1 );
+ push @{$ref_item->{'spans'}}, \%SPAN;
+ }
+
+ my $i = 0; # index into 'spans'; the i-th source span is attached to the i-th source symbol
+ foreach my $span (split(/ /,$source_spans)) { # NOTE(review): assumes as many spans as source symbols; extra spans would create entries without 'word'
+ $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n");
+ $ref_item->{'spans'}[$i]{'from'} = $1;
+ $ref_item->{'spans'}[$i]{'to'} = $2;
+ if ($i == 0) {
+ $ref_item->{'start'} = $1; # rule start = start of the first span
+ }
+ $ref_item->{'end'} = $2; # overwritten each iteration, so ends up as the end of the last span
+ $i++;
+ }
+
+ $ref_item->{'rule_lhs'} = $target_lhs;
+
+ $target_rhs =~ s/</&lt;/g; # replace angle brackets with HTML entities, as in the old-format branch
+ $target_rhs =~ s/>/&gt;/g;
+ @{$ref_item->{'rule_rhs'}} = ();
+ foreach (split(/ /,$target_rhs)) {
+ /^\[?([^\]]+)\]?$/; # strips optional [ ] around the symbol; NOTE(review): unchecked match, $1 may be stale if it fails
+ push @{$ref_item->{'rule_rhs'}}, $1;
+ }
+
+ foreach (split(/ /,$alignment)) {
+ /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n");
+ $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
+ $ref_item->{'alignedSpan'}{$1} = 1; # marks the first index of the pair as aligned
+ }
+ }
+
+ return 1;
+}
+
# process a single sentence for hierarchical segmentation
sub hs_process {
my ($sentence,$DERIVATION,$STATS) = @_;