diff options
author | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-10-21 13:49:27 +0400 |
---|---|---|
committer | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-10-21 13:49:27 +0400 |
commit | 85a5a13e4c722fccc28394d06da1ea0194bf7ab6 (patch) | |
tree | 97876a77307d0a21bbf0be6ea913c1a9f067a225 /scripts/ems/support/analysis.perl | |
parent | 88eaf49c5e051f7c1202d01aa2136c811c17e401 (diff) |
improvements to web analysis, fixes to syntax wrappers
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3633 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/ems/support/analysis.perl')
-rwxr-xr-x | scripts/ems/support/analysis.perl | 34 |
1 files changed, 24 insertions, 10 deletions
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index fab2f54e6..e19150518 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -5,7 +5,7 @@ use Getopt::Long "GetOptions"; my $MAX_LENGTH = 4; -my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical); +my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable,$hierarchical,$output_corpus,$alignment,$biconcor); if (!&GetOptions('system=s' => \$system, # raw output from decoder 'reference=s' => \$reference, # tokenized reference 'dir=s' => \$dir, # directory for storing results @@ -13,9 +13,12 @@ if (!&GetOptions('system=s' => \$system, # raw output from decoder 'segmentation=s' => \$segmentation, # system output with segmentation markup 'input-corpus=s' => \$corpus, # input side of parallel training corpus 'ttable=s' => \$ttable, # phrase translation table used for decoding + 'output-corpus=s' => \$output_corpus, # output side of parallel training corpus + 'alignment-file=s' => \$alignment, # alignment of parallel corpus + 'biconcor=s' => \$biconcor, # binary for bilingual concordancer 'hierarchical' => \$hierarchical) || # hierarchical model? !defined($dir)) { - die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE]"); + die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]"); } `mkdir -p $dir`; @@ -84,6 +87,11 @@ if (defined($ttable) || defined($corpus)) { &input_annotation(); } +# bilingual concordance -- not used by experiment.perl +if (defined($corpus) && defined($output_corpus) && defined($alignment) && defined($biconcor)) { + `$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`; +} + sub best_matches { my ($CORRECT,$TOTAL,$out) = @_; my $type = ($out =~ /precision/) ? "precision" : "recall"; @@ -208,6 +216,9 @@ sub ttable_coverage { if (! -e $ttable && -e $ttable.".gz") { open(TTABLE,"gzip -cd $ttable.gz|"); } + elsif ($ttable =~ /.gz$/) { + open(TTABLE,"gzip -cd $ttable|"); + } else { open(TTABLE,$ttable) or die "Can't read ttable $ttable"; } @@ -219,7 +230,7 @@ sub ttable_coverage { my @COLUMN = split(/ \|\|\| /); my ($in,$out,$scores) = @COLUMN; # handling hierarchical - $in =~ s/\[[^ \]]+\]$//; # remove lhs nt + $in =~ s/ \[[^ \]]+\]$//; # remove lhs nt next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules $scores = $COLUMN[4] if scalar @COLUMN == 5; my @IN = split(/ /,$in); @@ -255,6 +266,7 @@ sub compute_entropy { } my $entropy = 0; foreach my $p (@_) { + next if $p == 0; $entropy -= ($p/$z)*log($p/$z)/log(2); } return $entropy; @@ -465,7 +477,7 @@ sub hierarchical_segmentation { open(OUTPUT_TREE,">$dir/output-tree"); open(NODE,">$dir/node"); while(<TRACE>) { - /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_"); + /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_"); my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7); if ($last_sentence >= 0 && $sentence != $last_sentence) { &hs_process($last_sentence,\@DERIVATION,\%STATS); @@ -481,7 +493,7 @@ sub hierarchical_segmentation { @{$ITEM{'rule_rhs'}} = split(/ /,$rule_rhs); foreach (split(/ /,$alignment)) { - /(\d+)\-(\d+)/ || die("funny alignment: $_\n"); + /(\d+)[\-,](\d+)/ || die("funny alignment: $_\n"); $ITEM{'alignment'}{$2} = $1; # target non-terminal to source span $ITEM{'alignedSpan'}{$1} = 1; } @@ -528,12 +540,14 @@ sub hs_process { my $x=0; while(1) { my $RULE = shift @{$DERIVATION}; - if ($$RULE{'rule_lhs'} eq "S" && - scalar(@{$$RULE{'rule_rhs'}}) == 2 && - $$RULE{'rule_rhs'}[0] eq "S" && - $$RULE{'rule_rhs'}[1] eq "X") { + if (scalar(@{$$RULE{'rule_rhs'}}) == 2 && + ($$RULE{'rule_lhs'} eq "S" && + $$RULE{'rule_rhs'}[0] eq "S" && + $$RULE{'rule_rhs'}[1] eq "X") || + ($$RULE{'rule_lhs'} eq "Q" && + $$RULE{'rule_rhs'}[0] eq "Q")) { unshift @{$GLUE_RULE{'spans'}},$$RULE{'spans'}[1]; - push @{$GLUE_RULE{'rule_rhs'}}, "X"; + push @{$GLUE_RULE{'rule_rhs'}}, $$RULE{'rule_rhs'}[1]; $GLUE_RULE{'alignment'}{$x} = $x; $GLUE_RULE{'alignedSpan'}{$x} = 1; $x++; |