diff options
author | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-05-07 15:28:55 +0400 |
---|---|---|
committer | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-05-07 15:28:55 +0400 |
commit | 447dccfc597ce34d6011fbbc1194d96e1b8ec191 (patch) | |
tree | d506529b860cbc51cd2de4101543bfb4c94d1fb7 /scripts/ems/support/analysis.perl | |
parent | 57b129732182010237ffa27d74435beab27731d2 (diff) |
more analysis in experiment.perl
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3234 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/ems/support/analysis.perl')
-rwxr-xr-x | scripts/ems/support/analysis.perl | 237 |
1 files changed, 224 insertions, 13 deletions
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index 3fe247581..9d5a21264 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -1,26 +1,63 @@ #!/usr/bin/perl -w use strict; +use Getopt::Long "GetOptions"; my $MAX_LENGTH = 4; -die("ERROR: syntax: analysis.perl system reference directory") - unless scalar @ARGV == 3; -my ($system,$reference,$dir) = @ARGV; +my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable); +if (!&GetOptions('system=s' => \$system, # raw output from decoder + 'reference=s' => \$reference, # tokenized reference + 'dir=s' => \$dir, # directory for storing results + 'input=s' => \$input, # tokenized input (as for decoder) + 'segmentation=s' => \$segmentation, # system output with segmentation markup + 'input-corpus=s' => \$corpus, # input side of parallel training corpus + 'ttable=s' => \$ttable) || # phrase translation table used for decoding + !defined($dir)) { + die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE]"); +} `mkdir -p $dir`; -my @SYSTEM = `cat $system`; chop(@SYSTEM); -my @REFERENCE = `cat $reference`; chop(@REFERENCE); - +# compare system output against reference translation +my(@SYSTEM,@REFERENCE); my (%PRECISION_CORRECT,%PRECISION_TOTAL, %RECALL_CORRECT,%RECALL_TOTAL); -open(SUMMARY,">$dir/summary"); -&create_n_gram_stats(); -&best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision"); -&best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall"); -&bleu_annotation(); -close(SUMMARY); +if (defined($system) || defined($reference)) { + die("you need to you specify both system and reference, not just either") + unless defined($system) && defined($reference); + die("can't open system file $system") if ! -e $system; + die("can't open system file $reference") if ! -e $reference; + @SYSTEM = `cat $system`; + @REFERENCE = `cat $reference`; + chop(@SYSTEM); + chop(@REFERENCE); + + open(SUMMARY,">$dir/summary"); + &create_n_gram_stats(); + &best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision"); + &best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall"); + &bleu_annotation(); + close(SUMMARY); +} + +# segmentation +if (defined($segmentation)) { + &segmentation(); +} + +# coverage analysis +my (%INPUT_PHRASE,%CORPUS_COVERED,%TTABLE_COVERED,%TTABLE_ENTROPY); +if (defined($ttable) || defined($corpus)) { + if (!defined($input)) { + die("ERROR: when specifying either ttable or input-corpus, please also specify input\n"); + } + $MAX_LENGTH = 7; + &input_phrases(); + &ttable_coverage() if defined($ttable); + &corpus_coverage() if defined($corpus); + &input_annotation(); +} sub create_n_gram_stats { for(my $i=0;$i<scalar @SYSTEM;$i++) { @@ -51,6 +88,14 @@ sub best_matches { } } +sub input_phrases { + open(INPUT,$input) or die "Can't read input $input"; + while(my $line = <INPUT>) { + &extract_n_grams($line,\%INPUT_PHRASE); + } + close(INPUT); +} + sub bleu_annotation { open(OUT,"| sort -r >$dir/bleu-annotation"); for(my $i=0;$i<scalar @SYSTEM;$i++) { @@ -94,7 +139,6 @@ sub bleu_annotation { $bleu *= exp(1-$ref_length/scalar(@WORD)); } - my @COLOR = ("#FFC0C0","#FFC0FF","#C0C0FF","#C0FFFF","#C0FFC0"); printf OUT "%5.4f\t%d\t",$bleu,$i; for(my $i=0;$i<scalar @WORD;$i++) { print OUT " " if $i; @@ -124,6 +168,135 @@ sub add_match { } } +sub ttable_coverage { + if (! -e $ttable && -e $ttable.".gz") { + open(TTABLE,"gzip -cd $ttable.gz|"); + } + else { + open(TTABLE,$ttable) or die "Can't read ttable $ttable"; + } + open(REPORT,">$dir/ttable-coverage-by-phrase"); + my ($last_in,$last_size,$entropy,$size) = ("",0,0); + while(<TTABLE>) { + chop; + my ($in,$out,$scores) = split(/ \|\|\| /); + my @IN = split(/ /,$in); + $size = scalar @IN; + next unless defined($INPUT_PHRASE{$size}{$in}); + $TTABLE_COVERED{$size}{$in}++; + my @SCORE = split(/ /,$scores); + my $p = $SCORE[2]; # forward probability + if ($in ne $last_in) { + if ($last_in ne "") { + printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy; + $TTABLE_ENTROPY{$last_size}{$last_in} = $entropy; + $entropy = 0; + } + $last_in = $in; + $last_size = $size; + } + # TODO: normalized entropy? + $entropy -= $p*log($p)/log(2); + } + print REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy; + close(REPORT); + close(TTABLE); + + &additional_coverage_reports("ttable",\%TTABLE_COVERED); +} + +sub corpus_coverage { + # compute how often input phrases occur in the corpus + open(CORPUS,$corpus) or die "Can't read corpus $corpus"; + while(<CORPUS>) { + my @WORD = split; + my $sentence_length = scalar @WORD; + for(my $start=0;$start < $sentence_length;$start++) { + my $phrase = ""; + for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) { + $phrase .= " " if $length > 1; + $phrase .= $WORD[$start+$length-1]; + last if !defined($INPUT_PHRASE{$length}{$phrase}); + $CORPUS_COVERED{$length}{$phrase}++; + } + } + } + close(CORPUS); + + # report occurrence counts for all known input phrases + open(REPORT,">$dir/corpus-coverage-by-phrase"); + foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) { + foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) { + next unless defined $CORPUS_COVERED{$size}{$phrase}; + printf REPORT "%s\t%d\n", $phrase, $CORPUS_COVERED{$size}{$phrase}; + } + } + close(REPORT); + + &additional_coverage_reports("corpus",\%CORPUS_COVERED); +} + +sub additional_coverage_reports { + my ($name,$COVERED) = @_; + + # unknown word report ---- TODO: extend to rare words? + open(REPORT,">$dir/$name-unknown"); + foreach my $phrase (keys %{$INPUT_PHRASE{1}}) { + next if defined($$COVERED{1}{$phrase}); + printf REPORT "%s\t%d\n",$phrase,$INPUT_PHRASE{1}{$phrase}; + } + close(REPORT); + + # summary report + open(REPORT,">$dir/$name-coverage-summary"); + foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) { + my (%COUNT_TYPE,%COUNT_TOKEN); + foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) { + my $covered = $$COVERED{$size}{$phrase}; + $covered = 0 unless defined($covered); + $COUNT_TYPE{$covered}++; + $COUNT_TOKEN{$covered} += $INPUT_PHRASE{$size}{$phrase}; + } + foreach my $count (sort {$a <=> $b} keys %COUNT_TYPE) { + printf REPORT "%d\t%d\t%d\t%d\n",$size,$count,$COUNT_TYPE{$count},$COUNT_TOKEN{$count}; + } + } + close(REPORT); +} + +sub input_annotation { + open(OUT,">$dir/input-annotation"); + open(INPUT,$input) or die "Can't read input $input"; + while(<INPUT>) { + chop; + print OUT $_."\t"; + my @WORD = split; + my $sentence_length = scalar @WORD; + for(my $start=0;$start < $sentence_length;$start++) { + my $phrase = ""; + for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) { + $phrase .= " " if $length > 1; + $phrase .= $WORD[$start+$length-1]; + + my $ttable_covered = $TTABLE_COVERED{$length}{$phrase}; + my $corpus_covered = $CORPUS_COVERED{$length}{$phrase}; + next unless defined($ttable_covered) || defined($corpus_covered); + my $ttable_entropy = $TTABLE_ENTROPY{$length}{$phrase} || 0; + #$ttable_entropy = 0 unless defined($ttable_entropy); + $ttable_covered = 0 unless defined($ttable_covered); + $corpus_covered = 0 unless defined($corpus_covered); + + if (defined($TTABLE_COVERED{$length}{$phrase})) { + printf OUT "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy + } + } + } + print OUT "\n"; + } + close(INPUT); + close(OUT); +} + sub extract_n_grams { my ($sentence,$NGRAM) = @_; $sentence =~ s/\s+/ /g; @@ -142,3 +315,41 @@ sub extract_n_grams { } } } + +sub segmentation { + my %SEGMENTATION; + + open(FILE,$segmentation) || die("ERROR: could not open segmentation file $segmentation"); + open(OUT,">$dir/segmentation-annotation"); + while(<FILE>) { + chop; + my $count=0; + my $out = -1; + foreach (split) { + if (/^\|(\d+)\-(\d+)\|$/) { + print OUT " " unless $out-($count-1) == 0; + printf OUT "%d:%d:%d:%d",$1,$2,$out-($count-1),$out; + my $in_count = $2-$1+1; + $SEGMENTATION{$in_count}{$count}++; + $count = 0; + } + else { + $out++; + $count++; + } + } + print OUT "\n"; + } + close(OUT); + close(FILE); + + open(SUMMARY,">$dir/segmentation"); + foreach my $in (sort { $a <=> $b } keys %SEGMENTATION) { + foreach my $out (sort { $a <=> $b } keys %{$SEGMENTATION{$in}}) { + printf SUMMARY "%d\t%d\t%d\n", $in, $out, $SEGMENTATION{$in}{$out}; + } + } + close(SUMMARY); + + # TODO: error by segmentation +} |