more analysis in experiment.perl

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3234 1f5c12ca-751b-0410-a591-d2e778427230
author: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-07 15:28:55 +0400
committer: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-07 15:28:55 +0400
commit: 447dccfc597ce34d6011fbbc1194d96e1b8ec191 (patch)
tree: d506529b860cbc51cd2de4101543bfb4c94d1fb7 /scripts/ems/support/analysis.perl
parent: 57b129732182010237ffa27d74435beab27731d2 (diff)
1 files changed, 224 insertions, 13 deletions
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index 3fe247581..9d5a21264 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -1,26 +1,63 @@
 #!/usr/bin/perl -w
 
 use strict;
+use Getopt::Long "GetOptions";
 
 my $MAX_LENGTH = 4;
 
-die("ERROR: syntax: analysis.perl system reference directory") 
-    unless scalar @ARGV == 3;
-my ($system,$reference,$dir) = @ARGV;
+my ($system,$segmentation,$reference,$dir,$input,$corpus,$ttable);
+if (!&GetOptions('system=s' => \$system, # raw output from decoder
+                 'reference=s' => \$reference, # tokenized reference
+                 'dir=s' => \$dir, # directory for storing results
+                 'input=s' => \$input, # tokenized input (as for decoder)
+                 'segmentation=s' => \$segmentation, # system output with segmentation markup
+                 'input-corpus=s' => \$corpus, # input side of parallel training corpus
+                 'ttable=s' => \$ttable) || # phrase translation table used for decoding
+    !defined($dir)) {
+	die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-segmentation FILE]");	
+}
 
 `mkdir -p $dir`;
 
-my @SYSTEM = `cat $system`; chop(@SYSTEM);
-my @REFERENCE = `cat $reference`; chop(@REFERENCE);
-
+# compare system output against reference translation
+my(@SYSTEM,@REFERENCE);
 my (%PRECISION_CORRECT,%PRECISION_TOTAL,
     %RECALL_CORRECT,%RECALL_TOTAL);
-open(SUMMARY,">$dir/summary");
-&create_n_gram_stats();
-&best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision");
-&best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall");
-&bleu_annotation();
-close(SUMMARY);
+if (defined($system) || defined($reference)) {
+  die("you need to you specify both system and reference, not just either")
+    unless defined($system) && defined($reference);
+  die("can't open system file $system") if ! -e $system;
+  die("can't open system file $reference") if ! -e $reference;
+  @SYSTEM = `cat $system`;
+  @REFERENCE = `cat $reference`;
+  chop(@SYSTEM);
+  chop(@REFERENCE);
+
+  open(SUMMARY,">$dir/summary");
+  &create_n_gram_stats();
+  &best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision");
+  &best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall");
+  &bleu_annotation();
+  close(SUMMARY);
+}
+
+# segmentation
+if (defined($segmentation)) {
+  &segmentation();
+}
+
+# coverage analysis
+my (%INPUT_PHRASE,%CORPUS_COVERED,%TTABLE_COVERED,%TTABLE_ENTROPY);
+if (defined($ttable) || defined($corpus)) {	
+  if (!defined($input)) {
+    die("ERROR: when specifying either ttable or input-corpus, please also specify input\n");
+  }
+  $MAX_LENGTH = 7;
+  &input_phrases();
+  &ttable_coverage() if defined($ttable);
+  &corpus_coverage() if defined($corpus);
+  &input_annotation();
+}
 
 sub create_n_gram_stats {
     for(my $i=0;$i<scalar @SYSTEM;$i++) {
@@ -51,6 +88,14 @@ sub best_matches {
     }
 }
 
+sub input_phrases {
+  open(INPUT,$input) or die "Can't read input $input";
+  while(my $line = <INPUT>) {
+    &extract_n_grams($line,\%INPUT_PHRASE);
+  }
+  close(INPUT);  
+}
+
 sub bleu_annotation {
     open(OUT,"| sort -r >$dir/bleu-annotation");
     for(my $i=0;$i<scalar @SYSTEM;$i++) {
@@ -94,7 +139,6 @@ sub bleu_annotation {
 	    $bleu *= exp(1-$ref_length/scalar(@WORD));
 	}
 
-	my @COLOR = ("#FFC0C0","#FFC0FF","#C0C0FF","#C0FFFF","#C0FFC0");
 	printf OUT "%5.4f\t%d\t",$bleu,$i;
         for(my $i=0;$i<scalar @WORD;$i++) {
 	    print OUT " " if $i;
@@ -124,6 +168,135 @@ sub add_match {
     }
 }
 
+sub ttable_coverage {
+  if (! -e $ttable && -e $ttable.".gz") {
+    open(TTABLE,"gzip -cd $ttable.gz|");
+  }
+  else {
+    open(TTABLE,$ttable) or die "Can't read ttable $ttable";
+  }
+  open(REPORT,">$dir/ttable-coverage-by-phrase");
+  my ($last_in,$last_size,$entropy,$size) = ("",0,0);
+  while(<TTABLE>) {
+    chop;
+    my ($in,$out,$scores) = split(/ \|\|\| /);	
+    my @IN = split(/ /,$in);
+    $size = scalar @IN;
+    next unless defined($INPUT_PHRASE{$size}{$in});
+    $TTABLE_COVERED{$size}{$in}++;
+    my @SCORE = split(/ /,$scores);
+    my $p = $SCORE[2]; # forward probability
+    if ($in ne $last_in) {
+      if ($last_in ne "") {
+        printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
+	$TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
+        $entropy = 0;
+      }
+      $last_in = $in;
+      $last_size = $size;
+    }
+    # TODO: normalized entropy?
+    $entropy -= $p*log($p)/log(2);
+  }
+  print REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
+  close(REPORT);
+  close(TTABLE);
+
+  &additional_coverage_reports("ttable",\%TTABLE_COVERED);
+}
+
+sub corpus_coverage {
+  # compute how often input phrases occur in the corpus
+  open(CORPUS,$corpus) or die "Can't read corpus $corpus";
+  while(<CORPUS>) {
+    my @WORD = split;
+    my $sentence_length = scalar @WORD;
+    for(my $start=0;$start < $sentence_length;$start++) {
+      my $phrase = "";
+      for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) {
+	$phrase .= " " if $length > 1;
+	$phrase .= $WORD[$start+$length-1];
+	last if !defined($INPUT_PHRASE{$length}{$phrase});
+	$CORPUS_COVERED{$length}{$phrase}++;
+      }
+    }
+  }
+  close(CORPUS);
+
+  # report occurrence counts for all known input phrases
+  open(REPORT,">$dir/corpus-coverage-by-phrase");
+  foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
+    foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
+      next unless defined $CORPUS_COVERED{$size}{$phrase};
+      printf REPORT "%s\t%d\n", $phrase, $CORPUS_COVERED{$size}{$phrase};
+    }
+  }
+  close(REPORT);
+
+  &additional_coverage_reports("corpus",\%CORPUS_COVERED);
+}
+
+sub additional_coverage_reports {
+  my ($name,$COVERED) = @_;
+
+  # unknown word report ---- TODO: extend to rare words?
+  open(REPORT,">$dir/$name-unknown");
+  foreach my $phrase (keys %{$INPUT_PHRASE{1}}) {
+    next if defined($$COVERED{1}{$phrase});
+    printf REPORT "%s\t%d\n",$phrase,$INPUT_PHRASE{1}{$phrase};
+  }
+  close(REPORT);
+
+  # summary report
+  open(REPORT,">$dir/$name-coverage-summary");
+  foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
+    my (%COUNT_TYPE,%COUNT_TOKEN);
+    foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
+      my $covered = $$COVERED{$size}{$phrase};
+      $covered = 0 unless defined($covered);
+      $COUNT_TYPE{$covered}++;
+      $COUNT_TOKEN{$covered} += $INPUT_PHRASE{$size}{$phrase};
+    }
+    foreach my $count (sort {$a <=> $b} keys %COUNT_TYPE) {
+      printf REPORT "%d\t%d\t%d\t%d\n",$size,$count,$COUNT_TYPE{$count},$COUNT_TOKEN{$count};
+    }
+  }
+  close(REPORT);
+}
+
+sub input_annotation {
+  open(OUT,">$dir/input-annotation");
+  open(INPUT,$input) or die "Can't read input $input";
+  while(<INPUT>) {
+    chop;
+    print OUT $_."\t";
+    my @WORD = split;
+    my $sentence_length = scalar @WORD;
+    for(my $start=0;$start < $sentence_length;$start++) {
+      my $phrase = "";
+      for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) {
+	$phrase .= " " if $length > 1;
+	$phrase .= $WORD[$start+$length-1];
+
+	my $ttable_covered = $TTABLE_COVERED{$length}{$phrase};
+	my $corpus_covered = $CORPUS_COVERED{$length}{$phrase};
+	next unless defined($ttable_covered) || defined($corpus_covered);
+	my $ttable_entropy = $TTABLE_ENTROPY{$length}{$phrase} || 0;
+	#$ttable_entropy = 0 unless defined($ttable_entropy);
+	$ttable_covered = 0 unless defined($ttable_covered);
+	$corpus_covered = 0 unless defined($corpus_covered);
+	
+	if (defined($TTABLE_COVERED{$length}{$phrase})) {
+	  printf OUT "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy
+	}
+      }
+    }
+    print OUT "\n";
+  }
+  close(INPUT);    
+  close(OUT);
+}
+
 sub extract_n_grams {
     my ($sentence,$NGRAM) = @_;
     $sentence =~ s/\s+/ /g;
@@ -142,3 +315,41 @@ sub extract_n_grams {
 	}
     }
 }
+
+sub segmentation {
+  my %SEGMENTATION;
+
+  open(FILE,$segmentation) || die("ERROR: could not open segmentation file $segmentation");
+  open(OUT,">$dir/segmentation-annotation");
+  while(<FILE>) {
+    chop;
+    my $count=0;
+    my $out = -1;
+    foreach (split) {
+      if (/^\|(\d+)\-(\d+)\|$/) {
+	print OUT " " unless $out-($count-1) == 0;
+	printf OUT "%d:%d:%d:%d",$1,$2,$out-($count-1),$out;
+	my $in_count = $2-$1+1;
+	$SEGMENTATION{$in_count}{$count}++;
+        $count = 0;
+      }
+      else {
+        $out++;
+        $count++;
+      }
+    }
+    print OUT "\n";
+  }
+  close(OUT);
+  close(FILE);
+
+  open(SUMMARY,">$dir/segmentation");
+  foreach my $in (sort { $a <=> $b } keys %SEGMENTATION) {
+    foreach my $out (sort { $a <=> $b } keys %{$SEGMENTATION{$in}}) {
+      printf SUMMARY "%d\t%d\t%d\n", $in, $out, $SEGMENTATION{$in}{$out};
+    }
+  }
+  close(SUMMARY);
+
+  # TODO: error by segmentation
+}
author	phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>	2010-05-07 15:28:55 +0400
committer	phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>	2010-05-07 15:28:55 +0400
commit	447dccfc597ce34d6011fbbc1194d96e1b8ec191 (patch)
tree	d506529b860cbc51cd2de4101543bfb4c94d1fb7 /scripts/ems/support/analysis.perl
parent	57b129732182010237ffa27d74435beab27731d2 (diff)