modified sentence-by-sentence to handle multiple outputs;

edited cache handling in newsmtgui (should increase speed and decrease errors)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@767 1f5c12ca-751b-0410-a591-d2e778427230
author: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-16 18:49:10 +0400
committer: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-16 18:49:10 +0400
commit: c34aca3053a675a56334db1a476ff3f67d9777fa (patch)
tree: 2b6428982dbdef51c89860de4d384d3bbf02b0c6 /scripts/analysis
parent: be6b9d8ce586034c3e0f4df700cdda890805d94f (diff)
4 files changed, 232 insertions, 116 deletions
diff --git a/scripts/analysis/README b/scripts/analysis/README
index 6e28bc9e0..177550e70 100644
--- a/scripts/analysis/README
+++ b/scripts/analysis/README
@@ -1,6 +1,5 @@
 Put any scripts useful for human analysis of MT output here.
 
-[EVH]
-sentence-by-sentence.pl: show comparison of sentences in reference translation(s)/system otuput/(truth) in colorful format
+sentence-by-sentence.pl [EVH]: show comparison of sentences in reference translation(s)/system output(s)/(truth) in colorful format
 -- shows all sentences given, with non-matching words in the system output marked, BLEU scores given by sentence, and matching n-grams shown in a table
 -- requires all input files be utf8-encoded (you can convert a file with `cat FILE | perl -n -e 'binmode(STDOUT, ":utf8"); print;' > FILE.utf8`)
diff --git a/scripts/analysis/sentence-by-sentence.pl b/scripts/analysis/sentence-by-sentence.pl
index d5ca9879c..47e4ef63d 100755
--- a/scripts/analysis/sentence-by-sentence.pl
+++ b/scripts/analysis/sentence-by-sentence.pl
@@ -7,32 +7,39 @@ use strict;
 use Getopt::Long;
 
 my $sourcefile = undef;
+my @truthfiles;
 GetOptions(
-	"source=s" => \$sourcefile,
+	"source|s=s" => \$sourcefile,
+	"reference|r=s" => \@truthfiles
 ) or exit(1);
 
-my ($sysoutfile, @truthfiles) = @ARGV;
-
-if (!defined $sysoutfile || scalar(@truthfiles) == 0) {
-	print STDERR "
-usage: $0 system_output reference(s...) > sentence-by-sentence.html
+my @sysoutfiles = @ARGV;
+if (scalar(@sysoutfiles) == 0 || scalar(@truthfiles) == 0)
+{
+	print STDERR "usage: $0 system_output(s) > sentence-by-sentence.html
 Options:
-  --source STRING  ... foreign original
+  --source,-s STRING      foreign input (can be used multiple times)
+  --reference,-r STRING   English truth (can be used multiple times)
 
 N-grams are colored by the number of supporting references:
- red for fewest, green for most, mediate shades otherwise.
-";
+ red for fewest, green for most, mediate shades otherwise.\n";
   exit(1);
 }
 
+####################################################################################################################
+
 my @TRUTHS = () x scalar(@truthfiles);
 for(my $i = 0; $i < scalar(@truthfiles); $i++)
 {
 	open($TRUTHS[$i], "<$truthfiles[$i]") or die "couldn't open '$truthfiles[$i]' for read: $!\n";
 	binmode($TRUTHS[$i], ":utf8");
 }
-open(SYSOUT, "<$sysoutfile") or die "couldn't open '$sysoutfile' for read: $!\n";
-binmode(SYSOUT, ":utf8");
+my @SYSOUTS = () x scalar(@sysoutfiles);
+for(my $i = 0; $i < scalar(@sysoutfiles); $i++)
+{
+	open($SYSOUTS[$i], "<$sysoutfiles[$i]") or die "couldn't open '$sysoutfiles[$i]' for read: $!\n";
+	binmode($SYSOUTS[$i], ":utf8");
+}
 binmode(STDOUT, ":utf8");
 if (defined $sourcefile)
 {
@@ -40,63 +47,64 @@ if (defined $sourcefile)
 	binmode(SOURCE, ":utf8");
 }
 my @bleuScores;
+for(my $i = 0; $i < scalar(@sysoutfiles); $i++) {push @bleuScores, [];}
 my @htmlSentences;
+my @javascripts;
 my @htmlColors = ('#99ff99', '#aaaaff', '#ffff99', '#ff9933', '#ff9999'); #color sentences by rank (split in n tiers)
 my $ngramSingleRefColor = '#aaffaa';
 my @ngramMultirefColors = ('#ff9999', '#ff9933', '#ffff99', '#a0a0ff', '#99ff99'); #arbitrary-length list; first entry is used for worst n-grams
-my $i = 0;
-while(my $sLine = <SYSOUT>)
+my $numSentences = 0;
+my (@sLines, @eLines);
+while(readLines(\@SYSOUTS, \@sLines) && readLines(\@TRUTHS, \@eLines))
 {
-	escapeMetachars($sLine); #remove inconsistencies in encoding
-	my @sFactors = @{extractFactorArrays($sLine)};
-	my @eLines = () x scalar(@truthfiles);
-	my @eFactors;
-	for(my $j = 0; $j < scalar(@truthfiles); $j++)
-	{
-		my $fh = $TRUTHS[$j];
-		$eLines[$j] = <$fh>;
-		escapeMetachars($eLines[$j]); #remove inconsistencies in encoding
-		push @eFactors, extractFactorArrays($eLines[$j], "$truthfiles[$j] shorter than $sysoutfile");
-	}
-	my $sourceFactors;
+	#create array of lines of HTML
+	my @html = ("<div class=\"sentence_%%%%\" id=\"sentence$numSentences\">"); #%%%% is a flag to be replaced
+
+	my (@sFactors, @eFactors, $sourceFactors);
+	#process source
 	if (defined $sourcefile)
 	{
 		my $sourceLine = <SOURCE>;
 		escapeMetachars($sourceLine); #remove inconsistencies in encoding
-		$sourceFactors = extractFactorArrays($sourceLine, "$sourcefile shorter than $sysoutfile");
+		$sourceFactors = extractFactorArrays($sourceLine);
+		push @html, "<tr><td class=\"sent_title\">Source</td><td class=\"source_sentence\" id=\"source$numSentences\">" 
+								. getFactoredSentenceHTML($sourceFactors) . "</td></tr>\n";
 	}
-		  
-	my $bleuData = getBLEUSentenceDetails(\@sFactors, \@eFactors, 0);
-	push @bleuScores, [$i, $bleuData->[0], 0]; #the last number will be the rank
-	my $pwerData = getPWERSentenceDetails(\@sFactors, \@eFactors, 0);
-	my $html = "<div class=\"sentence\" style=\"background-color: %%%%\" id=\"sentence$i\">"; #the %%%% and other tokens like it are flags to be replaced
-	$html .= "<div class=\"bleu_report\"><b>Sentence $i)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;BLEU:</b> " . sprintf("%.4lg", $bleuData->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$bleuData->[0]}[1 .. 4]) . ")</div><table>\n";
-	if(defined $sourcefile)
+	#process truth
+	for(my $j = 0; $j < scalar(@truthfiles); $j++)
 	{
-		$html .= "<tr><td class=\"sent_title\">Source</td><td class=\"source_sentence\" id=\"source$i\">" . getFactoredSentenceHTML($sourceFactors) . "</td></tr>\n";
+		escapeMetachars($eLines[$j]); #remove inconsistencies in encoding
+		push @eFactors, extractFactorArrays($eLines[$j]);
+		push @html, "<tr><td class=\"sent_title\">Ref $j</td><td class=\"truth_sentence\" id=\"truth${numSentences}_$j\">" 
+								. getFactoredSentenceHTML($eFactors[$j]) . "</td></tr>\n";
 	}
-	for(my $j = 0; $j < scalar(@truthfiles); $j++)
+	#process sysouts
+	my @bleuData;
+	for(my $j = 0; $j < scalar(@sysoutfiles); $j++)
 	{
-		$html .= "<tr><td class=\"sent_title\">Ref $j</td><td class=\"truth_sentence\" id=\"truth${i}_$j\">" . getFactoredSentenceHTML($eFactors[$j]) . "</td></tr>\n";
+		escapeMetachars($sLines[$j]); #remove inconsistencies in encoding
+		push @sFactors, extractFactorArrays($sLines[$j]);
+		push @bleuData, getBLEUSentenceDetails($sFactors[$j], \@eFactors, 0);
+		push @{$bleuScores[$j]}, [$numSentences, $bleuData[$j]->[0], 0]; #the last number will be the rank
+		my $pwerData = getPWERSentenceDetails($sFactors[$j], \@eFactors, 0);
+		push @html, "<tr><td class=\"sent_title\">Output $j</td><td class=\"sysout_sentence\" id=\"sysout$numSentences\">" 
+								. getFactoredSentenceHTML($sFactors[$j], $pwerData) . "</td></tr>\n";
+		push @html, "<tr><td class=\"sent_title\">N-grams</td><td class=\"sysout_ngrams\" id=\"ngrams$numSentences\">" 
+								. getAllNgramsHTML($sFactors[$j], $bleuData[$j]->[1], scalar(@truthfiles)) . "</td></tr>\n";
 	}
-	my $j = 0;
-	$html .= "<tr><td class=\"sent_title\">Output</td><td class=\"sysout_sentence\" id=\"sysout$i\">" . getFactoredSentenceHTML(\@sFactors, $pwerData) . "</td></tr>\n";
-	$j = 0;
-	$html .= "<tr><td class=\"sent_title\">N-grams</td><td class=\"sysout_ngrams\" id=\"ngrams$i\">" . getAllNgramsHTML(\@sFactors, $bleuData->[1], scalar(@truthfiles)) . "</td></tr>\n";
-	$html .= "</table></div>\n";
-	push @htmlSentences, $html;
-	$i++;
+	splice(@html, 1, 0, "<div class=\"bleu_report\"><b>Sentence $numSentences)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;BLEU:</b> " 
+							. join("; ", map {sprintf("%.4lg", $_->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$_->[0]}[1 .. 4]) . ") "} @bleuData) . "</div><table>\n");
+	push @html, "</table></div>\n";
+	push @htmlSentences, join('', @html);
+	$numSentences++;
+	@sLines = (); @eLines = (); #clear writable arrays to be refilled
 }
-close(SYSOUT);
+foreach my $sysoutfh (@SYSOUTS) {close($sysoutfh);}
 foreach my $truthfh (@TRUTHS) {close($truthfh);}
 
-rankSentencesByBLEU(\@bleuScores);
-my $stylesheet = <<EOHTML;
-<style type="text/css">
+my $stylesheet = "<style type=\"text/css\">
 .legend {background: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 10px; margin-right: 15px}
 .legend_title {font-weight: bold; font-size: medium; text-decoration: underline}
-div.sentence {background: #ffffee; border: 1px solid #000088; padding: 0px 8px 0px 8px} //entire composition for a given sentence
-div.sentence td {margin: 8px 0px 8px 0px}
 div.bleu_report {margin-bottom: 5px}
 td.sent_title {font-weight: bold; font-size: medium; margin-bottom: 12px}
 .source_sentence {background: #ffcccc; border: 1px solid #bbb}
@@ -105,25 +113,67 @@ td.sent_title {font-weight: bold; font-size: medium; margin-bottom: 12px}
 table.sentence_table {border: none}
 .sysout_ngrams {background: #fff; border: 1px solid #bbb}
 table.ngram_table {}
-td.ngram_cell {padding: 1px}
-</style>
-EOHTML
-print "<html><head>\n";
-print "<meta http-equiv=\"Content-type: text/html; charset=utf-8\">\n";
-print "<title>$sysoutfile vs. [" . join(' ', @truthfiles) . "]: Sentence-by-sentence Comparison</title>$stylesheet</head><body>\n";
+td.ngram_cell {padding: 1px}\n";
+for(my $i = 0; $i < scalar(@htmlColors); $i++)
+{
+	$stylesheet .= ".sentence_tier$i {background: $htmlColors[$i]; border: 1px solid #000088; padding: 0px 8px 0px 8px} //entire composition for a given sentence\n";
+	$stylesheet .= "div.sentence_tier$i td {margin: 8px 0px 8px 0px}\n";
+}
+$stylesheet .= "</style>\n";
+
+print "<html><head><meta http-equiv=\"Content-type: text/html; charset=utf-8\">\n";
+print "<title>[" . join(', ', @sysoutfiles) . "] vs. [" . join(', ', @truthfiles) . "]: Sentence-by-Sentence Comparison</title>$stylesheet</head><body>\n";
 
-#javascript to sort by BLEU, by order in corpus, ...
-my %rank2index = map {$bleuScores[$_]->[2] => $_} (0 .. scalar(@htmlSentences) - 1);
+foreach my $systemScores (@bleuScores) {rankSentencesByBLEU($systemScores);}
+#javascript to sort by BLEU for any system, by order in corpus ...
 print "<script type=\"text/javascript\">
+var selectedSysout = 0; //index of system currently being used to rank/sort
+
+function selectSysout(index)
+{
+	//update the BLEU-range text shown in the legend
+	var legend = document.getElementById('legendBLEU');
+	var rows = legend.getElementsByTagName('tr');
+	for(var i = 0; i < rows.length; i++)
+	{
+		var cell = rows[i].childNodes[1];
+		var spans = cell.getElementsByTagName('span');
+		cell.childNodes[0].nodeValue = spans[index].firstChild.nodeValue; //something like '0.1 - 0.3'
+	}
+	
+	//update the background colors of the sentence divs
+	var allSentences = document.getElementById('all_sentences');
+	var sentences = allSentences.childNodes;
+	for(var i = 0; i < sentences.length; i++)
+	{
+		if(typeof sentences[i].tagName != 'undefined' && sentences[i].tagName.toLowerCase() == 'div') //text nodes have undefined tagName
+		{
+			var tierSpans = sentences[i].firstChild.childNodes;
+			sentences[i].childNodes[2].className = tierSpans[index].firstChild.nodeValue; //something like 'tier3'
+		}
+	}
+	selectedSysout = index; //selectedSysout is a flag to the sort functions
+}
+
 function sortByBLEU()
 {
-	var body = document.getElementById('all_sentences'); var row;\n";
-foreach my $rank (sort {$a <=> $b} keys %rank2index)
+	var body = document.getElementById('all_sentences'); var row;
+	switch(selectedSysout)
+	{\n";
+for(my $i = 0; $i < scalar(@sysoutfiles); $i++)
 {
-	print "\trow = document.getElementById('everything" . $rank2index{$rank} . "');\n";
-	print "\tbody.removeChild(row); body.appendChild(row);\n";
+	print "case $i:
+	{";
+	my %rank2index = map {$bleuScores[$i]->[$_]->[2] => $_} (0 .. scalar(@htmlSentences) - 1);
+	foreach my $rank (sort {$a <=> $b} keys %rank2index)
+	{
+		print "\trow = document.getElementById('everything" . $rank2index{$rank} . "');\n";
+		print "\tbody.removeChild(row); body.appendChild(row);\n";
+	}
+	print "break;}\n";
 }
 print "}
+}
 function sortByCorpusOrder()
 {
 	var body = document.getElementById('all_sentences'); var row;\n";
@@ -135,42 +185,57 @@ for(my $j = 0; $j < scalar(@htmlSentences); $j++)
 print "}
 </script>\n";
 
-#legend for background colors
-my @minBLEU = (1e9) x scalar(@htmlColors);
-my @maxBLEU = (-1e9) x scalar(@htmlColors);
-for(my $k = 0; $k < scalar(@htmlSentences); $k++)
+#legends for background colors of sentences and n-grams
+my (@minBLEU, @maxBLEU);
+my @bleuTiers = () x scalar(@htmlSentences); #for each sentence, arrayref of tier indices for each system
+for(my $i = 0; $i < scalar(@sysoutfiles); $i++)
 {
-	my $tier = int($bleuScores[$k]->[2] / (scalar(@htmlSentences) / scalar(@htmlColors)));
-	if($bleuScores[$k]->[1]->[0] < $minBLEU[$tier]) {$minBLEU[$tier] = $bleuScores[$k]->[1]->[0];}
-	elsif($bleuScores[$k]->[1]->[0] > $maxBLEU[$tier]) {$maxBLEU[$tier] = $bleuScores[$k]->[1]->[0];}
+	my @a = (1e9) x scalar(@htmlColors);
+	my @b = (-1e9) x scalar(@htmlColors);
+	for(my $k = 0; $k < scalar(@htmlSentences); $k++)
+	{
+		my $tier = int($bleuScores[$i]->[$k]->[2] / (scalar(@htmlSentences) / scalar(@htmlColors)));
+		push @{$bleuTiers[$k]}, $tier;
+		if($bleuScores[$i]->[$k]->[1]->[0] < $a[$tier]) {$a[$tier] = $bleuScores[$i]->[$k]->[1]->[0];}
+		if($bleuScores[$i]->[$k]->[1]->[0] > $b[$tier]) {$b[$tier] = $bleuScores[$i]->[$k]->[1]->[0];}
+	}
+	push @minBLEU, \@a;
+	push @maxBLEU, \@b;
 }
-print "<table border=0><tr><td><div class=\"legend\"><span class=\"legend_title\">Sentence Background Colors => BLEU Ranges</span><table border=0>";
+print "<table border=0><tr><td><div id=\"legendBLEU\" class=\"legend\"><span class=\"legend_title\">Sentence Background Colors => BLEU Ranges</span><table border=0>";
 for(my $k = 0; $k < scalar(@htmlColors); $k++)
 {
 	print "<tr><td style=\"width: 15px; height: 15px; background: " . $htmlColors[$k] . "\"></td><td align=left style=\"padding-left: 12px\">" 
-							. sprintf("%.4lg", $minBLEU[$k]) . " - " . sprintf("%.4lg", $maxBLEU[$k]) . "</td>";
+							. sprintf("%.4lg", $minBLEU[0]->[$k]) . " - " . sprintf("%.4lg", $maxBLEU[0]->[$k]);
+	for(my $j = 0; $j < scalar(@sysoutfiles); $j++)
+	{
+		print "<span style=\"display: none\">" . sprintf("%.4lg", $minBLEU[$j]->[$k]) . " - " . sprintf("%.4lg", $maxBLEU[$j]->[$k]) . "</span>";
+	}
+	print "</td></tr>";
 }
 print "</table></div></td>\n";
 print "<td><div class=\"legend\"><span class=\"legend_title\">N-gram Colors => Number of Matching Reference Translations</span><table border=0>";
 for(my $k = 1; $k <= scalar(@truthfiles); $k++)
 {
-	print "<tr><td style=\"width: 15px; height: 15px; background: " . getNgramColorHTML($k, scalar(@truthfiles)) . "\"></td><td align=left style=\"padding-left: 12px\">$k</td>";
+	print "<tr><td style=\"width: 15px; height: 15px; background: " . getNgramColorHTML($k, scalar(@truthfiles)) . "\"></td><td align=left style=\"padding-left: 12px\">$k</td></tr>";
 }
 print "</table></div></td></tr></table><div style=\"font-weight: bold; margin-bottom: 15px\">
 PWER errors are marked in red on output sentence displays.</div>
+<div style=\"margin-bottom: 8px\">Color by system # " 
+						. join(' | ', map {"<a href=\"javascript:selectSysout($_);\">$_</a>" . (($_ == '0') ? " (default)" : "")} (0 .. scalar(@sysoutfiles) - 1)) . "</div>
 <div style=\"margin-bottom: 8px\">Sort by <a href=\"javascript:sortByBLEU();\">BLEU score</a> | <a href=\"javascript:sortByCorpusOrder();\">corpus order</a> (default)</div>\n";
 
 #sentence boxes
 print "<div id=\"all_sentences\">";
-my $j = 0;
-foreach my $sentenceHTML (@htmlSentences)
+for(my $j = 0; $j < scalar(@htmlSentences); $j++)
 {
 	print "<div id=\"everything$j\" style=\"margin: 0px; padding: 0px\">";
+	print "<div class=\"ranks_container\" style=\"display: none\">" . join('', map {"<span>sentence_tier$_</span>"} @{$bleuTiers[$j]}) . "</div>";
 	print "<hr width=98%>";
-	my $bgcolor = getSentenceBGColorHTML($bleuScores[$j], $i); #i is now # of sentences
-	$sentenceHTML =~ s/%%%%/$bgcolor/;
-	print "$sentenceHTML</div>\n";
-	$j++;
+#	my $bgcolor = getSentenceBGColorHTML($bleuScores[0]->[$j], $i); #i is now # of sentences
+	my $tierNum = $bleuTiers[$j]->[0];
+	$htmlSentences[$j] =~ s/%%%%/tier$tierNum/;
+	print "$htmlSentences[$j]</div>\n";
 }
 print "</div></body></html>";
 
@@ -211,7 +276,7 @@ sub round
 }
 
 #escape HTML metacharacters for display purposes and to allow for consistent string comparison
-#arguments: string to be formatted
+#arguments: string to be formatted in place
 #return: none
 sub escapeMetachars
 {
@@ -223,13 +288,27 @@ sub escapeMetachars
 
 ###############################################################################################################################################################
 
-#arguments: line read from corpus file, (optionally) string to die with if line isn't defined (default die-msg is empty)
+#read one line from each of any number of filehandles
+#arguments: arrayref of filehandles, (empty) arrayref to be filled with read lines
+#return: 1 on success, 0 on failure (on failure the lines arrayref's value isn't defined)
+sub readLines
+{
+	my ($refFilehandles, $refLines) = @_;
+	foreach my $fh (@$refFilehandles)
+	{
+		my $line = <$fh>;
+		return 0 unless defined($line);
+		push @$refLines, $line;
+	}
+	return 1;
+}
+
+#arguments: line read from corpus file
 #return: sentence (arrayref of arrayrefs of factor strings) taken from line
 sub extractFactorArrays
 {
-	my ($line, $msg) = (shift, '');
-	$msg = shift if scalar(@_);
-	die $msg if !defined $line;
+	my $line = shift;
+	die "" if !defined $line;
 	chomp $line;
 	$line =~ s/^\s*|\s*$//g; #added by Ondrej to handle moses-mert-parallel output
 	my @words = split(/\s+/, $line);
@@ -284,15 +363,17 @@ sub getBLEUSentenceDetails
 			{
 				$correct[$k]++;
 				my @indices = ();
+				my $notOvercounting = 0; #make sure we don't 'match' against truth n-grams whose instances have all been used already
 				for(my $m = 0; $m < scalar(@{$REF_GRAM{$gram}}); $m++)
 				{
 					if($REF_GRAM{$gram}->[$m] > 0)
 					{
 						push @indices, $m;
 						$REF_GRAM{$gram}->[$m]--;
+						$notOvercounting = 1;
 					}
 				}
-				push @$ngramMatches, [$i - $k, $k + 1, \@indices];
+				if($notOvercounting == 1) {push @$ngramMatches, [$i - $k, $k + 1, \@indices];}
 			}
 		}
 	}
diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm
index 5a2753fdf..f4629d5a2 100644
--- a/scripts/analysis/smtgui/Corpus.pm
+++ b/scripts/analysis/smtgui/Corpus.pm
@@ -99,7 +99,7 @@ sub calcUnknownTokens
 {
 	my ($self, $factorName) = @_;
 	#check in-memory cache first
-	if(defined($self->{'unknownCount'}->{$factorName}))
+	if(exists $self->{'unknownCount'}->{$factorName} && exists $self->{'tokenCount'}->{'input'})
 	{
 		return ($self->{'unknownCount'}->{$factorName}, $self->{'tokenCount'}->{'input'});
 	}
@@ -179,7 +179,7 @@ sub calcOverallWER
 	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
 	if(scalar(@_) > 0) {$factorName = shift;}
 	#check in-memory cache first
-	if(defined($self->{'sysoutWER'}->{$sysname}->{$factorName}))
+	if(exists $self->{'sysoutWER'}->{$sysname}->{$factorName})
 	{
 		return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0];
 	}
@@ -209,7 +209,7 @@ sub calcOverallPWER
 	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
 	if(scalar(@_) > 0) {$factorName = shift;}
 	#check in-memory cache first
-	if(defined($self->{'sysoutPWER'}->{$sysname}->{$factorName}))
+	if(exists $self->{'sysoutPWER'}->{$sysname}->{$factorName})
 	{
 		return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0];
 	}
@@ -353,7 +353,6 @@ sub statisticallyTestBLEUResults
 		$devs[$i] = sqrt($devs[$i] / ($k - 1));
 		$t->[$i] = ($fullCorpusBLEU->[$i + 1] / 100 - $means[$i]) / $devs[$i];
 		push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0]}, getLowerBoundPValue($t->[$i]); #p-value for overall score vs. subset average
-#		warn "$i: mean " . $means[$i] . ", dev " . $devs[$i] . ", t " . $t->[$i] . ", conf " . getLowerBoundPValue($t->[$i]) . "\n";
 		push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1]}, 
 							[$means[$i] - $criticalTStat * $devs[$i] / sqrt($k), $means[$i] + $criticalTStat * $devs[$i] / sqrt($k)]; #the confidence interval
 	}
@@ -384,9 +383,9 @@ sub calcPerplexity
 	my $cmd = "perl ./extract-factors.pl $sysoutFilename " . $self->{'factorIndices'}->{$factorName} . " > $tmpfile";
 	`$cmd`; #extract just the factor we're interested in; ngram doesn't understand factored notation
 	my @output = `./ngram -lm $lmFilename -ppl $tmpfile`; #run the SRI n-gram tool
-	`rm $tmpfile`;
+	`rm -f $tmpfile`;
 	$output[1] =~ /ppl1=\s*([0-9\.]+)/;
-	$self->{'perplexity'}->{$sysname} = $1;
+	$self->{'perplexity'}->{$sysname}->{$factorName} = $1;
 	return $self->{'perplexity'}->{$sysname}->{$factorName};
 }
 
@@ -511,8 +510,8 @@ sub writeCacheFile
 	{
 		my $ext = shift;
 		#check for a previously read value
-		if(exists $self->{'fileCtimes'}->{$ext}) {print CACHEFILE $self->{'corpusName'} . ".$ext " . $self->{'fileCtimes'}->{$ext} . "\n";}
-		else {print CACHEFILE $self->{'corpusName'} . ".$ext " . time . "\n";}
+		if(exists $self->{'fileCtimes'}->{$ext} && $self->cacheIsCurrentForFile($ext)) {print CACHEFILE "$ext " . $self->{'fileCtimes'}->{$ext} . "\n";}
+		else {print CACHEFILE "$ext " . time . "\n";} #our info must just have been calculated
 	};
 	if(exists $self->{'truthFilename'}) {&$ensureCtimeIsOutput('e');}
 	if(exists $self->{'inputFilename'}) {&$ensureCtimeIsOutput('f');}
@@ -619,16 +618,16 @@ sub loadCacheFile
 	while(my $line = <CACHEFILE>)
 	{
 		next if $line =~ /^[ \t\n\r\x0a]*$/; #anyone know why char 10 (0x0a) shows up on empty lines, at least on solaris?
-		chop $line;
+		chomp $line;
 		#check for start of section
-		if($line eq "File changetimes\n") {$mode = 'ctime';}
-		elsif($line eq "BLEU scores\n") {$mode = 'bleu';}
-		elsif($line eq "BLEU statistics\n") {$mode = 'bstats';}
-		elsif($line eq "Statistical comparisons\n") {$mode = 'cmp';}
-		elsif($line eq "Unknown-token counts\n") {$mode = 'unk';}
-		elsif($line eq "WER scores") {$mode = 'wer';}
-		elsif($line eq "Perplexity") {$mode = 'ppl';}
-		elsif($line eq "NN/ADJ WER/PWER") {$mode = 'nawp';}
+		if($line =~ /File changetimes/) {$mode = 'ctime';}
+		elsif($line =~ /BLEU scores/) {$mode = 'bleu';}
+		elsif($line =~ /BLEU statistics/) {$mode = 'bstats';}
+		elsif($line =~ /Statistical comparisons/) {$mode = 'cmp';}
+		elsif($line =~ /Unknown-token counts/) {$mode = 'unk';}
+		elsif($line =~ /WER scores/) {$mode = 'wer';}
+		elsif($line =~ /Perplexity/) {$mode = 'ppl';}
+		elsif($line =~ /NN\/ADJ WER\/PWER/) {$mode = 'nawp';}
 		#get data when in a mode already
 		elsif($mode eq 'ctime')
 		{
@@ -638,7 +637,7 @@ sub loadCacheFile
 		elsif($mode eq 'bleu')
 		{
 			local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
-			if(!$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e')) {next;}
+			next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
 			if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};}
 			if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
 			my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest);
@@ -648,7 +647,7 @@ sub loadCacheFile
 		elsif($mode eq 'bstats')
 		{
 			local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
-			if(!$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e')) {next;}
+			next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
 			if(!exists $self->{'bleuConfidence'}->{$sysname}) {$self->{'bleuConfidence'}->{$sysname} = {};}
 			if(!exists $self->{'bleuConfidence'}->{$sysname}->{$factorName}) {$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [[], []];}
 			my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest);
@@ -658,7 +657,7 @@ sub loadCacheFile
 		elsif($mode eq 'cmp')
 		{
 			local ($sysname1, $sysname2, $factorName, $rest) = split(/\s+/, $line, 4);
-			if(!$self->cacheIsCurrentForFile($sysname1) || !$self->cacheIsCurrentForFile($sysname2) || !$self->cacheIsCurrentForFile('e')) {next;}
+			next if !$self->cacheIsCurrentForFile($sysname1) || !$self->cacheIsCurrentForFile($sysname2) || !$self->cacheIsCurrentForFile('e');
 			if(!exists $self->{'comparisonStats'}->{$sysname1}) {$self->{'comparisonStats'}->{$sysname1} = {};}
 			if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2} = {};}
 			if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [];}
@@ -668,7 +667,7 @@ sub loadCacheFile
 		elsif($mode eq 'unk')
 		{
 			local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line);
-			if(!$self->cacheIsCurrentForFile('f') || !$self->cacheIsCurrentForFile("pt_$factorName")) {next;}
+			next if !$self->cacheIsCurrentForFile('f') || !$self->cacheIsCurrentForFile("pt_$factorName");
 			if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename)
 			{
 				$self->{'unknownCount'}->{$factorName} = $unknownCount;
@@ -678,7 +677,7 @@ sub loadCacheFile
 		elsif($mode eq 'wer')
 		{
 			local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER'
-			if(!$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e')) {next;}
+			next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
 			$details =~ /^([^;]*);(.*)/;
 			my @sentenceWERs = split(/\s+/, $1);
 			if(!exists $self->{$werType}->{$sysname}) {$self->{$werType}->{$sysname} = {};}
@@ -693,16 +692,20 @@ sub loadCacheFile
 		elsif($mode eq 'ppl')
 		{
 			local ($sysname, $factorName, $perplexity) = split(/\s+/, $line);
+			next if !$self->cacheIsCurrentForFile($sysname);
 			if(!exists $self->{'perplexity'}->{$sysname}) {$self->{'perplexity'}->{$sysname} = {};}
 			$self->{'perplexity'}->{$sysname}->{$factorName} = $perplexity;
 		}
 		elsif($mode eq 'nawp')
 		{
 			local ($sysname, @scores) = split(/\s+/, $line);
+			next if !$self->cacheIsCurrentForFile($sysname);
 			$self->{'nnAdjWERPWER'}->{$sysname} = \@scores;
 		}
 	}
 	close(CACHEFILE);
+#	print STDERR "\nafter load cache:\n";
+#	$self->printDetails();
 }
 
 #arguments: cache type ('bleu' | ...), system name, factor name
@@ -724,8 +727,11 @@ sub flushCache
 sub cacheIsCurrentForFile
 {
 	my ($self, $ext) = @_;
-	return 0 if(!exists $self->{'fileCtimes'}->{$ext});
+	print STDERR "cicff($self, $ext)\n";
+	return 0 if !exists $self->{'fileCtimes'}->{$ext} ;
+	print STDERR "  $ext exists in ctimes\n";
 	my @liveStats = stat($self->{'corpusName'} . ".$ext");
+	print STDERR "  time stat: " . $liveStats[9] . "\n";
 	return ($liveStats[9] <= $self->{'fileCtimes'}->{$ext}) ? 1 : 0;
 }
 
@@ -1309,3 +1315,26 @@ sub printSingleSentenceComparison
 	print "</div>\n";
 	select $curFH;
 }
+
+#print contents of all fields of this object, with useful formatting for arrayrefs and hashrefs
+#arguments: none
+#return: none
+sub printDetails
+{
+	my $self = shift;
+	foreach my $key (keys %$self)
+	{
+		if(ref($self->{$key}) eq 'HASH')
+		{
+			print STDERR "obj: $key => {" . join(', ', map {"$_ => " . $self->{$key}->{$_}} (keys %{$self->{$key}})) . "}\n";
+		}
+		elsif(ref($self->{$key}) eq 'ARRAY')
+		{
+			print STDERR "obj: $key => (" . join(', ', @{$self->{$key}}) . ")\n";
+		}
+		elsif(ref($self->{$key}) eq '') #not a reference
+		{
+			print STDERR "obj: $key => " . $self->{$key} . "\n";
+		}
+	}
+}
diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi
index a31ac558e..0e5cd24b5 100755
--- a/scripts/analysis/smtgui/newsmtgui.cgi
+++ b/scripts/analysis/smtgui/newsmtgui.cgi
@@ -83,6 +83,7 @@ sub view_corpus {
   
   # find corpora in evaluation directory
   my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-info_line' => $factorData{$in{CORPUS}});
+  $corpus->printDetails();
   
   my ($sentence_count, $lineInfo);
   if(-e "$in{CORPUS}.f")
@@ -126,10 +127,13 @@ sub view_corpus {
   print "<TD>Surface vs. lemma PWER</TD>"; #can't sort on; only applies to sysoutputs
 	print "<TD>Statistical Measures</TD>";
 
-  open(DIR,"ls $in{CORPUS}.*|");
-  while(<DIR>) {
-    my $sort = "";
-    chop;
+  opendir(DIR, ".") or die "couldn't open '.' for read";
+  my @filenames = readdir(DIR); #includes . and ..
+  closedir(DIR);
+  foreach $_ (@filenames)
+  {
+  	next if -d $_; #if is a directory
+	print STDERR "file: $_\n";
     my $sgm = 0;
     if (/.sgm$/)
 	 {
@@ -142,8 +146,9 @@ sub view_corpus {
 	 	`wc -l $_` =~ /^\s*(\d+)\s+/;
 		next unless $1 == $sentence_count;
     }
-    /^$in{CORPUS}.([^\/]+)$/;
+	 next unless /^$in{CORPUS}\.([^\/]+)$/;
     my $file = $1;
+	 my $sort = "";
     # checkbox for compare
     my $row = "<TR><TD style=\"font-size: small\"><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>";
     # README
@@ -185,6 +190,7 @@ sub view_corpus {
       $row .= "<TD>";
       if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e" && $file !~ /^pt/) {
 	my ($score,$p1,$p2,$p3,$p4,$bp) = $corpus->calcBLEU($file, 'surf');
+	print STDERR "193: `$score `$p1 `$p2 `$p3 `$p4 `$bp\n";
 	$row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f", $score, $p1, $p2, $p3, $p4, $bp);
 	if (defined($in{SORT}) && $in{SORT} eq 'IBM') { $sort = $score; }
       }
@@ -200,7 +206,7 @@ sub view_corpus {
       print "$DONTSCORE{$file}+";
       my ($nist,$nist_bleu);
       if ($file =~ /sgm$/) {
-	($nist,$nist_bleu) = &get_nist_score("$in{CORPUS}.ref.sgm","$in{CORPUS}.src.sgm","$in{CORPUS}.$file");
+	($nist,$nist_bleu) = get_nist_score("$in{CORPUS}.ref.sgm","$in{CORPUS}.src.sgm","$in{CORPUS}.$file");
 	$row .= sprintf("<B>%.04f</B>",$nist);
 	if ($in{SORT} eq 'NIST') { $sort = $nist; }
       }
@@ -218,7 +224,7 @@ sub view_corpus {
     if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e") {
       $row .= "<TD>";
       if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e") {
-	my ($score,$p1,$p2,$p3,$p4,$bp) = &get_multi_bleu_score("$in{CORPUS}.f","$in{CORPUS}.e","$in{CORPUS}.$file");
+	my ($score,$p1,$p2,$p3,$p4,$bp) = get_multi_bleu_score("$in{CORPUS}.f","$in{CORPUS}.e","$in{CORPUS}.$file");
 	$row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f",$score,$p1,$p2,$p3,$p4,$bp);
 	if ($in{SORT} eq 'mBLEU') { $sort = $score; }
       }
@@ -303,7 +309,7 @@ sub view_corpus {
     my($correct,$wrong,$unknown);
     $row .= "<TD>";
     if (!defined($DONTSCORE{$file}) && (scalar keys %MEMORY)) {
-      my ($correct,$just_syn,$just_sem,$wrong,$unknown) = &get_score_from_memory("$in{CORPUS}.$FOREIGN",
+      my ($correct,$just_syn,$just_sem,$wrong,$unknown) = get_score_from_memory("$in{CORPUS}.$FOREIGN",
 			       "$in{CORPUS}.$file");
       $row .= "<B><FONT COLOR=GREEN>$correct</FONT></B>";
       $row .= "/<FONT COLOR=ORANGE>$just_syn</FONT>";
@@ -897,6 +903,7 @@ sub trim {
 sub load_descriptions {
   open(FD,"file-descriptions") or die "load_descriptions(): couldn't open 'file-descriptions' for read\n";
   while(<FD>) {
+  	chomp;
     my($file,$description) = split(/\s+/,$_,2);
     $FILEDESC{$file} = $description;
   }
author	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-16 18:49:10 +0400
committer	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-16 18:49:10 +0400
commit	c34aca3053a675a56334db1a476ff3f67d9777fa (patch)
tree	2b6428982dbdef51c89860de4d384d3bbf02b0c6 /scripts/analysis
parent	be6b9d8ce586034c3e0f4df700cdda890805d94f (diff)