added statistical testing, both to compare different outputs and to get a confidence measure for a single output

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@529 1f5c12ca-751b-0410-a591-d2e778427230
author: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-07 21:22:39 +0400
committer: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-07 21:22:39 +0400
commit: 8420ecf516204528e0de956f392c24d38d6d86cb (patch)
tree: 2905b3fb3a41afba33fd9123bad16869a00f470b /scripts/analysis
parent: ac34c07dc003957393804a39e4da58491df742ab (diff)
4 files changed, 549 insertions, 95 deletions
diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm
index f191ce273..e77f9b384 100644
--- a/scripts/analysis/smtgui/Corpus.pm
+++ b/scripts/analysis/smtgui/Corpus.pm
@@ -12,33 +12,55 @@ return 1;
 
 ###########################################################################################################################
 
-#'our' variables are available outside the package
+##### 'our' variables are available outside the package #####
+#all factor names used should be in this list, just in case
 our @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph');
 
 #constructor
-#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), hashref of factor names to indices in this corpus (-indices)
+#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), formatted string with various config info (-info_line)
 sub new
 {
 	my $class = shift;
 	my %args = @_; #turn the remainder of @_ into a hash
-	my ($corpusName, $refFileDescs, $factorIndices) = ($args{'-name'}, $args{'-descriptions'}, $args{'-indices'});
+	my ($corpusName, $refFileDescs, $infoLine) = ($args{'-name'}, $args{'-descriptions'}, $args{'-info_line'});
+	my ($factorList, $inputLingmodels, $outputLingmodels) = split(/\s*:\s*/, $infoLine);
 	my $self = {};
 	$self->{'corpusName'} = $corpusName;
 	$self->{'truth'} = []; #arrayref of arrayrefs of factors
 	$self->{'input'} = []; #same; also same for any system outputs that get loaded
+	$self->{'tokenCount'} = {}; #sysname => number of tokens in file
 	$self->{'truthFilename'} = "";
 	$self->{'inputFilename'} = "";
 	$self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename)
-	$self->{'phraseTableFilenames'} = {}; #factor name (from @FACTORNAMES) => filename
-	$self->{'factorIndices'} = {}; #factor name => index; names ought to be strings from the standard list in @FACTORNAMES
-	%{$self->{'factorIndices'}} = %$factorIndices;
+	$self->{'phraseTableFilenames'} = {}; #factor name => filename
+	$self->{'fileCtimes'} = {}; #file ID of some kind => changetime in seconds
+	$self->{'factorIndices'} = {}; #factor name => index
+	my @factors = split(/\s+/, $factorList);
+	for(my $i = 0; $i < scalar(@factors); $i++)
+	{
+		$self->{'factorIndices'}->{$factors[$i]} = $i;
+	}
+	$self->{'inputLMs'} = {}; #factor name => lingmodel filename
+	$self->{'outputLMs'} = {};
+	foreach my $lmInfo (split(/\s*,\s*/, $inputLingmodels))
+	{
+		my @tokens = split(/\s+/, $lmInfo);
+		$self->{'inputLMs'}->{$tokens[0]} = $tokens[1];
+	}
+	foreach my $lmInfo (split(/\s*,\s*/, $outputLingmodels))
+	{
+		my @tokens = split(/\s+/, $lmInfo);
+		$self->{'outputLMs'}->{$tokens[0]} = $tokens[1];
+	}
 	$self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting
 	$self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input
-	$self->{'totalTokens'} = 0; #useful with counts of unknown tokens
 	$self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth)
 	$self->{'sysoutPWER'} = {}; #similarly
 	$self->{'fileDescriptions'} = {}; #filename associated with us => string description of file
 	$self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) )
+	$self->{'bleuConfidence'} = {}; #system name => (factor name => arrayrefs holding statistical test data on BLEU scores)
+	$self->{'subsetBLEUstats'} = {}; #system name => (factor name => n-gram precisions and lengths for independent corpus subsets)
+	$self->{'comparisonStats'} = {}; #system name 1 => (system name 2 => (factor name => p-values, and indices of better system, for all tests used))
 	$self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here
 	bless $self, $class;
 	$self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions
@@ -59,12 +81,12 @@ sub getFileDescription
 	return $self->{'fileDescriptions'}->{$filename};
 }
 
-#arguments: hashref of factor names to indices (not all factors in @FACTORNAMES must be included, but no strings not from the list should be)
-#return: none
-sub setFactorIndices
+#arguments: none
+#return: list of system names (NOT including 'input', 'truth' and other special cases)
+sub getSystemNames
 {
-	my ($self, $refIndices) = @_;
-	%{$self->{'factorIndices'}} = %{$refIndices};
+	my $self = shift;
+	return keys %{$self->{'sysoutFilenames'}};
 }
 
 #calculate the number of unknown factor values for the given factor in the input file
@@ -77,7 +99,7 @@ sub calcUnknownTokens
 	#check in-memory cache first
 	if(defined($self->{'unknownCount'}->{$factorName}))
 	{
-		return ($self->{'unknownCount'}->{$factorName}, $self->{'totalTokens'});
+		return ($self->{'unknownCount'}->{$factorName}, $self->{'tokenCount'}->{'input'});
 	}
 	
 	$self->ensureFilenameDefined('input');
@@ -101,7 +123,7 @@ sub calcUnknownTokens
 		}
 	}
 	$self->{'unknownCount'}->{$factorName} = $unknownTokens;
-	$self->{'totalTokens'} = $totalTokens;
+	$self->{'tokenCount'}->{'input'} = $totalTokens;
 	
 	return ($unknownTokens, $totalTokens);
 }
@@ -123,12 +145,8 @@ sub calcNounAdjWER_PWERDiff
 	my $nnNadjTags = $self->getPOSTagList('nounAndAdj');
 	for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)
 	{
-#		warn "truth " . join(', ', map {join('|', @$_)} @{$self->{'truth'}->[$i]}) . "\n";
-#		warn "sysout " . join(', ', map {join('|', @$_)} @{$self->{$sysname}->[$i]}) . "\n";
 		my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
 		my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
-#		warn "filtered truth: " . join(' ', map {join('|', @$_)} @nnAdjEWords) . "\n";
-#		warn "filtered sysout: " . join(' ', map {join('|', @$_)} @nnAdjSWords) . "\n\n\n";
 		my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
 		$werScore += $sentWer;
 		($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
@@ -138,7 +156,7 @@ sub calcNounAdjWER_PWERDiff
 	#unhog memory
 	$self->releaseSentences('truth');
 	$self->releaseSentences($sysname);
-	return ($werScore, $pwerScore);
+	return ($werScore / $self->{'tokenCount'}->{'truth'}, $pwerScore / $self->{'tokenCount'}->{'truth'});
 }
 
 #calculate detailed WER statistics and put them into $self
@@ -167,7 +185,7 @@ sub calcOverallWER
 	#unhog memory
 	$self->releaseSentences('truth');
 	$self->releaseSentences($sysname);
-	return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0];
+	return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};
 }
 
 #calculate detailed PWER statistics and put them into $self
@@ -196,7 +214,7 @@ sub calcOverallPWER
 	#unhog memory
 	$self->releaseSentences('truth');
 	$self->releaseSentences($sysname);
-	return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0];
+	return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};
 }
 
 #arguments: system name, factor name to consider (default 'surf')
@@ -206,7 +224,7 @@ sub calcBLEU
 	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
 	if(scalar(@_) > 0) {$factorName = shift;}
 	#check in-memory cache first
-	if(defined($self->{'bleuScores'}->{$sysname}->{$factorName}))
+	if(exists $self->{'bleuScores'}->{$sysname} && exists $self->{'bleuScores'}->{$sysname}->{$factorName})
 	{
 		return $self->{'bleuScores'}->{$sysname}->{$factorName};
 	}
@@ -218,8 +236,8 @@ sub calcBLEU
 	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
 	
 	#score structure: various total scores, arrayref of by-sentence score arrays
-	if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};}
-	if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
+	if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};}
+	if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
 	
 	my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 	my $factorIndex = $self->{'factorIndices'}->{$factorName};
@@ -237,12 +255,14 @@ sub calcBLEU
 		$totRLength += $rLength;
 	}
 	my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength);
-	my ($pct1, $pct2, $pct3, $pct4) = ($good1 / $tot1, $good2 / $tot2, $good3 / $tot3, $good4 / $tot4);
-#	warn sprintf("n-gram prec: %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf\n", 
-#						$good1, $tot1, $pct1, $good2, $tot2, $pct2, $good3, $tot3, $pct3, $good4, $tot4, $pct4);
-	my $bleu = $brevity * exp((my_log($pct1) + my_log($pct2) + my_log($pct3) + my_log($pct4)) / 4);
-#	warn sprintf("brevity: %.3lf (%d ref, %d sys)\n", $brevity, $totRLength, $totCLength);
-	sleep 8;
+	my ($pct1, $pct2, $pct3, $pct4) = ($tot1 == 0 ? -1 : $good1 / $tot1, $tot2 == 0 ? -1 : $good2 / $tot2, 
+													$tot3 == 0 ? -1 : $good3 / $tot3, $tot4 == 0 ? -1 : $good4 / $tot4);
+	my ($logsum, $logcount) = (0, 0);
+	if($tot1 > 0) {$logsum += my_log($pct1); $logcount++;}
+	if($tot2 > 0) {$logsum += my_log($pct2); $logcount++;}
+	if($tot3 > 0) {$logsum += my_log($pct3); $logcount++;}
+	if($tot4 > 0) {$logsum += my_log($pct4); $logcount++;}
+	my $bleu = $brevity * exp($logsum / $logcount);
 	$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity];
   
 	#unhog memory
@@ -251,8 +271,161 @@ sub calcBLEU
 	return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]};
 }
 
+#do t-tests on the whole-corpus n-gram precisions vs. the average precisions over a set number of disjoint subsets
+#arguments: system name, factor name BLEU was run on (default 'surf')
+#return: arrayref of [arrayref of p-values for overall precision vs. subset average, arrayrefs of [(lower, upper) 95% credible intervals for true overall n-gram precisions]]
+#
+#written to try to save memory
+sub statisticallyTestBLEUResults
+{
+	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
+	if(scalar(@_) > 0) {$factorName = shift;}
+	my $k = 30; #HARDCODED NUMBER OF SUBSETS (WE DO k-FOLD CROSS-VALIDATION); IF YOU CHANGE THIS YOU MUST ALSO CHANGE getApproxPValue() and $criticalTStat
+	my $criticalTStat = 2.045; #hardcoded value given alpha (.025 here) and degrees of freedom (= $k - 1) ########################################
+	$self->ensureFilenameDefined('truth');
+	$self->ensureFilenameDefined($sysname);
+	$self->ensureFactorPosDefined($factorName);
+	
+	#ensure we have full-corpus BLEU results
+	if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName})
+	{
+		$self->calcBLEU($sysname, $factorName);
+	}
+	if(!exists $self->{'subsetBLEUstats'}->{$sysname}) {$self->{'subsetBLEUstats'}->{$sysname} = {};}
+	if(!exists $self->{'subsetBLEUstats'}->{$sysname}->{$factorName}) {$self->{'subsetBLEUstats'}->{$sysname}->{$factorName} = [];}
+	
+	#calculate n-gram precisions for each small subset
+	my @sentenceStats = @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]};
+	for(my $i = 0; $i < $k; $i++)
+	{
+		my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+		for(my $j = $i; $j < scalar(@sentenceStats); $j += $k) #subset #K consists of every Kth sentence
+		{
+			$good1 += $sentenceStats[$j]->[0]; $tot1 += $sentenceStats[$j]->[1];
+			$good2 += $sentenceStats[$j]->[2]; $tot2 += $sentenceStats[$j]->[3];
+			$good3 += $sentenceStats[$j]->[4]; $tot3 += $sentenceStats[$j]->[5];
+			$good4 += $sentenceStats[$j]->[6]; $tot4 += $sentenceStats[$j]->[7];
+			$sysoutLength += $sentenceStats[$j]->[8];
+			$truthLength += $sentenceStats[$j]->[9];
+		}
+		push @{$self->{'subsetBLEUstats'}->{$sysname}->{$factorName}}, [$good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength];
+	}
+	my $subsetStats = $self->{'subsetBLEUstats'}->{$sysname}->{$factorName};
+	#calculate first two moments for subset scores for each n-gram precision, and t statistic
+	my $fullCorpusBLEU = $self->{'bleuScores'}->{$sysname}->{$factorName}->[0]; #an arrayref
+	my @means = (0) x 4;
+	my @devs = (0) x 4;
+	my $t = []; #t statistics for all n-gram orders
+	if(!exists $self->{'bleuConfidence'}->{$sysname}) {$self->{'bleuConfidence'}->{$sysname} = {};}
+	$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [[], []]; #lower-bound p-values for whole corpus vs. subset average; confidence intervals for all n-gram orders
+	for(my $i = 0; $i < 4; $i++) #run through n-gram orders
+	{
+		for(my $j = 0; $j < $k; $j++) #run through subsets
+		{
+			$means[$i] += $subsetStats->[$j]->[2 * $i] / $subsetStats->[$j]->[2 * $i + 1]; #matching / total n-grams
+		}
+		$means[$i] /= $k;
+		for(my $j = 0; $j < $k; $j++) #run through subsets
+		{
+			$devs[$i] += ($subsetStats->[$j]->[2 * $i] / $subsetStats->[$j]->[2 * $i + 1] - $means[$i]) ** 2;
+		}
+		$devs[$i] = sqrt($devs[$i] / ($k - 1));
+		$t->[$i] = ($fullCorpusBLEU->[$i + 1] / 100 - $means[$i]) / $devs[$i];
+		push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0]}, getLowerBoundPValue($t->[$i]); #p-value for overall score vs. subset average
+#		warn "$i: mean " . $means[$i] . ", dev " . $devs[$i] . ", t " . $t->[$i] . ", conf " . getLowerBoundPValue($t->[$i]) . "\n";
+		push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1]}, 
+							[$means[$i] - $criticalTStat * $devs[$i] / sqrt($k), $means[$i] + $criticalTStat * $devs[$i] / sqrt($k)]; #the confidence interval
+	}
+	
+	return $self->{'bleuConfidence'}->{$sysname}->{$factorName};
+}
+
+#arguments: system name, factor name
+#return: perplexity of language model (specified in a config file) wrt given system output
+sub calcPerplexity
+{
+	my ($self, $sysname, $factorName) = @_;
+	$self->ensureFilenameDefined($sysname);
+	my $sysoutFilename;
+	if($sysname eq 'truth' || $sysname eq 'input') {$sysoutFilename = $self->{"${sysname}Filename"};}
+	else {$sysoutFilename = $self->{'sysoutFilenames'}->{$sysname};}
+	my $lmFilename;
+	if($sysname eq 'input') {$lmFilename = $self->{'inputLMs'}->{$factorName};}
+	else {$lmFilename = $self->{'outputLMs'}->{$factorName};}
+	my $tmpfile = ".tmp" . time;
+	my $cmd = "perl ./extract-factors.pl $sysoutFilename " . $self->{'factorIndices'}->{$factorName} . " > $tmpfile";
+	`$cmd`; #extract just the factor we're interested in; ngram doesn't understand factored notation
+	my @output = `./ngram -lm $lmFilename -ppl $tmpfile`; #run the SRI n-gram tool
+	`rm $tmpfile`;
+	$output[1] =~ /ppl1=\s*([0-9\.]+)/;
+	return $1;
+}
+
+#run a paired t test and a sign test on BLEU statistics for subsets of both systems' outputs
+#arguments: system name 1, system name 2, factor name
+#return: arrayref of [arrayref of confidence levels for t test at which results differ, arrayref of index (0/1) of better system by t test,
+#                     arrayref of confidence levels for sign test at which results differ, arrayref of index (0/1) of better system by sign test], 
+#     where each inner arrayref has one element per n-gram order considered
+sub statisticallyCompareSystemResults
+{
+	my ($self, $sysname1, $sysname2, $factorName) = @_;
+	$self->ensureFilenameDefined($sysname1);
+	$self->ensureFilenameDefined($sysname2);
+	$self->ensureFactorPosDefined($factorName);
+	#make sure we have tallied results for both systems
+	if(!exists $self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}) {$self->statisticallyTestBLEUResults($sysname1, $factorName);}
+	if(!exists $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}) {$self->statisticallyTestBLEUResults($sysname2, $factorName);}
+	
+	if(!exists $self->{'comparisonStats'}->{$sysname1}) {$self->{'comparisonStats'}->{$sysname1} = {};}
+	if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2} = {};}
+	if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [];}
+	my ($tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices) = ([], [], [], []);
+	for(my $i = 0; $i < 4; $i++) #loop over n-gram order
+	{
+		#t-test stats
+		my ($mean, $dev) = (0, 0); #of the difference between the first and second systems' precisions
+		#sign-test stats
+		my ($nPlus, $nMinus) = (0, 0);
+		my $j;
+		for($j = 0; $j < scalar(@{$self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}}); $j++)
+		{
+			my ($stats1, $stats2) = ($self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}->[$j], $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}->[$j]);
+			my ($prec1, $prec2) = ($stats1->[2 * $i] / $stats1->[2 * $i + 1], $stats2->[2 * $i] / $stats2->[2 * $i + 1]); #n-gram precisions
+			$mean += $prec1 - $prec2;
+			if($prec1 > $prec2) {$nPlus++;} else {$nMinus++;}
+		}
+		$mean /= $j;
+		for($j = 0; $j < scalar(@{$self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}}); $j++)
+		{
+			my ($stats1, $stats2) = ($self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}->[$j], $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}->[$j]);
+			my ($prec1, $prec2) = ($stats1->[2 * $i] / $stats1->[2 * $i + 1], $stats2->[2 * $i] / $stats2->[2 * $i + 1]); #n-gram precisions
+			$dev += ($prec1 - $prec2 - $mean) ** 2;
+		}
+		$dev = sqrt($dev / (($j - 1) * $j)); #need the extra j because the variance of Xbar is 1/n the variance of X
+		#t test
+		my $t = $mean / $dev; #this isn't the standard form; remember the difference of the means is equal to the mean of the differences
+		push @$tConfidences, getUpperBoundPValue($t);
+		push @$tWinningIndices, ($mean > 0) ? 0 : 1;
+		#sign test
+		my %binomialCoefficients; #map (n+ - n-) to a coefficient; compute on the fly!
+		for(my $k = 0; $k <= $nPlus + $nMinus; $k++)
+		{
+			$binomialCoefficients{$k} = binCoeff($nPlus + $nMinus, $k);
+		}
+		my $sumCoeffs = 0;
+		foreach my $coeff (values %binomialCoefficients) #get a lower bound on the probability mass inside (n+ - n-)
+		{
+			if($coeff > $binomialCoefficients{$nPlus}) {$sumCoeffs += $coeff;}
+		}
+		push @$signConfidences, $sumCoeffs;
+		push @$signWinningIndices, ($nPlus > $nMinus) ? 0 : 1;
+	}
+	$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [$tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices];
+	return $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName};
+}
+
 #write HTML to be displayed to compare the various versions we have of each sentence in the corpus;
-#allow filtering of which versions will be displayed
+#allow to filter which versions will be displayed
 #(we don't write the whole page, just the contents of the body)
 #arguments: filehandleref to which to write, regex to filter filename extensions to be included
 #return: none
@@ -297,6 +470,15 @@ sub writeCacheFile
 
 	#store file changetimes to disk
 	print CACHEFILE "File changetimes\n";
+	my $writeCtime = sub
+	{
+		my $ext = shift;
+		print CACHEFILE $self->{'corpusName'} . ".$ext " . time . "\n";
+	};
+	if(exists $self->{'truthFilename'}) {&$writeCtime('e');}
+	if(exists $self->{'inputFilename'}) {&$writeCtime('f');}
+	foreach my $factorName (keys %{$self->{'phraseTableFilenames'}}) {&$writeCtime("pt_$factorName");}
+	foreach my $sysname (keys %{$self->{'sysoutFilenames'}}) {&$writeCtime($sysname);}
 	#store bleu scores to disk
 	print CACHEFILE "\nBLEU scores\n";
 	foreach my $sysname (keys %{$self->{'bleuScores'}})
@@ -311,11 +493,37 @@ sub writeCacheFile
 			print CACHEFILE "\n";
 		}
 	}
+	#store t statistics for overall BLEU score and subsets in k-fold cross-validation
+	print CACHEFILE "\nBLEU statistics\n";
+	foreach my $sysname (keys %{$self->{'bleuConfidence'}})
+	{
+		foreach my $factorName (keys %{$self->{'bleuConfidence'}->{$sysname}})
+		{
+			print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0]});
+			foreach my $subsetConfidence (@{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1]})
+			{
+				print CACHEFILE "; " . join(' ', @$subsetConfidence);
+			}
+			print CACHEFILE "\n";
+		}
+	}
+	#store statistics comparing system outputs
+	print CACHEFILE "\nStatistical comparisons\n";
+	foreach my $sysname1 (keys %{$self->{'comparisonStats'}})
+	{
+		foreach my $sysname2 (keys %{$self->{'comparisonStats'}->{$sysname1}})
+		{
+			foreach my $factorName (keys %{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}})
+			{
+				print CACHEFILE "$sysname1 $sysname2 $factorName " . join('; ', @{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}}) . "\n";
+			}
+		}
+	}
 	#store unknown-token counts to disk
 	print CACHEFILE "\nUnknown-token counts\n";
 	foreach my $factorName (keys %{$self->{'unknownCount'}})
 	{
-		print CACHEFILE $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'totalTokens'} . "\n";
+		print CACHEFILE $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'tokenCount'}->{'input'} . "\n";
 	}
 	#store WER, PWER to disk
 	print CACHEFILE "\nWER scores\n";
@@ -340,7 +548,7 @@ sub writeCacheFile
 	&$printWERFunc('sysoutWER');
 	&$printWERFunc('sysoutPWER');
 	#store misc scores to disk
-	print CACHEFILE "\nMisc scores\n";
+	print CACHEFILE "\n";
 	close(CACHEFILE);
 }
 
@@ -363,25 +571,50 @@ sub loadCacheFile
 		#check for start of section
 		if($line eq "File changetimes\n") {$mode = 'ctime';}
 		elsif($line eq "BLEU scores\n") {$mode = 'bleu';}
+		elsif($line eq "BLEU statistics\n") {$mode = 'bstats';}
+		elsif($line eq "Statistical comparisons\n") {$mode = 'cmp';}
 		elsif($line eq "Unknown-token counts\n") {$mode = 'unk';}
 		elsif($line eq "WER scores") {$mode = 'wer';}
-		elsif($line eq "Misc scores") {$mode = 'misc';}
 		#get data when in a mode already
 		elsif($mode eq 'ctime')
 		{
+			local ($fileExtension, $ctime) = split(/\s+/, $line);
+			$self->{'fileCtimes'}->{$fileExtension} = $ctime;
 		}
 		elsif($mode eq 'bleu')
 		{
 			local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
-			if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};}
-			if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
+			if(!$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e')) {next;}
+			if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};}
+			if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
 			my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest);
 			$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = shift @stats;
 			$self->{'bleuScores'}->{$sysname}->{$factorName}->[1] = \@stats;
 		}
+		elsif($mode eq 'bstats')
+		{
+			local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
+			if(!$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e')) {next;}
+			if(!exists $self->{'bleuConfidence'}->{$sysname}) {$self->{'bleuConfidence'}->{$sysname} = {};}
+			if(!exists $self->{'bleuConfidence'}->{$sysname}->{$factorName}) {$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [[], []];}
+			my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest);
+			$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0] = shift @stats;
+			$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1] = \@stats;
+		}
+		elsif($mode eq 'cmp')
+		{
+			local ($sysname1, $sysname2, $factorName, $rest) = split(/\s+/, $line, 4);
+			if(!$self->cacheIsCurrentForFile($sysname1) || !$self->cacheIsCurrentForFile($sysname2) || !$self->cacheIsCurrentForFile('e')) {next;}
+			if(!exists $self->{'comparisonStats'}->{$sysname1}) {$self->{'comparisonStats'}->{$sysname1} = {};}
+			if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2} = {};}
+			if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [];}
+			my @stats = split(/;/, $rest);
+			$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = \@stats;
+		}
 		elsif($mode eq 'unk')
 		{
 			local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line);
+			if(!$self->cacheIsCurrentForFile('f') || !$self->cacheIsCurrentForFile("pt_$factorName")) {next;}
 			if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename)
 			{
 				$self->{'unknownCount'}->{$factorName} = $unknownCount;
@@ -391,23 +624,44 @@ sub loadCacheFile
 		elsif($mode eq 'wer')
 		{
 			local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER'
+			if(!$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e')) {next;}
 			$details =~ /^([^;]*);(.*)/;
 			my @sentenceWERs = split(/\s+/, $1);
-			if(!defined($self->{$werType}->{$sysname})) {$self->{$werType}->{$sysname} = {};}
+			if(!exists $self->{$werType}->{$sysname}) {$self->{$werType}->{$sysname} = {};}
 			$self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, []];
 			my @indexLists = split(/;/, $2);
-			sleep 6;
 			for(my $i = 0; $i < scalar(@sentenceWERs); $i++)
 			{
-				my @indices = split(/\s+/, $indexLists[$i]);
+				my @indices = grep(/\S/, split(/\s+/, $indexLists[$i])); #find all nonempty tokens
 				$self->{$werType}->{$sysname}->{$factorName}->[2] = \@indices;
 			}
 		}
-		elsif($mode eq 'misc')
+	}
+	close(CACHEFILE);
+}
+
+#arguments: cache type ('bleu' | ...), system name, factor name
+#return: none
+sub flushCache
+{
+	my ($self, $cacheType, $sysname, $factorName) = @_;
+	if($cacheType eq 'bleu')
+	{
+		if(defined($self->{'bleuScores'}->{$sysname}) && defined($self->{'bleuScores'}->{$sysname}->{$factorName}))
 		{
+			delete $self->{'bleuScores'}->{$sysname}->{$factorName};
 		}
 	}
-	close(CACHEFILE);
+}
+
+#arguments: file extension
+#return: whether (0/1) our cache for the given file is at least as recent as the file
+sub cacheIsCurrentForFile
+{
+	my ($self, $ext) = @_;
+	return 0 if(!exists $self->{'fileCtimes'}->{$ext});
+	my @liveStats = stat($self->{'corpusName'} . ".$ext");
+	return ($liveStats[9] <= $self->{'fileCtimes'}->{$ext}) ? 1 : 0;
 }
 
 ##### utils #####
@@ -429,6 +683,81 @@ sub my_log
   return -9999999999 unless $_[0];
   return log($_[0]);
 }
+#arguments: x
+sub round
+{
+	my $x = shift;
+	if($x - int($x) < .5) {return int($x);}
+	return int($x) + 1;
+}
+
+#return an approximation of the p-value for a given t FOR A HARDCODED NUMBER OF DEGREES OF FREEDOM
+# (IF YOU CHANGE THIS HARDCODED NUMBER YOU MUST ALSO CHANGE statisticallyTestBLEUResults() and getLowerBoundPValue() )
+#arguments: the t statistic, $t
+#return: a lower bound on the probability mass outside (beyond) +/-$t in the t distribution
+#
+#for a wonderful t-distribution calculator, see <http://math.uc.edu/~brycw/classes/148/tables.htm#t>. UC.edu is Cincinnati.
+sub getLowerBoundPValue
+{
+	my $t = abs(shift);
+	#encode various known p-values for ###### DOF = 29 ######
+	my %t2p = #since we're comparing (hopefully) very similar values, this chart is weighted toward the low end of the t-stat
+	(
+		0.0063 => .995,
+		0.0126 => .99,
+		0.0253 => .98,
+		0.0380 => .97,
+		0.0506 => .96,
+		0.0633 => .95,
+		0.0950 => .925,
+		0.127  => .9,
+		0.191  => .85,
+		0.256  => .8,
+		0.389  => .7,
+		0.530  => .6,
+		0.683  => .5,
+		0.854  => .4,
+		1.055  => .3,
+		1.311  => .2, 
+		1.699  => .1
+	);
+	foreach my $tCmp (sort keys %t2p) {return $t2p{$tCmp} if $t <= $tCmp;}
+	return 0; #loosest bound ever! groovy, man
+}
+#arguments: the t statistic, $t
+#return: an upper bound on the probability mass outside (beyond) +/-$t in the t distribution
+sub getUpperBoundPValue
+{
+	my $t = abs(shift);
+	#encode various known p-values for ###### DOF = 29 ######
+	my %t2p = 
+	(
+		4.506 => .0001,
+		4.254 => .0002,
+		3.918 => .0005,
+		3.659 => .001,
+		3.396 => .002,
+		3.038 => .005,
+		2.756 => .01,
+		2.462 => .02,
+		2.045 => .05,
+		1.699 => .1,
+		1.311 => .2,
+		0.683 => .5
+	);
+	foreach my $tCmp (reverse sort keys %t2p) {return $t2p{$tCmp} if $t >= $tCmp;}
+	return 1; #loosest bound ever!
+}
+
+#arguments: n, r
+#return: binomial coefficient for p = .5 (ie nCr * (1/2)^n)
+sub binCoeff
+{
+	my ($n, $r) = @_;
+	my $coeff = 1;
+	for(my $i = $r + 1; $i <= $n; $i++) {$coeff *= $i; $coeff /= ($i - $r);}
+	return $coeff * (.5 ** $n);
+}
 
 #throw if the given factor doesn't have an index defined
 #arguments: factor name
@@ -501,10 +830,6 @@ sub locateFiles
 		}
 	}
 	close(DIR);
-#	warn "truth is set to '" . $self->{'truthFilename'} . "'\n";
-#	warn "input is set to '" . $self->{'inputFilename'} . "'\n";
-#	warn "sysouts set are '" . join("', '", keys %{$self->{'sysoutFilenames'}}) . "'\n";
-#	warn "phrase tables set are '" . join("', '", keys %{$self->{'phraseTableFilenames'}}) . "'\n";
 }
 
 #arguments: type ('truth' | 'input' | a string to represent a system output), filename
@@ -513,11 +838,16 @@ sub locateFiles
 sub loadSentences
 {
 	my ($self, $sysname, $filename) = @_;
+	#if the sentences are already loaded, leave them be
+	if(exists $self->{$sysname} && scalar(@{$self->{$sysname}}) > 0) {return;}
+	
 	$self->{$sysname} = [];
+	$self->{'tokenCount'}->{$sysname} = 0;
 	open(INFILE, "<$filename") or die "Corpus::load(): couldn't open '$filename' for read\n";
 	while(my $line = <INFILE>)
 	{
 		my @words = split(/\s+/, $line);
+		$self->{'tokenCount'}->{$sysname} += scalar(@words);
 		my $refFactors = [];
 		foreach my $word (@words)
 		{
@@ -526,7 +856,7 @@ sub loadSentences
 		}
 		push @{$self->{$sysname}}, $refFactors;
 	}
-	close(INFILE);
+	close(INFILE);	
 }
 
 #free the memory used for the given corpus (but NOT any associated calculations, eg WER)
@@ -534,8 +864,8 @@ sub loadSentences
 #return: none
 sub releaseSentences
 {
-	my ($self, $sysname) = @_;
-	$self->{$sysname} = [];
+#	my ($self, $sysname) = @_;
+#	$self->{$sysname} = [];
 }
 
 #arguments: factor name
@@ -613,19 +943,80 @@ sub corpusWER
 #return: wer score, arrayref of arrayrefs of indices of errorful words
 sub sentenceWER
 {
+	#constants: direction we came through the table
+	my ($DIR_NONE, $DIR_SKIPTRUTH, $DIR_SKIPOUT, $DIR_SKIPBOTH) = (-1, 0, 1, 2); #values don't matter but must be unique
 	my ($self, $refSysOutput, $refTruth, $index) = @_;
 	my ($totWER, $indices) = (0, []);
 	my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));
-	for(my $j = 0; $j < min($sLength, $eLength); $j++)
+	if($sLength == 0 || $eLength == 0) {return ($totWER, $indices);} #special case
+	
+	my @refWordsMatchIndices = (-1) x $eLength; #at what sysout-word index this truth word is first matched
+	my @sysoutWordsMatchIndices = (-1) x $sLength; #at what truth-word index this sysout word is first matched
+	my $table = []; #index by sysout word index, then truth word index; a cell holds max count of matching words and direction we came to get it
+	#dynamic-programming time: find the path through the table with the maximum number of matching words
+	for(my $i = 0; $i < $sLength; $i++)
 	{
-		if(lc $refSysOutput->[$j]->[$index] ne lc $refTruth->[$j]->[$index]) #check output word against truth word in same position
+		push @$table, [];
+		for(my $j = 0; $j < $eLength; $j++)
 		{
-			$totWER++;
-			push @$indices, $j;
+			my ($maxPrev, $prevDir) = (0, $DIR_NONE);
+			if($i > 0 && $table->[$i - 1]->[$j]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j]->[0]; $prevDir = $DIR_SKIPOUT;}
+			if($j > 0 && $table->[$i]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i]->[$j - 1]->[0]; $prevDir = $DIR_SKIPTRUTH;}
+			if($i > 0 && $j > 0 && $table->[$i - 1]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j - 1]->[0]; $prevDir = $DIR_SKIPBOTH;}
+			my $match = ($refSysOutput->[$i]->[$index] eq $refTruth->[$j]->[$index] && $refWordsMatchIndices[$j] == -1 && $sysoutWordsMatchIndices[$i] == -1) ? 1 : 0;
+			if($match == 1) {$refWordsMatchIndices[$j] = $i; $sysoutWordsMatchIndices[$i] = $j;}
+			push @{$table->[$i]}, [($match ? $maxPrev + 1 : $maxPrev), $prevDir];
 		}
 	}
-	$totWER += max(0, $sLength - $eLength);
-	return ($totWER, $indices);
+	
+	#look back along the path and get indices of non-matching words
+	my @unusedSysout = (0) x $sLength; #whether each sysout word was matched--used for outputting html table
+	my ($i, $j) = ($sLength - 1, $eLength - 1);
+	while($i > 0) #work our way back to the first sysout word
+	{
+		push @{$table->[$i]->[$j]}, 0; #length is flag to highlight cell
+		if($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH)
+		{
+			$j--;
+		}
+		elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT)
+		{
+			if($table->[$i - 1]->[$j]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;}
+			$i--;
+		}
+		elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH)
+		{
+			if($table->[$i - 1]->[$j - 1]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;}
+			$i--; $j--;
+		}
+	}
+	#we're at the first sysout word; finish up checking for matches
+	while($j > 0 && $refWordsMatchIndices[$j] != 0) {push @{$table->[0]->[$j]}, 0; $j--;}
+	if($j == 0 && $refWordsMatchIndices[0] != 0) {unshift @$indices, 0; $unusedSysout[0] = 1;} #no truth word was matched to the first sysout word
+	
+	#print some HTML to debug the WER algorithm
+#	print "<table border=1><tr><td></td><td>" . join("</td><td>", map {() . $_->[$index]} @$refTruth) . "</td></tr>";
+#	for(my $i = 0; $i < $sLength; $i++)
+#	{
+#		print "<tr><td" . (($unusedSysout[$i] == 1) ? " style=\"background-color: #ffdd88\">" : ">") . $refSysOutput->[$i]->[$index] . "</td>";
+#		for(my $j = 0; $j < $eLength; $j++)
+#		{
+#			print "<td";
+#			if(scalar(@{$table->[$i]->[$j]}) > 2) {print " style=\"color: yellow; background-color: #000080\"";}
+#			my $arrow;
+#			if($table->[$i]->[$j]->[1] == $DIR_NONE) {$arrow = "&times;";}
+#			elsif($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH) {$arrow = "&larr;";}
+#			elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT) {$arrow = "&uarr;";}
+#			elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH) {$arrow = "&loz;";}
+#			print ">" . $table->[$i]->[$j]->[0] . "  " . $arrow . "</td>";
+#		}
+#		print "</tr>";
+#	}
+#	print "</table>";
+	
+	my $matchCount = 0;
+	if($sLength > 0) {$matchCount = $table->[$sLength - 1]->[$eLength - 1]->[0];}
+	return ($sLength - $matchCount, $indices);
 }
 
 #arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use
diff --git a/scripts/analysis/smtgui/file-descriptions b/scripts/analysis/smtgui/file-descriptions
index 97ac6f31a..caf1507e6 100644
--- a/scripts/analysis/smtgui/file-descriptions
+++ b/scripts/analysis/smtgui/file-descriptions
@@ -1 +1,3 @@
-devtest2006.de-en.top100.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run
+devtest2006.de-en.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run
+devtest2006.de-en.matrix05-baseline.moses-2006-07-20 Moses baseline run
+devtest2006.en-de.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run
diff --git a/scripts/analysis/smtgui/file-factors b/scripts/analysis/smtgui/file-factors
index a1128eb2a..fad031ec4 100644
--- a/scripts/analysis/smtgui/file-factors
+++ b/scripts/analysis/smtgui/file-factors
@@ -1,6 +1,6 @@
-#corpus name, factors given (/\s+/-delimited)
-#(the given factors should be present in all target-language files for the given corpus)
-devtest2006.de-en					surf pos lemma
-devtest2006.de-en.top100		surf pos lemma
+#corpus name : list of factors in corpus : [input] factor LMfilename, factor LMfilename, ... : [output] factor LMfilename, factor LMfilename, ...
+#(the given factors should be present in all files for the given corpus)
+devtest2006.de-en	: surf pos lemma : surf europarl.de.srilm.gz : surf europarl.en.srilm.gz
+devtest2006.en-de	: surf pos lemma : surf europarl.en.srilm.gz : surf europarl.de.srilm.gz
 #pstem: lemmas come from the Porter stemmer (and so are really a mix of stems and lemmas)
-pstem_devtest2006.de-en			surf pos lemma
+pstem_devtest2006.de-en	: surf pos lemma : : surf europarl.en.srilm.gz
diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi
index 0e969a788..a31ac558e 100755
--- a/scripts/analysis/smtgui/newsmtgui.cgi
+++ b/scripts/analysis/smtgui/newsmtgui.cgi
@@ -22,7 +22,7 @@ my $FOREIGN = 'f';
 
 #FILEDESC: textual descriptions associated with specific filenames; to be displayed on the single-corpus view
 my %FILEDESC = (); &load_descriptions();
-my %factorIndices = loadFactorIndices('file-factors');
+my %factorData = loadFactorData('file-factors'); 
 my %MEMORY;        &load_memory();
 my (@mBLEU,@NIST);
 @mBLEU=`cat mbleu-memory.dat` if -e "mbleu-memory.dat"; chop(@mBLEU);
@@ -61,7 +61,7 @@ sub show_corpora {
   my %CORPUS = ();
   
   # find corpora in evaluation directory: see the factor-index file, which was already read in
-  foreach my $corpusName (keys %factorIndices)
+  foreach my $corpusName (keys %factorData)
   {
   	$CORPUS{$corpusName} = 1;
   }
@@ -82,7 +82,7 @@ sub view_corpus {
   &htmlhead("View Corpus $in{CORPUS}");
   
   # find corpora in evaluation directory
-  my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => $factorIndices{$in{CORPUS}});
+  my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-info_line' => $factorData{$in{CORPUS}});
   
   my ($sentence_count, $lineInfo);
   if(-e "$in{CORPUS}.f")
@@ -105,8 +105,8 @@ sub view_corpus {
   print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=COMPARE>\n";
   print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
   print "<TABLE BORDER=1 CELLSPACING=0><TR>
-<TD>Filename (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS}).">sort</A>)</TD>
-<TD>Date     (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=TIME>sort</A>)</TD>";
+<TD>File (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS}).">sort</A>)</TD>
+<TD>Date (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=TIME>sort</A>)</TD>";
   if (-e "$in{CORPUS}.e") {
     print "<TD>IBM BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=IBM>sort</A>)</TD>";
   }
@@ -120,9 +120,11 @@ sub view_corpus {
     print "<TD>mBLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=mBLEU>sort</A>)</TD>";
   }
   print "<TD>Unknown Words</TD>"; #can't sort on; only applies to the input
+  print "<TD>Perplexity</TD>"; #applies to truth and system outputs
+  print "<TD>WER (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=WER>sort</A>)</TD>";
   print "<TD>Noun & adj WER-PWER</TD>"; #can't sort on; only applies to sysoutputs
   print "<TD>Surface vs. lemma PWER</TD>"; #can't sort on; only applies to sysoutputs
-  print "<TD>Score (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=SCORE>sort</A>)</TD><TD>Actions</TD></TR>";
+	print "<TD>Statistical Measures</TD>";
 
   open(DIR,"ls $in{CORPUS}.*|");
   while(<DIR>) {
@@ -143,7 +145,7 @@ sub view_corpus {
     /^$in{CORPUS}.([^\/]+)$/;
     my $file = $1;
     # checkbox for compare
-    my $row = "<TR><TD><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>";
+    my $row = "<TR><TD style=\"font-size: small\"><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>";
     # README
     if (-e "$in{CORPUS}.$file.README") {
       my $readme = `cat $in{CORPUS}.$file.README`;
@@ -153,7 +155,7 @@ sub view_corpus {
       $row .= "<A HREF='javascript:FieldInfo(\"$in{CORPUS}.$file\",\"$readme\")'>";
     }
     # filename
-    $row .= "$in{CORPUS}.$file</A>";
+    $row .= "$file</A>";
     # description (hard-coded)
     my @TRANSLATION_SENTENCE = `cat $in{CORPUS}.$file`; 
     chop(@TRANSLATION_SENTENCE);
@@ -223,7 +225,8 @@ sub view_corpus {
       $row .= "</TD>\n";
     }
 	 
-	 # misc stats
+	 my $isSystemOutput = ($file ne 'e' && $file ne 'f' && $file !~ /^pt/);
+	 # misc stats (note the unknown words should come first so the total word count is available for WER)
 	 $row .= "<TD align=\"center\">";
 	 if($file eq 'f') #input
 	 {
@@ -235,18 +238,38 @@ sub view_corpus {
 		catch Error::Simple with {$row .= "[system error]";};
 	 }
 	 $row .= "</TD>\n<TD align=\"center\">";
+	 if($file eq 'e' || $file eq 'f' || $isSystemOutput)
+	 {
+	 	try
+		{
+			my $perplexity = $corpus->calcPerplexity(($file eq 'e') ? 'truth' : (($file eq 'f') ? 'input' : $file), 'surf');
+			$row .= sprintf("%.2lf", $perplexity);
+		}
+		catch Error::Simple with {$row .= "[system error]";}
+	 }
+	 $row .= "</TD>\n<TD align=\"center\">";
+	 if($isSystemOutput)
+	 {
+	 	try
+		{
+			my $surfaceWER = $corpus->calcOverallWER($file);
+			$row .= sprintf("%.4lf", $surfaceWER);
+		}
+		catch Error::Simple with {$row .= "[system error]";};
+	 }
+	 $row .= "</TD>\n<TD align=\"center\">";
 	 my ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER);
-	 if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output
+	 if($isSystemOutput)
 	 {
 		try
 		{
 			($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER) = calc_misc_stats($corpus, $file);
-			$row .= "WER = $nnAdjWER<br>PWER = $nnAdjPWER<br><b>diff = " . ($nnAdjWER - $nnAdjPWER) . "</b>";
+			$row .= sprintf("WER = %.4lg<br>PWER = %.4lg<br><b>ratio = %.3lf</b>", $nnAdjWER, $nnAdjPWER, $nnAdjPWER / $nnAdjWER);
 		}
 		catch Error::Simple with {$row .= "[system error]";};
 	}
 	$row .= "</TD>\n<TD align=\"center\">";
-	if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output
+	if($isSystemOutput)
 	{
 		if($surfPWER == -1)
 		{
@@ -255,18 +278,32 @@ sub view_corpus {
 		else
 		{
 			my ($lemmaBLEU, $p1, $p2, $p3, $p4, $brevity) = $corpus->calcBLEU($file, 'lemma');
-			$row .= sprintf("surface = %d<br>lemma = %d<br><b>lemma BLEU = %.04f</b> %.01f/%.01f/%.01f/%.01f *%.03f", 
+			$row .= sprintf("surface = %.3lf<br>lemma = %.3lf<br><b>lemma BLEU = %.04f</b> %.01f/%.01f/%.01f/%.01f *%.03f", 
 									$surfPWER, $lemmaPWER, $lemmaBLEU, $p1, $p2, $p3, $p4, $brevity);
 		}
 	}
+	$row .= "</TD>\n<TD align=\"center\">";
+	if($isSystemOutput)
+	{
+		try
+		{
+			my $testInfo = $corpus->statisticallyTestBLEUResults($file, 'surf');
+			my @tTestPValues = @{$testInfo->[0]};
+			my @confidenceIntervals = @{$testInfo->[1]};
+			$row .= "n-gram precision p-values (high p <=> consistent score):<br>t test " . join("/", map {sprintf("%.4lf", $_)} @tTestPValues);
+			$row .= "<p>n-gram precision 95% intervals:<br>" . join(",<br>", map {sprintf("[%.4lf - %.4lf]", $_->[0], $_->[1])} @confidenceIntervals);
+			my @bleuInterval = (approxBLEUFromNgramScores(map {$_->[0]} @confidenceIntervals), approxBLEUFromNgramScores(map {$_->[1]} @confidenceIntervals));
+			$row .= sprintf("<br><b>(BLEU: ~[%.4lf - %.4lf])</b>", $bleuInterval[0], $bleuInterval[1]);
+		}
+		catch Error::Simple with {$row .= "[system error]";}
+	}
 	$row .= "</TD>\n";
 
     # correct sentence score
     my($correct,$wrong,$unknown);
     $row .= "<TD>";
     if (!defined($DONTSCORE{$file}) && (scalar keys %MEMORY)) {
-      my ($correct,$just_syn,$just_sem,$wrong,$unknown) = 
-	&get_score_from_memory("$in{CORPUS}.$FOREIGN",
+      my ($correct,$just_syn,$just_sem,$wrong,$unknown) = &get_score_from_memory("$in{CORPUS}.$FOREIGN",
 			       "$in{CORPUS}.$file");
       $row .= "<B><FONT COLOR=GREEN>$correct</FONT></B>";
       $row .= "/<FONT COLOR=ORANGE>$just_syn</FONT>";
@@ -281,12 +318,7 @@ sub view_corpus {
 	 	$row .= "</TD>\n";
 	}
 
-    # score / review links
-    $row .= "<TD>";
-    $row .= "<A HREF=\"?ACTION=SCORE_FILE&VIEW=1&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">view</A>" if (!defined($DONTSCORE{$file}));
-    $row .= " <A HREF=\"?ACTION=SCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">score</A>" if (!defined($DONTSCORE{$file}) && ($unknown || scalar keys %MEMORY == 0));
-    $row .= " <A HREF=\"?ACTION=RESCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">review</A>" if (!defined($DONTSCORE{$file}) && scalar keys %MEMORY);
-    $row .= "</TD></TR>\n"; 
+    $row .= "</TR>\n"; 
     push @TABLE, "<!-- $sort -->\n$row";
   }
   close(DIR);
@@ -295,7 +327,31 @@ sub view_corpus {
   print "<INPUT TYPE=SUBMIT VALUE=\"Compare\">\n";
   print "<INPUT TYPE=CHECKBOX NAME=SURFACE VALUE=1 CHECKED> Compare all different sentences (instead of just differently <I>evaluated</I> sentences) <INPUT TYPE=CHECKBOX NAME=WITH_EVAL VALUE=1 CHECKED> with evaluation</FORM><P>\n";
   print "<P>The score is to be read as: <FONT COLOR=GREEN>correct</FONT>/<FONT COLOR=ORANGE>just-syn-correct</FONT>/<FONT COLOR=ORANGE>just-sem-correct</FONT>/<FONT COLOR=RED>wrong</FONT> (unscored)\n";
-  print "<BR>IBM BLEU is to be read as: <B>metric</B> unigram/bigram/trigram/quadgram *brevity-penalty\n"; 
+  print "<BR>IBM BLEU is to be read as: <B>metric</B> unigram/bigram/trigram/quadgram *brevity-penalty<P>";
+  print "<DIV STYLE=\"border: 1px solid #006600\">";
+  print "<H2>Comparison of System Translations (p-values)</H2>";
+  my @sysnames = $corpus->getSystemNames();
+  for(my $i = 0; $i < scalar(@sysnames); $i++)
+  {
+  	for(my $j = $i + 1; $j < scalar(@sysnames); $j++)
+	{
+		my $comparison = $corpus->statisticallyCompareSystemResults($sysnames[$i], $sysnames[$j], 'surf');
+		print "<P><FONT COLOR=#00aa22>" . $sysnames[$i] . " vs. " . $sysnames[$j] . "</FONT>: [<I>t</I> test] ";
+		for(my $k = 0; $k < scalar(@{$comparison->[0]}); $k++)
+		{
+			print sprintf(($k == 0) ? "%.4lg" : "; %.4lg ", $comparison->[0]->[$k]);
+			if($comparison->[1]->[$k] == 0) {print "(&larr;)";} else {print "(&rarr;)";}
+		}
+		print "&nbsp;&nbsp;---&nbsp;&nbsp;[sign test] ";
+		for(my $k = 0; $k < scalar(@{$comparison->[2]}); $k++)
+		{
+			print sprintf(($k == 0) ? "%.4lg " : "; %.4lg ", $comparison->[2]->[$k]);
+			if($comparison->[3]->[$k] == 0) {print "(&larr;)";} else {print "(&rarr;)";}
+		}
+		print "\n";
+	}
+  }
+  print "</DIV\n";
   print "<P><A HREF=\"newsmtgui.cgi?action=\">All corpora</A>\n";
 }
 
@@ -496,6 +552,16 @@ sub calc_misc_stats
 	};
 }
 
+#approximate BLEU score from n-gram precisions (currently assume no length penalty)
+#arguments: n-gram precisions as an array
+#return: BLEU score
+sub approxBLEUFromNgramScores
+{
+	my $logsum = 0;
+	foreach my $p (@_) {$logsum += log($p);}
+	return exp($logsum / scalar(@_));
+}
+
 ###### NIST SCORE
 
 sub get_nist_score {
@@ -669,7 +735,7 @@ sub compare2
 	print "<FORM ACTION=\"\" METHOD=POST>\n";
 	print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n";
 	print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
-	my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => \%factorIndices);
+	my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-info_line' => $factorData{$in{CORPUS}});
 	$corpus->writeComparisonPage(\*STDOUT, /^.*$/);
 	print "</FORM>\n";
 }
@@ -826,7 +892,6 @@ sub trim {
   $$translation =~ s/ +/ /g;
   $$translation =~ s/^ +//;
   $$translation =~ s/ +$//;
-#  $$translation =~ s/ +[\.]$//;
 }
 
 sub load_descriptions {
@@ -838,24 +903,20 @@ sub load_descriptions {
   close(FD);
 }
 
-#read config file giving names of corpi and fill factor-index map
+#read config file giving various corpus config info
 #arguments: filename to read
-#return: hash of corpus names to hashrefs of factor names to indices
-sub loadFactorIndices
+#return: hash of corpus names to strings containing formatted info
+sub loadFactorData
 {
 	my $filename = shift;
 	my %data = ();
-	open(INFILE, "<$filename") or die "loadFactorIndices(): couldn't open '$filename' for read\n";
+	open(INFILE, "<$filename") or die "loadFactorData(): couldn't open '$filename' for read\n";
 	while(my $line = <INFILE>)
 	{
 		if($line =~ /^\#/) {next;} #skip comment lines
-		my @tokens = split(/\s+/, $line);
-		my $corpusName = shift @tokens;
-		$data{$corpusName} = {};
-		for(my $i = 0; $i < scalar(@tokens); $i++)
-		{
-			$data{$corpusName}->{$tokens[$i]} = $i;
-		}
+		$line =~ /^\s*(\S+)\s*:\s*(\S.*\S)\s*$/;
+		my $corpusName = $1;
+		$data{$corpusName} = $2;
 	}
 	close(INFILE);
 	return %data;
author	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-07 21:22:39 +0400
committer	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-07 21:22:39 +0400
commit	8420ecf516204528e0de956f392c24d38d6d86cb (patch)
tree	2905b3fb3a41afba33fd9123bad16869a00f470b /scripts/analysis
parent	ac34c07dc003957393804a39e4da58491df742ab (diff)