add CGI-based tool for calculating and displaying various error measures

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@427 1f5c12ca-751b-0410-a591-d2e778427230
author: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-01 02:05:11 +0400
committer: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-01 02:05:11 +0400
commit: 5cce8336c076371dfa30c5cd37c29bebb56a558a (patch)
tree: b13af443c55c50398385b21fecb4b3e5f17c8eb1 /scripts/analysis
parent: dfe989fbd22f76045301693edfb84a36371dc5d9 (diff)
7 files changed, 2663 insertions, 0 deletions
diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm
new file mode 100644
index 000000000..f191ce273
--- /dev/null
+++ b/scripts/analysis/smtgui/Corpus.pm
@@ -0,0 +1,867 @@
+#package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole
+#Evan Herbst, 7 / 25 / 06
+
+package Corpus;
+BEGIN
+{
+	push @INC, ".";
+}
+use Error;
+
+return 1;
+
+###########################################################################################################################
+
+#'our' variables are available outside the package
+our @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph');
+
+#constructor
+#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), hashref of factor names to indices in this corpus (-indices)
+sub new
+{
+	my $class = shift;
+	my %args = @_; #turn the remainder of @_ into a hash
+	my ($corpusName, $refFileDescs, $factorIndices) = ($args{'-name'}, $args{'-descriptions'}, $args{'-indices'});
+	my $self = {};
+	$self->{'corpusName'} = $corpusName;
+	$self->{'truth'} = []; #arrayref of arrayrefs of factors
+	$self->{'input'} = []; #same; also same for any system outputs that get loaded
+	$self->{'truthFilename'} = "";
+	$self->{'inputFilename'} = "";
+	$self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename)
+	$self->{'phraseTableFilenames'} = {}; #factor name (from @FACTORNAMES) => filename
+	$self->{'factorIndices'} = {}; #factor name => index; names ought to be strings from the standard list in @FACTORNAMES
+	%{$self->{'factorIndices'}} = %$factorIndices;
+	$self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting
+	$self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input
+	$self->{'totalTokens'} = 0; #useful with counts of unknown tokens
+	$self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth)
+	$self->{'sysoutPWER'} = {}; #similarly
+	$self->{'fileDescriptions'} = {}; #filename associated with us => string description of file
+	$self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) )
+	$self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here
+	bless $self, $class;
+	$self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions
+	$self->loadCacheFile();
+	return $self;
+}
+
+#arguments: filename
+#return: description string
+#throw if filename doesn't belong to this corpus
+sub getFileDescription
+{
+	my ($self, $filename) = @_;
+	if(!defined($self->{'fileDescriptions'}->{$filename}))
+	{
+		throw Error::Simple(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n");
+	}
+	return $self->{'fileDescriptions'}->{$filename};
+}
+
+#arguments: hashref of factor names to indices (not all factors in @FACTORNAMES must be included, but no strings not from the list should be)
+#return: none
+sub setFactorIndices
+{
+	my ($self, $refIndices) = @_;
+	%{$self->{'factorIndices'}} = %{$refIndices};
+}
+
+#calculate the number of unknown factor values for the given factor in the input file
+#arguments: factor name
+#return: unknown factor count, total factor count (note the total doesn't depend on the factor)
+#throw if we don't have an input file or a phrase table for the given factor defined or if there's no index known for the given factor
+sub calcUnknownTokens
+{
+	my ($self, $factorName) = @_;
+	#check in-memory cache first
+	if(defined($self->{'unknownCount'}->{$factorName}))
+	{
+		return ($self->{'unknownCount'}->{$factorName}, $self->{'totalTokens'});
+	}
+	
+	$self->ensureFilenameDefined('input');
+	$self->ensurePhraseTableDefined($factorName);
+	$self->ensureFactorPosDefined($factorName);
+	$self->loadSentences('input', $self->{'inputFilename'});
+	$self->loadPhraseTable($factorName);
+
+	#count unknown and total words
+	my ($unknownTokens, $totalTokens) = (0, 0);
+	my $factorIndex = $self->{'factorIndices'}->{$factorName};
+	foreach my $sentence (@{$self->{'input'}})
+	{
+		$totalTokens += scalar(@$sentence);
+		foreach my $word (@$sentence)
+		{
+			if(!defined($self->{'phraseTables'}->{$factorName}->{$word->[$factorIndex]}))
+			{
+				$unknownTokens++;
+			}
+		}
+	}
+	$self->{'unknownCount'}->{$factorName} = $unknownTokens;
+	$self->{'totalTokens'} = $totalTokens;
+	
+	return ($unknownTokens, $totalTokens);
+}
+
+#arguments: system name
+#return: (WER, PWER) for nouns and adjectives in given system wrt truth
+#throw if given system or truth is not set or if index of 'surf' or 'pos' hasn't been specified
+sub calcNounAdjWER_PWERDiff
+{
+	my ($self, $sysname) = @_;
+	$self->ensureFilenameDefined('truth');
+	$self->ensureFilenameDefined($sysname);
+	$self->ensureFactorPosDefined('surf');
+	$self->ensureFactorPosDefined('pos');
+	$self->loadSentences('truth', $self->{'truthFilename'});
+	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
+	#find nouns and adjectives and score them
+	my ($werScore, $pwerScore) = (0, 0);
+	my $nnNadjTags = $self->getPOSTagList('nounAndAdj');
+	for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)
+	{
+#		warn "truth " . join(', ', map {join('|', @$_)} @{$self->{'truth'}->[$i]}) . "\n";
+#		warn "sysout " . join(', ', map {join('|', @$_)} @{$self->{$sysname}->[$i]}) . "\n";
+		my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
+		my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
+#		warn "filtered truth: " . join(' ', map {join('|', @$_)} @nnAdjEWords) . "\n";
+#		warn "filtered sysout: " . join(' ', map {join('|', @$_)} @nnAdjSWords) . "\n\n\n";
+		my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
+		$werScore += $sentWer;
+		($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
+		$pwerScore += $sentWer;
+	}
+	
+	#unhog memory
+	$self->releaseSentences('truth');
+	$self->releaseSentences($sysname);
+	return ($werScore, $pwerScore);
+}
+
+#calculate detailed WER statistics and put them into $self
+#arguments: system name, factor name to consider (default 'surf', surface form)
+#return: overall surface WER for given system (w/o filtering)
+#throw if given system or truth is not set or if index of factor name hasn't been specified
+sub calcOverallWER
+{
+	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
+	if(scalar(@_) > 0) {$factorName = shift;}
+	#check in-memory cache first
+	if(defined($self->{'sysoutWER'}->{$sysname}->{$factorName}))
+	{
+		return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0];
+	}
+	
+	$self->ensureFilenameDefined('truth');
+	$self->ensureFilenameDefined($sysname);
+	$self->ensureFactorPosDefined($factorName);
+	$self->loadSentences('truth', $self->{'truthFilename'});
+	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
+	
+	my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
+	$self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence
+	
+	#unhog memory
+	$self->releaseSentences('truth');
+	$self->releaseSentences($sysname);
+	return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0];
+}
+
+#calculate detailed PWER statistics and put them into $self
+#arguments: system name, factor name to consider (default 'surf')
+#return: overall surface PWER for given system (w/o filtering)
+#throw if given system or truth is not set or if index of factor name hasn't been specified
+sub calcOverallPWER
+{
+	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
+	if(scalar(@_) > 0) {$factorName = shift;}
+	#check in-memory cache first
+	if(defined($self->{'sysoutPWER'}->{$sysname}->{$factorName}))
+	{
+		return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0];
+	}
+	
+	$self->ensureFilenameDefined('truth');
+	$self->ensureFilenameDefined($sysname);
+	$self->ensureFactorPosDefined($factorName);
+	$self->loadSentences('truth', $self->{'truthFilename'});
+	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
+	
+	my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
+	$self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence
+	
+	#unhog memory
+	$self->releaseSentences('truth');
+	$self->releaseSentences($sysname);
+	return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0];
+}
+
+#arguments: system name, factor name to consider (default 'surf')
+#return: BLEU score, n-gram precisions, brevity penalty
+sub calcBLEU
+{
+	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
+	if(scalar(@_) > 0) {$factorName = shift;}
+	#check in-memory cache first
+	if(defined($self->{'bleuScores'}->{$sysname}->{$factorName}))
+	{
+		return $self->{'bleuScores'}->{$sysname}->{$factorName};
+	}
+	
+	$self->ensureFilenameDefined('truth');
+	$self->ensureFilenameDefined($sysname);
+	$self->ensureFactorPosDefined($factorName);
+	$self->loadSentences('truth', $self->{'truthFilename'});
+	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
+	
+	#score structure: various total scores, arrayref of by-sentence score arrays
+	if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};}
+	if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
+	
+	my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+	my $factorIndex = $self->{'factorIndices'}->{$factorName};
+	for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)
+	{
+		my ($truthSentence, $sysoutSentence) = ($self->{'truth'}->[$i], $self->{$sysname}->[$i]);
+		my ($unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength) = 
+				$self->sentenceBLEU($truthSentence, $sysoutSentence, $factorIndex, 0); #last argument is whether to debug-print
+		push @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}, [$unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength];
+		$good1 += $unigood; $tot1 += $unicount;
+		$good2 += $bigood; $tot2 += $bicount;
+		$good3 += $trigood; $tot3 += $tricount;
+		$good4 += $quadrugood; $tot4 += $quadrucount;
+		$totCLength += $cLength;
+		$totRLength += $rLength;
+	}
+	my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength);
+	my ($pct1, $pct2, $pct3, $pct4) = ($good1 / $tot1, $good2 / $tot2, $good3 / $tot3, $good4 / $tot4);
+#	warn sprintf("n-gram prec: %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf\n", 
+#						$good1, $tot1, $pct1, $good2, $tot2, $pct2, $good3, $tot3, $pct3, $good4, $tot4, $pct4);
+	my $bleu = $brevity * exp((my_log($pct1) + my_log($pct2) + my_log($pct3) + my_log($pct4)) / 4);
+#	warn sprintf("brevity: %.3lf (%d ref, %d sys)\n", $brevity, $totRLength, $totCLength);
+	sleep 8;
+	$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity];
+  
+	#unhog memory
+	$self->releaseSentences('truth');
+	$self->releaseSentences($sysname);
+	return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]};
+}
+
+#write HTML to be displayed to compare the various versions we have of each sentence in the corpus;
+#allow filtering of which versions will be displayed
+#(we don't write the whole page, just the contents of the body)
+#arguments: filehandleref to which to write, regex to filter filename extensions to be included
+#return: none
+sub writeComparisonPage
+{
+	my ($self, $fh, $filter) = @_;
+	my @filteredExtensions = grep($filter, ('e', 'f', keys %{$self->{'sysoutFilenames'}}));
+	my %openedFiles = $self->openFiles(@filteredExtensions);
+	my $id = 1; #sentence ID string
+	while(my %lines = $self->readLineFromFiles(%openedFiles))
+	{
+		$self->printSingleSentenceComparison($fh, $id, %lines);
+		$id++;
+	}
+	$self->closeFiles(%openedFiles);
+}
+
+##########################################################################################################
+#####     INTERNAL     ###################################################################################
+##########################################################################################################
+
+#destructor!
+#arguments: none
+#return: none
+sub DESTROY
+{
+	my $self = shift;
+	$self->writeCacheFile();
+}
+
+#write all scores in memory to disk
+#arguments: none
+#return: none
+sub writeCacheFile
+{
+	my $self = shift;
+	if(!open(CACHEFILE, ">" . $self->{'cacheFilename'}))
+	{
+		warn "Corpus::writeCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for write\n";
+		return;
+	}
+
+	#store file changetimes to disk
+	print CACHEFILE "File changetimes\n";
+	#store bleu scores to disk
+	print CACHEFILE "\nBLEU scores\n";
+	foreach my $sysname (keys %{$self->{'bleuScores'}})
+	{
+		foreach my $factorName (keys %{$self->{'bleuScores'}->{$sysname}})
+		{
+			print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]});
+			foreach my $sentenceBLEU (@{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]})
+			{
+				print CACHEFILE "; " . join(' ', @$sentenceBLEU);
+			}
+			print CACHEFILE "\n";
+		}
+	}
+	#store unknown-token counts to disk
+	print CACHEFILE "\nUnknown-token counts\n";
+	foreach my $factorName (keys %{$self->{'unknownCount'}})
+	{
+		print CACHEFILE $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'totalTokens'} . "\n";
+	}
+	#store WER, PWER to disk
+	print CACHEFILE "\nWER scores\n";
+	my $printWERFunc = 
+	sub
+	{
+		my $werType = shift;
+		foreach my $sysname (keys %{$self->{$werType}})
+		{
+			foreach my $factorName (keys %{$self->{$werType}->{$sysname}})
+			{
+				my ($totalWER, $sentenceWERs, $errorWords) = @{$self->{$werType}->{$sysname}->{$factorName}};
+				print CACHEFILE "$werType $sysname $factorName $totalWER " . join(' ', @$sentenceWERs);
+				foreach my $indices (@$errorWords)
+				{
+					print CACHEFILE ";" . join(' ', @$indices);
+				}
+				print CACHEFILE "\n";
+			}
+		}
+	};
+	&$printWERFunc('sysoutWER');
+	&$printWERFunc('sysoutPWER');
+	#store misc scores to disk
+	print CACHEFILE "\nMisc scores\n";
+	close(CACHEFILE);
+}
+
+#load all scores present in the cache file into the appropriate fields of $self
+#arguments: none
+#return: none
+sub loadCacheFile
+{
+	my $self = shift;
+	if(!open(CACHEFILE, "<" . $self->{'cacheFilename'}))
+	{
+		warn "Corpus::loadCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for read\n";
+		return;
+	}
+	my $mode = 'none';
+	while(my $line = <CACHEFILE>)
+	{
+		next if $line =~ /^[ \t\n\r\x0a]*$/; #anyone know why char 10 (0x0a) shows up on empty lines, at least on solaris?
+		chop $line;
+		#check for start of section
+		if($line eq "File changetimes\n") {$mode = 'ctime';}
+		elsif($line eq "BLEU scores\n") {$mode = 'bleu';}
+		elsif($line eq "Unknown-token counts\n") {$mode = 'unk';}
+		elsif($line eq "WER scores") {$mode = 'wer';}
+		elsif($line eq "Misc scores") {$mode = 'misc';}
+		#get data when in a mode already
+		elsif($mode eq 'ctime')
+		{
+		}
+		elsif($mode eq 'bleu')
+		{
+			local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
+			if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};}
+			if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
+			my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest);
+			$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = shift @stats;
+			$self->{'bleuScores'}->{$sysname}->{$factorName}->[1] = \@stats;
+		}
+		elsif($mode eq 'unk')
+		{
+			local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line);
+			if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename)
+			{
+				$self->{'unknownCount'}->{$factorName} = $unknownCount;
+				$self->{'totalTokens'} = $totalCount;
+			}
+		}
+		elsif($mode eq 'wer')
+		{
+			local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER'
+			$details =~ /^([^;]*);(.*)/;
+			my @sentenceWERs = split(/\s+/, $1);
+			if(!defined($self->{$werType}->{$sysname})) {$self->{$werType}->{$sysname} = {};}
+			$self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, []];
+			my @indexLists = split(/;/, $2);
+			sleep 6;
+			for(my $i = 0; $i < scalar(@sentenceWERs); $i++)
+			{
+				my @indices = split(/\s+/, $indexLists[$i]);
+				$self->{$werType}->{$sysname}->{$factorName}->[2] = \@indices;
+			}
+		}
+		elsif($mode eq 'misc')
+		{
+		}
+	}
+	close(CACHEFILE);
+}
+
+##### utils #####
+#arguments: a, b (scalars)
+sub min
+{
+	my ($a, $b) = @_;
+	return ($a < $b) ? $a : $b;
+}
+#arguments: a, b (scalars)
+sub max
+{
+	my ($a, $b) = @_;
+	return ($a > $b) ? $a : $b;
+}
+#arguments: x
+sub my_log
+{
+  return -9999999999 unless $_[0];
+  return log($_[0]);
+}
+
+#throw if the given factor doesn't have an index defined
+#arguments: factor name
+#return: none
+sub ensureFactorPosDefined
+{
+	my ($self, $factorName) = @_;
+	if(!defined($self->{'factorIndices'}->{$factorName}))
+	{
+		throw Error::Simple(-text => "Corpus: no index known for factor '$factorName'\n");
+	}
+}
+
+#throw if the filename field corresponding to the argument hasn't been defined
+#arguments: 'truth' | 'input' | a system name
+#return: none
+sub ensureFilenameDefined
+{
+	my ($self, $sysname) = @_;
+	if($sysname eq 'truth' || $sysname eq 'input')
+	{
+		if(!defined($self->{"${sysname}Filename"}))
+		{
+			throw Error::Simple(-text => "Corpus: no $sysname corpus defined\n");
+		}
+	}
+	else
+	{
+		if(!defined($self->{'sysoutFilenames'}->{$sysname}))
+		{
+			throw Error::Simple(-text => "Corpus: no system $sysname defined\n");
+		}
+	}
+}
+
+#throw if there isn't a defined phrase-table filename for the given factor
+#arguments: factor name
+#return: none
+sub ensurePhraseTableDefined
+{
+	my ($self, $factorName) = @_;
+	if(!defined($self->{'phraseTableFilenames'}->{$factorName}))
+	{
+		throw Error::Simple(-text => "Corpus: no phrase table defined for factor '$factorName'\n");
+	}
+}
+
+#search current directory for files with our corpus name as basename and set filename fields of $self
+#arguments: hashref of filenames to descriptions
+#return: none
+sub locateFiles
+{
+	my ($self, $refDescs) = @_;
+	open(DIR, "ls -x1 . |") or die "Corpus::locateFiles(): couldn't list current directory\n";
+	my $corpusName = $self->{'corpusName'};
+	while(my $filename = <DIR>)
+	{
+		chop $filename; #remove \n
+		if($filename =~ /^$corpusName\.(.*)$/)
+		{
+			my $ext = $1;
+			if($ext eq 'e') {$self->{'truthFilename'} = $filename;}
+			elsif($ext eq 'f') {$self->{'inputFilename'} = $filename;}
+			elsif($ext =~ /pt_(.*)/) {$self->{'phraseTableFilenames'}->{$1} = $filename;}
+			else {$self->{'sysoutFilenames'}->{$ext} = $filename;}
+			if(defined($refDescs->{$filename}))
+			{
+				$self->{'fileDescriptions'}->{$filename} = $refDescs->{$filename};
+			}
+		}
+	}
+	close(DIR);
+#	warn "truth is set to '" . $self->{'truthFilename'} . "'\n";
+#	warn "input is set to '" . $self->{'inputFilename'} . "'\n";
+#	warn "sysouts set are '" . join("', '", keys %{$self->{'sysoutFilenames'}}) . "'\n";
+#	warn "phrase tables set are '" . join("', '", keys %{$self->{'phraseTableFilenames'}}) . "'\n";
+}
+
+#arguments: type ('truth' | 'input' | a string to represent a system output), filename
+#pre: filename exists
+#return: none
+sub loadSentences
+{
+	my ($self, $sysname, $filename) = @_;
+	$self->{$sysname} = [];
+	open(INFILE, "<$filename") or die "Corpus::load(): couldn't open '$filename' for read\n";
+	while(my $line = <INFILE>)
+	{
+		my @words = split(/\s+/, $line);
+		my $refFactors = [];
+		foreach my $word (@words)
+		{
+			my @factors = split(/\|/, $word);
+			push @$refFactors, \@factors;
+		}
+		push @{$self->{$sysname}}, $refFactors;
+	}
+	close(INFILE);
+}
+
+#free the memory used for the given corpus (but NOT any associated calculations, eg WER)
+#arguments: type ('truth' | 'input' | a string to represent a system output)
+#return: none
+sub releaseSentences
+{
+	my ($self, $sysname) = @_;
+	$self->{$sysname} = [];
+}
+
+#arguments: factor name
+#return: none
+#throw if we don't have a filename for the given phrase table
+sub loadPhraseTable
+{
+	my ($self, $factorName) = @_;
+	$self->ensurePhraseTableDefined($factorName);
+	
+	my $filename = $self->{'phraseTableFilenames'}->{$factorName};
+	open(PTABLE, "<$filename") or die "couldn't open '$filename' for read\n";
+	$self->{'phraseTables'}->{$factorName} = {}; #create ref to phrase table (hash of strings, for source phrases, to anything whatsoever)
+	#assume the table is sorted so that duplicate source phrases will be consecutive
+	while(my $line = <PTABLE>)
+	{
+		my @phrases = split(/\s*\|\|\|\s*/, $line, 2);
+		$self->{'phraseTables'}->{$factorName}->{$phrases[0]} = 0; #just so that it's set to something
+	}
+	close(PTABLE);
+}
+
+#arguments: factor name
+#return: none
+sub releasePhraseTable
+{
+	my ($self, $factorName) = @_;
+	$self->{'phraseTables'}->{$factorName} = {};
+}
+
+#arguments: name of list ('nounAndAdj' | ...)
+#return: arrayref of strings (postags)
+sub getPOSTagList
+{
+	my ($self, $listname) = @_;
+	##### assume PTB tagset #####
+	if($listname eq 'nounAndAdj') {return ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'];}
+#	if($listname eq '') {return [];}
+}
+
+#arguments: list to be filtered (arrayref of arrayrefs of factor strings), desired factor index, arrayref of allowable values
+#return: filtered list as array of arrayrefs of factor strings
+sub filterFactors
+{
+	my ($self, $refFullList, $index, $refFactorValues) = @_;
+	my $valuesRegex = join("|", @$refFactorValues);
+	my @filteredList = ();
+	foreach my $factors (@$refFullList)
+	{
+		if($factors->[$index] =~ m/$valuesRegex/)
+		{
+			push @filteredList, $factors;
+		}
+	}
+	return @filteredList;
+}
+
+#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use
+#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words
+sub corpusWER
+{
+	my ($self, $refSysOutput, $refTruth, $index) = @_;
+	my ($totWER, $sentenceWER, $errIndices) = (0, [], []);
+	for(my $i = 0; $i < scalar(@$refSysOutput); $i++)
+	{
+		my ($sentWER, $indices) = $self->sentenceWER($refSysOutput->[$i], $refTruth->[$i], $index);
+		$totWER += $sentWER;
+		push @$sentenceWER, $sentWER;
+		push @$errIndices, $indices;
+	}
+	return ($totWER, $sentenceWER, $errIndices);
+}
+
+#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use
+#return: wer score, arrayref of arrayrefs of indices of errorful words
+sub sentenceWER
+{
+	my ($self, $refSysOutput, $refTruth, $index) = @_;
+	my ($totWER, $indices) = (0, []);
+	my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));
+	for(my $j = 0; $j < min($sLength, $eLength); $j++)
+	{
+		if(lc $refSysOutput->[$j]->[$index] ne lc $refTruth->[$j]->[$index]) #check output word against truth word in same position
+		{
+			$totWER++;
+			push @$indices, $j;
+		}
+	}
+	$totWER += max(0, $sLength - $eLength);
+	return ($totWER, $indices);
+}
+
+#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use
+#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words
+sub corpusPWER
+{
+	my ($self, $refSysOutput, $refTruth, $index) = @_;
+	my ($totWER, $sentenceWER, $errIndices) = (0, [], []);
+	for(my $i = 0; $i < scalar(@$refSysOutput); $i++)
+	{
+		my ($sentWER, $indices) = $self->sentencePWER($refSysOutput->[$i], $refTruth->[$i], $index);
+		$totWER += $sentWER;
+		push @$sentenceWER, $sentWER;
+		push @$errIndices, $indices;
+	}
+	return ($totWER, $sentenceWER, $errIndices);
+}
+
+#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use
+#return: wer score, arrayref of arrayrefs of indices of errorful words
+sub sentencePWER
+{
+	my ($self, $refSysOutput, $refTruth, $index) = @_;
+	my ($totWER, $indices) = (0, []);
+	my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));
+	my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once
+	for(my $j = 0; $j < $sLength; $j++)
+	{
+		my $found = 0;
+		for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence
+		{
+			if(lc $refSysOutput->[$j]->[$index] eq lc $refTruth->[$k]->[$index] && $truthWordUsed[$k] == 0)
+			{
+				$truthWordUsed[$k] = 1;
+				$found = 1;
+			}
+		}
+		if($found == 0)
+		{
+			$totWER++;
+			push @$indices, $j;
+		}
+	}
+	return ($totWER, $indices);
+}
+
+#BLEU calculation for a single sentence
+#arguments: truth sentence (arrayref of arrayrefs of factor strings), sysout sentence (same), factor index to use
+#return: 1- through 4-gram matching and total counts (1-g match, 1-g tot, 2-g match...), candidate length, reference length
+sub sentenceBLEU
+{
+	my ($self, $refTruth, $refSysOutput, $factorIndex, $debug) = @_;
+	my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput));
+	my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0);
+	my %REF_GRAM = ();
+	my ($i, $gram);
+	for($i = 0; $i < $length_reference; $i++)
+	{
+		$gram = $refTruth->[$i]->[$factorIndex];
+		$REF_GRAM{$gram}++;
+		next if $i<1;
+		$gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram;
+		$REF_GRAM{$gram}++;
+      next if $i<2;
+      $gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram;
+      $REF_GRAM{$gram}++;
+      next if $i<3;
+      $gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram;
+      $REF_GRAM{$gram}++;
+	}
+	for($i = 0; $i < $length_translation; $i++)
+	{
+      $gram = $refSysOutput->[$i]->[$factorIndex];
+      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
+			$REF_GRAM{$gram}--;
+			if($debug != 0) {warn "'$gram' ";}
+			$correct1 += 1;
+      }
+      next if $i<1;
+      $gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram;
+      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
+			$REF_GRAM{$gram}--;
+			if($debug != 0) {warn "'$gram' ";}
+			$correct2 += 1;
+      }
+      next if $i<2;
+      $gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram;
+      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
+			$REF_GRAM{$gram}--;
+			if($debug != 0) {warn "'$gram' ";}
+			$correct3 += 1;
+      }
+      next if $i<3;
+      $gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram;
+      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
+			$REF_GRAM{$gram}--;
+			if($debug != 0) {warn "'$gram' ";}
+			$correct4 += 1;
+      }
+	}
+	if($debug != 0) {warn "\n";}
+	my $total = $length_translation;
+	$total1 = max(1, $total);
+	$total2 = max(1, $total - 1);
+	$total3 = max(1, $total - 2);
+	$total4 = max(1, $total - 3);
+	if($debug != 0)
+	{
+	warn "BLEU($factorIndex)\n";
+	warn "truth: " . join(' ', map {join('|', @$_)} @{$refTruth}) . "\n";
+	warn "sysop: " . join(' ', map {join('|', @$_)} @{$refSysOutput}) . "\n";
+	warn "stats: $correct1 / $total1, $correct2 / $total2, $correct3 / $total3, $correct4 / $total4\n";
+	sleep 8;
+	}
+	
+	return ($correct1, $total1, $correct2, $total2, $correct3, $total3, $correct4, $total4, $length_translation, $length_reference);
+}
+
+##### filesystem #####
+
+#open as many given files as possible; only warn about the rest
+#arguments: list of filename extensions to open (assume corpus name is file title)
+#return: hash from type string to filehandleref, giving all files that were successfully opened
+sub openFiles
+{
+	my ($self, @extensions) = @_;
+	my %openedFiles = ();
+	foreach my $ext (@extensions)
+	{
+		if(!open(FILE, "<" . $self->{'corpusName'} . $ext))
+		{
+			warn "Corpus::openFiles(): couldn't open '" . $self->{'corpusName'} . $ext . "' for read\n";
+		}
+		else #success
+		{
+			$openedFiles{$ext} = \*FILE;
+		}
+	}
+	return %openedFiles;
+}
+
+#read one line from each given file
+#arguments: hash from type string to filehandleref
+#return: hash from type string to sentence (stored as arrayref of arrayrefs of factors) read from corresponding file
+sub readLineFromFiles
+{
+	my ($self, %openedFiles) = @_;
+	my %lines;
+	foreach my $type (keys %openedFiles)
+	{
+		$lines{$type} = [];
+		my $sentence = <$openedFiles{$type}>;
+		my @words = split(/\s+/, $sentence);
+		foreach my $word (@words)
+		{
+			my @factors = split(/\|/, $word);
+			push @{$lines{$type}}, \@factors;
+		}
+	}
+	return %lines;
+}
+
+#close all given files
+#arguments: hash from type string to filehandleref
+#return: none
+sub closeFiles
+{
+	my ($self, %openedFiles) = @_;
+	foreach my $type (keys %openedFiles)
+	{
+		close($openedFiles{$type});
+	}
+}
+
+##### write HTML #####
+
+#print HTML for comparing various versions of a sentence, with special processing for each version as appropriate
+#arguments: filehandleref to which to write, sentence ID string, hashref of version string to sentence (stored as arrayref of arrayref of factor strings)
+#return: none
+sub printSingleSentenceComparison
+{
+	my ($self, $fh, $sentID, $sentences) = @_;
+	my $curFH = select;
+	select $fh;
+	#javascript to reorder rows to look nice afterward
+	print "<script type=\"text/javascript\">
+	function reorder_$sentID()
+	{/*
+		var table = document.getElementById('div_$sentID').firstChild;
+		var refTransRow = table.getElementById('row_e');
+		var inputRow = table.getElementById('row_f');
+		table.removeRow(refTransRow);
+		table.removeRow(inputRow);
+		var newRow1 = table.insertRow(0);
+		var newRow2 = table.insertRow(1);
+		newRow1.childNodes = inputRow.childNodes;
+		newRow2.childNodes = refTransRow.childNodes;*/
+	}
+	</script>";
+	#html for sentences
+	print "<div id=\"div_$sentID\" style=\"padding: 3px; margin: 5px\">";
+	print "<table border=\"1\">";
+#	my $rowCount = 0;
+#	my @bgColors = ("#ffefbf", "#ffdf7f");
+	#process all rows in order
+	foreach my $sentType (keys %$sentences)
+	{
+		my $bgcolor = $bgColors[$rowCount % 2];
+		print "<tr id=\"row_$sentType\"><td align=right>";
+		#description of sentence
+		if(defined($self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType}))
+		{
+			print "(" . $self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType} . ")";
+		}
+		else
+		{
+			print "($sentType)";
+		}
+		print "</td><td align=left>";
+		#sentence with markup
+		if($sentType eq 'f') #input
+		{
+#			$self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $inputColor);
+		}
+		elsif($sentType eq 'e') #reference translation
+		{
+#			$self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $reftransColor);
+		}
+		else #system output
+		{
+#			$self->writeHTMLTranslationHighlightedWithFactors($fh, $sentences->{$sentType}, $sentences->{'e'}, $highlightColors);
+		}
+		print "</td></tr>";
+#		$rowCount++;
+	}
+	print "</table>";
+	print "</div>\n";
+	select $curFH;
+}
diff --git a/scripts/analysis/smtgui/Error.pm b/scripts/analysis/smtgui/Error.pm
new file mode 100644
index 000000000..cc9edbb69
--- /dev/null
+++ b/scripts/analysis/smtgui/Error.pm
@@ -0,0 +1,744 @@
+# Error.pm
+#
+# Copyright (c) 1997-8 Graham Barr <gbarr@ti.com>. All rights reserved.
+# This program is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# Based on my original Error.pm, and Exceptions.pm by Peter Seibel
+# <peter@weblogic.com> and adapted by Jesse Glick <jglick@sig.bsh.com>.
+#
+# but modified ***significantly***
+
+package Error;
+
+use strict;
+use vars qw($VERSION);
+use 5.004;
+
+$VERSION = "0.15"; 
+
+use overload (
+	'""'	   =>	'stringify',
+	'0+'	   =>	'value',
+	'bool'     =>	sub { return 1; },
+	'fallback' =>	1
+);
+
+$Error::Depth = 0;	# Depth to pass to caller()
+$Error::Debug = 0;	# Generate verbose stack traces
+@Error::STACK = ();	# Clause stack for try
+$Error::THROWN = undef;	# last error thrown, a workaround until die $ref works
+
+my $LAST;		# Last error created
+my %ERROR;		# Last error associated with package
+
+# Exported subs are defined in Error::subs
+
+sub import {
+    shift;
+    local $Exporter::ExportLevel = $Exporter::ExportLevel + 1;
+    Error::subs->import(@_);
+}
+
+# I really want to use last for the name of this method, but it is a keyword
+# which prevent the syntax  last Error
+
+sub prior {
+    shift; # ignore
+
+    return $LAST unless @_;
+
+    my $pkg = shift;
+    return exists $ERROR{$pkg} ? $ERROR{$pkg} : undef
+	unless ref($pkg);
+
+    my $obj = $pkg;
+    my $err = undef;
+    if($obj->isa('HASH')) {
+	$err = $obj->{'__Error__'}
+	    if exists $obj->{'__Error__'};
+    }
+    elsif($obj->isa('GLOB')) {
+	$err = ${*$obj}{'__Error__'}
+	    if exists ${*$obj}{'__Error__'};
+    }
+
+    $err;
+}
+
+# Return as much information as possible about where the error
+# happened. The -stacktrace element only exists if $Error::DEBUG
+# was set when the error was created
+
+sub stacktrace {
+    my $self = shift;
+
+    return $self->{'-stacktrace'}
+	if exists $self->{'-stacktrace'};
+
+    my $text = exists $self->{'-text'} ? $self->{'-text'} : "Died";
+
+    $text .= sprintf(" at %s line %d.\n", $self->file, $self->line)
+	unless($text =~ /\n$/s);
+
+    $text;
+}
+
+# Allow error propagation, ie
+#
+# $ber->encode(...) or
+#    return Error->prior($ber)->associate($ldap);
+
+sub associate {
+    my $err = shift;
+    my $obj = shift;
+
+    return unless ref($obj);
+
+    if($obj->isa('HASH')) {
+	$obj->{'__Error__'} = $err;
+    }
+    elsif($obj->isa('GLOB')) {
+	${*$obj}{'__Error__'} = $err;
+    }
+    $obj = ref($obj);
+    $ERROR{ ref($obj) } = $err;
+
+    return;
+}
+
+sub new {
+    my $self = shift;
+    my($pkg,$file,$line) = caller($Error::Depth);
+
+    my $err = bless {
+	'-package' => $pkg,
+	'-file'    => $file,
+	'-line'    => $line,
+	@_
+    }, $self;
+
+    $err->associate($err->{'-object'})
+	if(exists $err->{'-object'});
+
+    # To always create a stacktrace would be very inefficient, so
+    # we only do it if $Error::Debug is set
+
+    if($Error::Debug) {
+	require Carp;
+	local $Carp::CarpLevel = $Error::Depth;
+	my $text = defined($err->{'-text'}) ? $err->{'-text'} : "Error";
+	my $trace = Carp::longmess($text);
+	# Remove try calls from the trace
+	$trace =~ s/(\n\s+\S+__ANON__[^\n]+)?\n\s+eval[^\n]+\n\s+Error::subs::try[^\n]+(?=\n)//sog;
+	$trace =~ s/(\n\s+\S+__ANON__[^\n]+)?\n\s+eval[^\n]+\n\s+Error::subs::run_clauses[^\n]+\n\s+Error::subs::try[^\n]+(?=\n)//sog;
+	$err->{'-stacktrace'} = $trace
+    }
+
+    $@ = $LAST = $ERROR{$pkg} = $err;
+}
+
+# Throw an error. this contains some very gory code.
+
+sub throw {
+    my $self = shift;
+    local $Error::Depth = $Error::Depth + 1;
+
+    # if we are not rethrow-ing then create the object to throw
+    $self = $self->new(@_) unless ref($self);
+    
+    die $Error::THROWN = $self;
+}
+
+# syntactic sugar for
+#
+#    die with Error( ... );
+
+sub with {
+    my $self = shift;
+    local $Error::Depth = $Error::Depth + 1;
+
+    $self->new(@_);
+}
+
+# syntactic sugar for
+#
+#    record Error( ... ) and return;
+
+sub record {
+    my $self = shift;
+    local $Error::Depth = $Error::Depth + 1;
+
+    $self->new(@_);
+}
+
+# catch clause for
+#
+# try { ... } catch CLASS with { ... }
+
+sub catch {
+    my $pkg = shift;
+    my $code = shift;
+    my $clauses = shift || {};
+    my $catch = $clauses->{'catch'} ||= [];
+
+    unshift @$catch,  $pkg, $code;
+
+    $clauses;
+}
+
+# Object query methods
+
+sub object {
+    my $self = shift;
+    exists $self->{'-object'} ? $self->{'-object'} : undef;
+}
+
+sub file {
+    my $self = shift;
+    exists $self->{'-file'} ? $self->{'-file'} : undef;
+}
+
+sub line {
+    my $self = shift;
+    exists $self->{'-line'} ? $self->{'-line'} : undef;
+}
+
+sub text {
+    my $self = shift;
+    exists $self->{'-text'} ? $self->{'-text'} : undef;
+}
+
+# overload methods
+
+sub stringify {
+    my $self = shift;
+    defined $self->{'-text'} ? $self->{'-text'} : "Died";
+}
+
+sub value {
+    my $self = shift;
+    exists $self->{'-value'} ? $self->{'-value'} : undef;
+}
+
+package Error::Simple;
+
+@Error::Simple::ISA = qw(Error);
+
+sub new {
+    my $self  = shift;
+    my $text  = "" . shift;
+    my $value = shift;
+    my(@args) = ();
+
+    local $Error::Depth = $Error::Depth + 1;
+
+    @args = ( -file => $1, -line => $2)
+	if($text =~ s/ at (\S+) line (\d+)(\.\n)?$//s);
+
+    push(@args, '-value', 0 + $value)
+	if defined($value);
+
+    $self->SUPER::new(-text => $text, @args);
+}
+
+sub stringify {
+    my $self = shift;
+    my $text = $self->SUPER::stringify;
+    $text .= sprintf(" at %s line %d.\n", $self->file, $self->line)
+	unless($text =~ /\n$/s);
+    $text;
+}
+
+##########################################################################
+##########################################################################
+
+# Inspired by code from Jesse Glick <jglick@sig.bsh.com> and
+# Peter Seibel <peter@weblogic.com>
+
+package Error::subs;
+
+use Exporter ();
+use vars qw(@EXPORT_OK @ISA %EXPORT_TAGS);
+
+@EXPORT_OK   = qw(try with finally except otherwise);
+%EXPORT_TAGS = (try => \@EXPORT_OK);
+
+@ISA = qw(Exporter);
+
+sub run_clauses ($$$\@) {
+    my($clauses,$err,$wantarray,$result) = @_;
+    my $code = undef;
+
+    $err = new Error::Simple($err) unless ref($err);
+
+    CATCH: {
+
+	# catch
+	my $catch;
+	if(defined($catch = $clauses->{'catch'})) {
+	    my $i = 0;
+
+	    CATCHLOOP:
+	    for( ; $i < @$catch ; $i += 2) {
+		my $pkg = $catch->[$i];
+		unless(defined $pkg) {
+		    #except
+		    splice(@$catch,$i,2,$catch->[$i+1]->());
+		    $i -= 2;
+		    next CATCHLOOP;
+		}
+		elsif($err->isa($pkg)) {
+		    $code = $catch->[$i+1];
+		    while(1) {
+			my $more = 0;
+			local($Error::THROWN);
+			my $ok = eval {
+			    if($wantarray) {
+				@{$result} = $code->($err,\$more);
+			    }
+			    elsif(defined($wantarray)) {
+			        @{$result} = ();
+				$result->[0] = $code->($err,\$more);
+			    }
+			    else {
+				$code->($err,\$more);
+			    }
+			    1;
+			};
+			if( $ok ) {
+			    next CATCHLOOP if $more;
+			    undef $err;
+			}
+			else {
+			    $err = defined($Error::THROWN)
+				    ? $Error::THROWN : $@;
+			    $err = new Error::Simple($err)
+				    unless ref($err);
+			}
+			last CATCH;
+		    };
+		}
+	    }
+	}
+
+	# otherwise
+	my $owise;
+	if(defined($owise = $clauses->{'otherwise'})) {
+	    my $code = $clauses->{'otherwise'};
+	    my $more = 0;
+	    my $ok = eval {
+		if($wantarray) {
+		    @{$result} = $code->($err,\$more);
+		}
+		elsif(defined($wantarray)) {
+		    @{$result} = ();
+		    $result->[0] = $code->($err,\$more);
+		}
+		else {
+		    $code->($err,\$more);
+		}
+		1;
+	    };
+	    if( $ok ) {
+		undef $err;
+	    }
+	    else {
+		$err = defined($Error::THROWN)
+			? $Error::THROWN : $@;
+		$err = new Error::Simple($err)
+			unless ref($err);
+	    }
+	}
+    }
+    $err;
+}
+
+sub try (&;$) {
+    my $try = shift;
+    my $clauses = @_ ? shift : {};
+    my $ok = 0;
+    my $err = undef;
+    my @result = ();
+
+    unshift @Error::STACK, $clauses;
+
+    my $wantarray = wantarray();
+
+    do {
+	local $Error::THROWN = undef;
+
+	$ok = eval {
+	    if($wantarray) {
+		@result = $try->();
+	    }
+	    elsif(defined $wantarray) {
+		$result[0] = $try->();
+	    }
+	    else {
+		$try->();
+	    }
+	    1;
+	};
+
+	$err = defined($Error::THROWN) ? $Error::THROWN : $@
+	    unless $ok;
+    };
+
+    shift @Error::STACK;
+
+    $err = run_clauses($clauses,$err,wantarray,@result)
+	unless($ok);
+
+    $clauses->{'finally'}->()
+	if(defined($clauses->{'finally'}));
+
+    throw $err if defined($err);
+
+    wantarray ? @result : $result[0];
+}
+
+# Each clause adds a sub to the list of clauses. The finally clause is
+# always the last, and the otherwise clause is always added just before
+# the finally clause.
+#
+# All clauses, except the finally clause, add a sub which takes one argument
+# this argument will be the error being thrown. The sub will return a code ref
+# if that clause can handle that error, otherwise undef is returned.
+#
+# The otherwise clause adds a sub which unconditionally returns the users
+# code reference, this is why it is forced to be last.
+#
+# The catch clause is defined in Error.pm, as the syntax causes it to
+# be called as a method
+
+sub with (&;$) {
+    @_
+}
+
+sub finally (&) {
+    my $code = shift;
+    my $clauses = { 'finally' => $code };
+    $clauses;
+}
+
+# The except clause is a block which returns a hashref or a list of
+# key-value pairs, where the keys are the classes and the values are subs.
+
+sub except (&;$) {
+    my $code = shift;
+    my $clauses = shift || {};
+    my $catch = $clauses->{'catch'} ||= [];
+    
+    my $sub = sub {
+	my $ref;
+	my(@array) = $code->($_[0]);
+	if(@array == 1 && ref($array[0])) {
+	    $ref = $array[0];
+	    $ref = [ %$ref ]
+		if(UNIVERSAL::isa($ref,'HASH'));
+	}
+	else {
+	    $ref = \@array;
+	}
+	@$ref
+    };
+
+    unshift @{$catch}, undef, $sub;
+
+    $clauses;
+}
+
+sub otherwise (&;$) {
+    my $code = shift;
+    my $clauses = shift || {};
+
+    if(exists $clauses->{'otherwise'}) {
+	require Carp;
+	Carp::croak("Multiple otherwise clauses");
+    }
+
+    $clauses->{'otherwise'} = $code;
+
+    $clauses;
+}
+
+1;
+__END__
+
+=head1 NAME
+
+Error - Error/exception handling in an OO-ish way
+
+=head1 SYNOPSIS
+
+    use Error qw(:try);
+
+    throw Error::Simple( "A simple error");
+
+    sub xyz {
+        ...
+	record Error::Simple("A simple error")
+	    and return;
+    }
+ 
+    unlink($file) or throw Error::Simple("$file: $!",$!);
+
+    try {
+	do_some_stuff();
+	die "error!" if $condition;
+	throw Error::Simple -text => "Oops!" if $other_condition;
+    }
+    catch Error::IO with {
+	my $E = shift;
+	print STDERR "File ", $E->{'-file'}, " had a problem\n";
+    }
+    except {
+	my $E = shift;
+	my $general_handler=sub {send_message $E->{-description}};
+	return {
+	    UserException1 => $general_handler,
+	    UserException2 => $general_handler
+	};
+    }
+    otherwise {
+	print STDERR "Well I don't know what to say\n";
+    }
+    finally {
+	close_the_garage_door_already(); # Should be reliable
+    }; # Don't forget the trailing ; or you might be surprised
+
+=head1 DESCRIPTION
+
+The C<Error> package provides two interfaces. Firstly C<Error> provides
+a procedural interface to exception handling. Secondly C<Error> is a
+base class for errors/exceptions that can either be thrown, for
+subsequent catch, or can simply be recorded.
+
+Errors in the class C<Error> should not be thrown directly, but the
+user should throw errors from a sub-class of C<Error>.
+
+=head1 PROCEDURAL INTERFACE
+
+C<Error> exports subroutines to perform exception handling. These will
+be exported if the C<:try> tag is used in the C<use> line.
+
+=over 4
+
+=item try BLOCK CLAUSES
+
+C<try> is the main subroutine called by the user. All other subroutines
+exported are clauses to the try subroutine.
+
+The BLOCK will be evaluated and, if no error is throw, try will return
+the result of the block.
+
+C<CLAUSES> are the subroutines below, which describe what to do in the
+event of an error being thrown within BLOCK.
+
+=item catch CLASS with BLOCK
+
+This clauses will cause all errors that satisfy C<$err-E<gt>isa(CLASS)>
+to be caught and handled by evaluating C<BLOCK>.
+
+C<BLOCK> will be passed two arguments. The first will be the error
+being thrown. The second is a reference to a scalar variable. If this
+variable is set by the catch block then, on return from the catch
+block, try will continue processing as if the catch block was never
+found.
+
+To propagate the error the catch block may call C<$err-E<gt>throw>
+
+If the scalar reference by the second argument is not set, and the
+error is not thrown. Then the current try block will return with the
+result from the catch block.
+
+=item except BLOCK
+
+When C<try> is looking for a handler, if an except clause is found
+C<BLOCK> is evaluated. The return value from this block should be a
+HASHREF or a list of key-value pairs, where the keys are class names
+and the values are CODE references for the handler of errors of that
+type.
+
+=item otherwise BLOCK
+
+Catch any error by executing the code in C<BLOCK>
+
+When evaluated C<BLOCK> will be passed one argument, which will be the
+error being processed.
+
+Only one otherwise block may be specified per try block
+
+=item finally BLOCK
+
+Execute the code in C<BLOCK> either after the code in the try block has
+successfully completed, or if the try block throws an error then
+C<BLOCK> will be executed after the handler has completed.
+
+If the handler throws an error then the error will be caught, the
+finally block will be executed and the error will be re-thrown.
+
+Only one finally block may be specified per try block
+
+=back
+
+=head1 CLASS INTERFACE
+
+=head2 CONSTRUCTORS
+
+The C<Error> object is implemented as a HASH. This HASH is initialized
+with the arguments that are passed to it's constructor. The elements
+that are used by, or are retrievable by the C<Error> class are listed
+below, other classes may add to these.
+
+	-file
+	-line
+	-text
+	-value
+	-object
+
+If C<-file> or C<-line> are not specified in the constructor arguments
+then these will be initialized with the file name and line number where
+the constructor was called from.
+
+If the error is associated with an object then the object should be
+passed as the C<-object> argument. This will allow the C<Error> package
+to associate the error with the object.
+
+The C<Error> package remembers the last error created, and also the
+last error associated with a package. This could either be the last
+error created by a sub in that package, or the last error which passed
+an object blessed into that package as the C<-object> argument.
+
+=over 4
+
+=item throw ( [ ARGS ] )
+
+Create a new C<Error> object and throw an error, which will be caught
+by a surrounding C<try> block, if there is one. Otherwise it will cause
+the program to exit.
+
+C<throw> may also be called on an existing error to re-throw it.
+
+=item with ( [ ARGS ] )
+
+Create a new C<Error> object and returns it. This is defined for
+syntactic sugar, eg
+
+    die with Some::Error ( ... );
+
+=item record ( [ ARGS ] )
+
+Create a new C<Error> object and returns it. This is defined for
+syntactic sugar, eg
+
+    record Some::Error ( ... )
+	and return;
+
+=back
+
+=head2 STATIC METHODS
+
+=over 4
+
+=item prior ( [ PACKAGE ] )
+
+Return the last error created, or the last error associated with
+C<PACKAGE>
+
+=back
+
+=head2 OBJECT METHODS
+
+=over 4
+
+=item stacktrace
+
+If the variable C<$Error::Debug> was non-zero when the error was
+created, then C<stacktrace> returns a string created by calling
+C<Carp::longmess>. If the variable was zero the C<stacktrace> returns
+the text of the error appended with the filename and line number of
+where the error was created, providing the text does not end with a
+newline.
+
+=item object
+
+The object this error was associated with
+
+=item file
+
+The file where the constructor of this error was called from
+
+=item line
+
+The line where the constructor of this error was called from
+
+=item text
+
+The text of the error
+
+=back
+
+=head2 OVERLOAD METHODS
+
+=over 4
+
+=item stringify
+
+A method that converts the object into a string. This method may simply
+return the same as the C<text> method, or it may append more
+information. For example the file name and line number.
+
+By default this method returns the C<-text> argument that was passed to
+the constructor, or the string C<"Died"> if none was given.
+
+=item value
+
+A method that will return a value that can be associated with the
+error. For example if an error was created due to the failure of a
+system call, then this may return the numeric value of C<$!> at the
+time.
+
+By default this method returns the C<-value> argument that was passed
+to the constructor.
+
+=back
+
+=head1 PRE-DEFINED ERROR CLASSES
+
+=over 4
+
+=item Error::Simple
+
+This class can be used to hold simple error strings and values. It's
+constructor takes two arguments. The first is a text value, the second
+is a numeric value. These values are what will be returned by the
+overload methods.
+
+If the text value ends with C<at file line 1> as $@ strings do, then
+this infomation will be used to set the C<-file> and C<-line> arguments
+of the error object.
+
+This class is used internally if an eval'd block die's with an error
+that is a plain string.
+
+=back
+
+=head1 KNOWN BUGS
+
+None, but that does not mean there are not any.
+
+=head1 AUTHORS
+
+Graham Barr <gbarr@pobox.com>
+
+The code that inspired me to write this was originally written by
+Peter Seibel <peter@weblogic.com> and adapted by Jesse Glick
+<jglick@sig.bsh.com>.
+
+=head1 MAINTAINER
+
+Arun Kumar U <u_arunkumar@yahoo.com>
+
+=cut
diff --git a/scripts/analysis/smtgui/README b/scripts/analysis/smtgui/README
new file mode 100644
index 000000000..c86cd9c1a
--- /dev/null
+++ b/scripts/analysis/smtgui/README
@@ -0,0 +1,31 @@
+Readme for SMTGUI
+Philipp Koehn, Evan Herbst
+7 / 31 / 06
+-----------------------------------
+
+SMTGUI is Philipp's and my code to analyze a decoder's output (the decoder doesn't have to be moses, but most of SMTGUI's features relate to factors, so it probably will be). You can view a list of available corpora by running <newsmtgui.cgi?ACTION=> on any web server. When you're viewing a corpus, click the checkboxes and Compare to see sentences from various sources on one screen. Currently they're in an annoying format; feel free to make the display nicer and more useful. There are per-sentence stats stored in a Corpus object; they just aren't used yet. See compare2() in newsmtgui and Corpus::printSingleSentenceComparison() for a start to better display code. For now it's mostly the view-corpus screen that's useful.
+
+newsmtgui.cgi is the main program. Corpus.pm is my module; Error.pm is a standard part of Perl but appears to not always be distributed. The accompanying version is Error.pm v1.15.
+
+The program requires file 'file-factors', which gives the list of factors included in each corpus (see the example file for details). Only corpi included in 'file-factors' are displayed. The file 'file-descriptions' is optional and associates a descriptive string with each included filename. These are used only for display. Again an example is provided.
+
+For the corpus with name CORPUS, there should be present the files:
+- CORPUS.f, the foreign input
+- CORPUS.e, the truth (aka reference translation)
+- CORPUS.SYSTEM_TRANSLATION for each system to be analyzed
+- CORPUS.pt_FACTORNAME for each factor that requires a phrase table (these are currently used only to count unknown source words)
+
+The .f, .e and system-output files should have the usual pipe-delimited format, one sentence per line. Phrase tables should also have standard three-pipe format.
+
+A list of standard factor names is available in @Corpus::FACTORNAMES. Feel free to add, but woe betide you if you muck with 'surf', 'pos' and 'lemma'; those are hardcoded all over the place.
+
+Currently the program assumes you've included factors 'surf', 'pos' and 'lemma', in whatever order; if not you'll want to edit view_corpus() in newsmtgui.cgi to not automatically display all info. To get English POS tags and lemmas from a words-only corpus and put together factors into one file:
+
+$ $BIN/tag-english < CORPUS.lc > CORPUS.pos-tmp                                 (call Brill)
+$ $BIN/morph < CORPUS.pos-tmp > CORPUS.morph
+$ $DATA/test/factor-stem.en.perl < CORPUS.morph > CORPUS.lemma
+$ cat CORPUS.pos-tmp | perl -n -e 's/_/\|/g; print;' > CORPUS.lc+pos            (replace _ with |)
+$ $DATA/test/combine-features.perl CORPUS lc+pos lemma > CORPUS.lc+pos+lemma
+$ rm CORPUS.pos-tmp                                                             (cleanup)
+
+where $BIN=/export/ws06osmt/bin, $DATA=/export/ws06osmt/data.
diff --git a/scripts/analysis/smtgui/file-descriptions b/scripts/analysis/smtgui/file-descriptions
new file mode 100644
index 000000000..97ac6f31a
--- /dev/null
+++ b/scripts/analysis/smtgui/file-descriptions
@@ -0,0 +1 @@
+devtest2006.de-en.top100.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run
diff --git a/scripts/analysis/smtgui/file-factors b/scripts/analysis/smtgui/file-factors
new file mode 100644
index 000000000..a1128eb2a
--- /dev/null
+++ b/scripts/analysis/smtgui/file-factors
@@ -0,0 +1,6 @@
+#corpus name, factors given (/\s+/-delimited)
+#(the given factors should be present in all target-language files for the given corpus)
+devtest2006.de-en					surf pos lemma
+devtest2006.de-en.top100		surf pos lemma
+#pstem: lemmas come from the Porter stemmer (and so are really a mix of stems and lemmas)
+pstem_devtest2006.de-en			surf pos lemma
diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl
new file mode 100644
index 000000000..85f325935
--- /dev/null
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@@ -0,0 +1,79 @@
+#!/usr/bin/perl -w
+
+#filter phrase tables
+#arguments: phrasetable_filename input_filename factor_index (0...)
+#outputs to phrasetable_filename.short
+
+use strict;
+
+my $MAX_LENGTH = 10;
+
+my ($file, $input, $source_factor) = @ARGV;
+my $dir = ".";
+
+    # get tables to be filtered (and modify config file)
+    my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
+		push @TABLE,$file;
+
+		my $new_name = "$file.short";#"$dir/phrase-table.$source_factor";
+		push @TABLE_NEW_NAME,$new_name;
+
+		$CONSIDER_FACTORS{$source_factor} = 1;
+		push @TABLE_FACTORS,$source_factor;
+
+    # get the phrase pairs appearing in the input text
+    my %PHRASE_USED;
+    die("could not find input file $input") unless -e $input;
+    open(INPUT,$input);
+    while(my $line = <INPUT>) {
+	chop($line);
+	my @WORD = split(/ +/,$line);
+	for(my $i=0;$i<=$#WORD;$i++) {
+	    for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
+		foreach (keys %CONSIDER_FACTORS) {
+		    my @FACTOR = split(/,/);
+		    my $phrase = "";
+		    for(my $k=$i;$k<=$i+$j;$k++) {
+			my @WORD_FACTOR = split(/\|/,$WORD[$k]);
+			for(my $f=0;$f<=$#FACTOR;$f++) {
+			    $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
+			}
+			chop($phrase);
+			$phrase .= " ";
+		    }
+		    chop($phrase);
+		    $PHRASE_USED{$_}{$phrase}++;
+		}
+	    }
+	}
+    }
+    close(INPUT);
+
+    # filter files
+    for(my $i=0;$i<=$#TABLE;$i++) {
+	my ($used,$total) = (0,0);
+	my $file = $TABLE[$i];
+	my $factors = $TABLE_FACTORS[$i];
+	my $new_file = $TABLE_NEW_NAME[$i];
+	print STDERR "filtering $file -> $new_file...\n";
+
+        if (-e $file && $file =~ /\.gz$/) { open(FILE,"zcat $file |"); }
+        elsif (! -e $file && -e "$file.gz") { open(FILE,"zcat $file.gz|"); }
+        elsif (-e $file) { open(FILE,$file); }
+	else { die("could not find model file $file");  }
+
+	open(FILE_OUT,">$new_file");
+
+	while(my $entry = <FILE>) {
+	    my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
+	    $foreign =~ s/ $//;
+	    if (defined($PHRASE_USED{$factors}{$foreign})) {
+		print FILE_OUT $entry;
+		$used++;
+	    }
+	    $total++;
+	}
+	close(FILE);
+	close(FILE_OUT);
+	printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+    }
diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi
new file mode 100755
index 000000000..0e969a788
--- /dev/null
+++ b/scripts/analysis/smtgui/newsmtgui.cgi
@@ -0,0 +1,935 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use CGI;
+use Corpus; #Evan's code
+use Error qw(:try);
+
+#files with extensions other than these are interpreted as system translations; see the file 'file-descriptions', if it exists, for the comments that go with them
+my %FILETYPE = ('e' => 'Reference Translation',
+		'f' => 'Foreign Original',
+		'ref.sgm' => 'Reference Translations',
+		'e.sgm' => 'Reference Translations',
+		'src.sgm' => 'Foreign Originals',
+		'f.sgm' => 'Foreign Originals');
+my %DONTSCORE = ('f' => 1, 'f.sgm' => 1, 'src.sgm' => 1,
+		 'e' => 1, 'e.sgm' => 1, 'ref.sgm' => 1);
+my @SHOW = ('f', 'e', 'comm');
+my %SHOW_COLOR = ('f' => "BLUE",
+		  'e' => "GREEN");
+my $FOREIGN = 'f';
+
+#FILEDESC: textual descriptions associated with specific filenames; to be displayed on the single-corpus view
+my %FILEDESC = (); &load_descriptions();
+my %factorIndices = loadFactorIndices('file-factors');
+my %MEMORY;        &load_memory();
+my (@mBLEU,@NIST);
+@mBLEU=`cat mbleu-memory.dat` if -e "mbleu-memory.dat"; chop(@mBLEU);
+@NIST = `cat nist-memory.dat` if -e "nist-memory.dat"; chop(@NIST);
+my %in;            &ReadParse(); #parse arguments
+
+if (scalar(@ARGV) > 0 && $ARGV[0] eq 'bleu') {
+  $in{CORPUS} = $ARGV[1];
+  $in{ACTION} = "VIEW_CORPUS";
+}
+
+my %MULTI_REF;
+if ($in{CORPUS} && -e "$in{CORPUS}.ref.sgm") {
+  my $sysid;
+  open(REF,"$in{CORPUS}.ref.sgm");
+  while(<REF>) {
+    $sysid = $1 if /<DOC.+sysid=\"([^\"]+)\"/;
+    if (/<seg[^>]*> *(\S.+\S) *<\/seg>/) {
+      push @{$MULTI_REF{$sysid}}, $1;
+    }
+  }
+  close(REF);
+}
+
+if ($in{ACTION} eq '') { &show_corpora(); }
+elsif ($in{ACTION} eq 'VIEW_CORPUS') { &view_corpus(); }
+elsif ($in{ACTION} eq 'SCORE_FILE') { &score_file(); }
+elsif ($in{ACTION} eq 'RESCORE_FILE') { &score_file(); }
+elsif ($in{ACTION} eq 'COMPARE') { &compare(); }
+else { &htmlhead("Unknown Action $in{ACTION}"); }
+print "</BODY></HTML>\n";
+
+###### SHOW CORPORA IN EVALUATION DIRECTORY
+
+sub show_corpora {
+  my %CORPUS = ();
+  
+  # find corpora in evaluation directory: see the factor-index file, which was already read in
+  foreach my $corpusName (keys %factorIndices)
+  {
+  	$CORPUS{$corpusName} = 1;
+  }
+  
+  # list corpora
+  &htmlhead("All Corpora");
+  print "<UL>\n";
+  foreach (sort (keys %CORPUS)) {
+    print "<LI><A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($_)."\">Corpus $_</A>\n";
+  }
+  print "</UL>\n";
+}
+
+###### SHOW INFORMATION FOR ONE CORPUS
+
+sub view_corpus {
+  my @TABLE;
+  &htmlhead("View Corpus $in{CORPUS}");
+  
+  # find corpora in evaluation directory
+  my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => $factorIndices{$in{CORPUS}});
+  
+  my ($sentence_count, $lineInfo);
+  if(-e "$in{CORPUS}.f")
+  {
+  	$lineInfo = `wc -l $in{CORPUS}.f`;
+  	$lineInfo =~ /^\s*(\d+)\s+/;
+  	$sentence_count = 0 + $1;
+	}
+	else
+	{
+	  $lineInfo = `wc -l $in{CORPUS}.e`;
+	  $lineInfo =~ /^\s*(\d+)\s+/;
+	  $sentence_count = 0 + $1;
+	}
+  
+  print "Corpus '$in{CORPUS}' consists of $sentence_count sentences\n";
+  print "(<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&mBLEU=1>with mBLEU</A>)" if ((!defined($in{mBLEU})) && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f");
+  print "<P>\n";
+  print "<FORM ACTION=''>\n";
+  print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=COMPARE>\n";
+  print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
+  print "<TABLE BORDER=1 CELLSPACING=0><TR>
+<TD>Filename (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS}).">sort</A>)</TD>
+<TD>Date     (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=TIME>sort</A>)</TD>";
+  if (-e "$in{CORPUS}.e") {
+    print "<TD>IBM BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=IBM>sort</A>)</TD>";
+  }
+  if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm") {
+    print "<TD>NIST (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=NIST>sort</A>)</TD>";
+    if (! -e "$in{CORPUS}.e") {
+      print "<TD>BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=BLEU>sort</A>)</TD>";
+    }
+  }
+  if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f") {
+    print "<TD>mBLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=mBLEU>sort</A>)</TD>";
+  }
+  print "<TD>Unknown Words</TD>"; #can't sort on; only applies to the input
+  print "<TD>Noun & adj WER-PWER</TD>"; #can't sort on; only applies to sysoutputs
+  print "<TD>Surface vs. lemma PWER</TD>"; #can't sort on; only applies to sysoutputs
+  print "<TD>Score (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=SCORE>sort</A>)</TD><TD>Actions</TD></TR>";
+
+  open(DIR,"ls $in{CORPUS}.*|");
+  while(<DIR>) {
+    my $sort = "";
+    chop;
+    my $sgm = 0;
+    if (/.sgm$/)
+	 {
+	 	`grep '<seg' $_ | wc -l` =~ /^\s*(\d+)\s+/;
+		next unless $1 == $sentence_count;
+		$sgm = 1;
+    }
+    else
+	 {
+	 	`wc -l $_` =~ /^\s*(\d+)\s+/;
+		next unless $1 == $sentence_count;
+    }
+    /^$in{CORPUS}.([^\/]+)$/;
+    my $file = $1;
+    # checkbox for compare
+    my $row = "<TR><TD><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>";
+    # README
+    if (-e "$in{CORPUS}.$file.README") {
+      my $readme = `cat $in{CORPUS}.$file.README`;
+      $readme =~ s/([\"\'])/\\\"/g;
+      $readme =~ s/[\n\r]/\\n/g;
+      $readme =~ s/\t/\\t/g;
+      $row .= "<A HREF='javascript:FieldInfo(\"$in{CORPUS}.$file\",\"$readme\")'>";
+    }
+    # filename
+    $row .= "$in{CORPUS}.$file</A>";
+    # description (hard-coded)
+    my @TRANSLATION_SENTENCE = `cat $in{CORPUS}.$file`; 
+    chop(@TRANSLATION_SENTENCE);
+    
+	 #count sentences that contain null words
+	 my $null_count = 0;
+    foreach (@TRANSLATION_SENTENCE)
+	 {
+      $null_count++ if /^NULL$/ || /^NONE$/;
+    }
+    if ($null_count > 0) { 
+      $row .= "$null_count NULL ";
+    }
+
+    $row .= " (".$FILETYPE{$file}.")" if defined($FILETYPE{$file});
+    $row .= " (".$FILEDESC{$in{CORPUS}.".".$file}.")" if defined($FILEDESC{$in{CORPUS}.".".$file});
+    $row .= " (".$FILEDESC{$file}.")" if defined($FILEDESC{$file});
+    # filedate
+    my @STAT = stat("$in{CORPUS}.$file");
+    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($STAT[8]); #STAT[8] should be last modify time
+    my $time = sprintf("%04d-%02d-%02d %02d:%02d:%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec);
+    $row .= "</TD>\n<TD>".$time."</TD>\n";
+    if (defined($in{SORT}) && $in{SORT} eq 'TIME') { $sort = $time; }
+    # IBM BLEU score
+    my $no_bleu =0;
+    if (!$sgm && -e "$in{CORPUS}.e") {
+      $row .= "<TD>";
+      if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e" && $file !~ /^pt/) {
+	my ($score,$p1,$p2,$p3,$p4,$bp) = $corpus->calcBLEU($file, 'surf');
+	$row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f", $score, $p1, $p2, $p3, $p4, $bp);
+	if (defined($in{SORT}) && $in{SORT} eq 'IBM') { $sort = $score; }
+      }
+      $row .= "</TD>\n";
+    }
+    else {
+      $no_bleu=1;
+    }
+    # NIST score
+    if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm" 
+	&& !$DONTSCORE{$file}) {    
+      $row .= "<TD>";
+      print "$DONTSCORE{$file}+";
+      my ($nist,$nist_bleu);
+      if ($file =~ /sgm$/) {
+	($nist,$nist_bleu) = &get_nist_score("$in{CORPUS}.ref.sgm","$in{CORPUS}.src.sgm","$in{CORPUS}.$file");
+	$row .= sprintf("<B>%.04f</B>",$nist);
+	if ($in{SORT} eq 'NIST') { $sort = $nist; }
+      }
+      $row .= "</TD>\n";
+      if ($no_bleu) {
+	$row .= "<TD>";
+	if ($file =~ /sgm$/) {
+	  $row .= sprintf("<B>%.04f</B>",$nist_bleu);
+	  if ($in{SORT} eq 'BLEU') { $sort = $nist_bleu; }
+	}
+	$row .= "</TD>\n";
+      }
+    }
+    # multi-bleu
+    if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e") {
+      $row .= "<TD>";
+      if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e") {
+	my ($score,$p1,$p2,$p3,$p4,$bp) = &get_multi_bleu_score("$in{CORPUS}.f","$in{CORPUS}.e","$in{CORPUS}.$file");
+	$row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f",$score,$p1,$p2,$p3,$p4,$bp);
+	if ($in{SORT} eq 'mBLEU') { $sort = $score; }
+      }
+      $row .= "</TD>\n";
+    }
+	 
+	 # misc stats
+	 $row .= "<TD align=\"center\">";
+	 if($file eq 'f') #input
+	 {
+	 	try
+		{
+			my ($unknownCount, $totalCount) = calc_unknown_words($corpus, 'surf');
+	 		$row .= sprintf("%.4lf (%d / %d)", $unknownCount / $totalCount, $unknownCount, $totalCount);
+		}
+		catch Error::Simple with {$row .= "[system error]";};
+	 }
+	 $row .= "</TD>\n<TD align=\"center\">";
+	 my ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER);
+	 if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output
+	 {
+		try
+		{
+			($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER) = calc_misc_stats($corpus, $file);
+			$row .= "WER = $nnAdjWER<br>PWER = $nnAdjPWER<br><b>diff = " . ($nnAdjWER - $nnAdjPWER) . "</b>";
+		}
+		catch Error::Simple with {$row .= "[system error]";};
+	}
+	$row .= "</TD>\n<TD align=\"center\">";
+	if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output
+	{
+		if($surfPWER == -1)
+		{
+			$row .= "[system error]";
+		}
+		else
+		{
+			my ($lemmaBLEU, $p1, $p2, $p3, $p4, $brevity) = $corpus->calcBLEU($file, 'lemma');
+			$row .= sprintf("surface = %d<br>lemma = %d<br><b>lemma BLEU = %.04f</b> %.01f/%.01f/%.01f/%.01f *%.03f", 
+									$surfPWER, $lemmaPWER, $lemmaBLEU, $p1, $p2, $p3, $p4, $brevity);
+		}
+	}
+	$row .= "</TD>\n";
+
+    # correct sentence score
+    my($correct,$wrong,$unknown);
+    $row .= "<TD>";
+    if (!defined($DONTSCORE{$file}) && (scalar keys %MEMORY)) {
+      my ($correct,$just_syn,$just_sem,$wrong,$unknown) = 
+	&get_score_from_memory("$in{CORPUS}.$FOREIGN",
+			       "$in{CORPUS}.$file");
+      $row .= "<B><FONT COLOR=GREEN>$correct</FONT></B>";
+      $row .= "/<FONT COLOR=ORANGE>$just_syn</FONT>";
+      $row .= "/<FONT COLOR=ORANGE>$just_sem</FONT>";
+      $row .= "/<FONT COLOR=RED>$wrong</FONT> ($unknown)</TD>\n";
+      if ($in{SORT} eq 'SCORE') { 
+	$sort = sprintf("%03d %04d",$correct,$just_syn+$just_sem);
+      }
+    }
+	 else
+	 {
+	 	$row .= "</TD>\n";
+	}
+
+    # score / review links
+    $row .= "<TD>";
+    $row .= "<A HREF=\"?ACTION=SCORE_FILE&VIEW=1&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">view</A>" if (!defined($DONTSCORE{$file}));
+    $row .= " <A HREF=\"?ACTION=SCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">score</A>" if (!defined($DONTSCORE{$file}) && ($unknown || scalar keys %MEMORY == 0));
+    $row .= " <A HREF=\"?ACTION=RESCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">review</A>" if (!defined($DONTSCORE{$file}) && scalar keys %MEMORY);
+    $row .= "</TD></TR>\n"; 
+    push @TABLE, "<!-- $sort -->\n$row";
+  }
+  close(DIR);
+  foreach (reverse sort @TABLE) { print $_; }
+  print "</TABLE>\n";
+  print "<INPUT TYPE=SUBMIT VALUE=\"Compare\">\n";
+  print "<INPUT TYPE=CHECKBOX NAME=SURFACE VALUE=1 CHECKED> Compare all different sentences (instead of just differently <I>evaluated</I> sentences) <INPUT TYPE=CHECKBOX NAME=WITH_EVAL VALUE=1 CHECKED> with evaluation</FORM><P>\n";
+  print "<P>The score is to be read as: <FONT COLOR=GREEN>correct</FONT>/<FONT COLOR=ORANGE>just-syn-correct</FONT>/<FONT COLOR=ORANGE>just-sem-correct</FONT>/<FONT COLOR=RED>wrong</FONT> (unscored)\n";
+  print "<BR>IBM BLEU is to be read as: <B>metric</B> unigram/bigram/trigram/quadgram *brevity-penalty\n"; 
+  print "<P><A HREF=\"newsmtgui.cgi?action=\">All corpora</A>\n";
+}
+
+###### SCORE TRANSLATIONS
+
+sub score_file {
+  if ($in{VIEW}) {
+    &htmlhead("View Translations");
+  }
+  else {
+    &htmlhead("Score Translations");
+  }
+  print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n";
+  print "<FORM ACTION=\"\" METHOD=POST>\n";
+  print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n";
+  print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
+  print "<INPUT TYPE=HIDDEN NAME=FILE VALUE=\"$in{FILE}\">\n";
+
+  # get sentences
+  my @SENTENCES;
+  if ($in{FILE} =~ /.sgm$/) {
+      @SENTENCES = `grep '<seg' $in{CORPUS}.$in{FILE}`;
+      for(my $i=0;$i<$#SENTENCES;$i++) {
+	  $SENTENCES[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/;
+      }
+  }
+  else {
+      @SENTENCES = `cat $in{CORPUS}.$in{FILE}`; chop(@SENTENCES);
+  }
+
+  my %REFERENCE;
+  foreach (@SHOW) {
+    if (-e "$in{CORPUS}.$_") {
+      @{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}});
+    }
+  }
+
+  # update memory
+  foreach (keys %in) {
+    next unless /^SYN_SCORE_(\d+)$/;
+    next unless $in{"SEM_SCORE_$1"};
+    &store_in_memory($REFERENCE{$FOREIGN}[$1],
+		     $SENTENCES[$1],
+		     "syn_".$in{"SYN_SCORE_$1"}." sem_".$in{"SEM_SCORE_$1"});
+  }
+
+  # display sentences
+  for(my $i=0;$i<=$#SENTENCES;$i++) {
+    my $evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES[$i]);
+    next if ($in{ACTION} eq 'SCORE_FILE' &&
+	     ! $in{VIEW} && 
+	     $evaluation ne '' && $evaluation ne 'wrong');
+    print "<P>Sentence ".($i+1).":<BR>\n";
+    # color coding
+    &color_highlight_ngrams($i,&nist_normalize_text($SENTENCES[$i]),$REFERENCE{"e"}[$i]);
+    if (%MULTI_REF) {
+	foreach my $sysid (keys %MULTI_REF) {
+	    print "<FONT COLOR=GREEN>".$MULTI_REF{$sysid}[$i]."</FONT> (Reference $sysid)<BR>\n";
+	}
+    }
+
+    # all sentences 
+    print "$SENTENCES[$i] (System output)<BR>\n";
+    foreach my $ref (@SHOW) {
+      if (-e "$in{CORPUS}.$ref") {
+	print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n" if $REFERENCE{$ref}[$i];
+      }
+    }
+    if (! $in{VIEW}) {
+      print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=correct";
+      print " CHECKED" if ($evaluation =~ /syn_correct/);
+      print "> perfect English\n";
+      print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=wrong";
+      print " CHECKED" if ($evaluation =~ /syn_wrong/);
+      print "> imperfect English<BR>\n";
+      print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=correct";
+      print " CHECKED" if ($evaluation =~ /sem_correct/);
+      print "> correct meaning\n";
+      print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=wrong";
+      print " CHECKED" if ($evaluation =~ /sem_wrong/);
+      print "> incorrect meaning\n";
+    }
+  }
+  if (! $in{VIEW}) {
+    print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n";
+    print "</FORM>\n";
+  }
+}
+
+sub color_highlight_ngrams {
+  my($i,$sentence,$single_reference) = @_;
+  my @REF = ();
+  my %NGRAM = ();
+  if (%MULTI_REF) {
+    foreach my $sysid (keys %MULTI_REF) {
+      push @REF,&nist_normalize_text($MULTI_REF{$sysid}[$i]);
+    }
+  }
+  elsif ($single_reference) {
+    @REF = ($single_reference);
+  }
+  if (@REF) {
+    foreach my $ref (@REF) {
+      my @WORD = split(/\s+/,$ref);
+      for(my $n=1;$n<=4;$n++) {
+	for(my $w=0;$w<=$#WORD-($n-1);$w++) {
+	  my $ngram = "$n: ";
+	  for(my $j=0;$j<$n;$j++) {
+	    $ngram .= $WORD[$w+$j]." ";
+	  }
+	  $NGRAM{$ngram}++;
+	}
+      }
+    }
+    $sentence =~ s/^\s+//;
+    $sentence =~ s/\s+/ /;
+    $sentence =~ s/\s+$//;
+    my @WORD = split(/\s+/,$sentence);
+    my @CORRECT;
+    for(my $w=0;$w<=$#WORD;$w++) {
+      $CORRECT[$w] = 0;
+    }
+    for(my $n=1;$n<=4;$n++) {
+      for(my $w=0;$w<=$#WORD-($n-1);$w++) {
+	my $ngram = "$n: ";
+	for(my $j=0;$j<$n;$j++) {
+	  $ngram .= $WORD[$w+$j]." ";
+	}
+	next unless defined($NGRAM{$ngram}) && $NGRAM{$ngram}>0;
+	$NGRAM{$ngram}--;
+	for(my $j=0;$j<$n;$j++) {
+	  $CORRECT[$w+$j] = $n;
+	}
+      }
+    }
+    my @COLOR;
+    $COLOR[0] = "#FF0000";
+    $COLOR[1] = "#C000C0";
+    $COLOR[2] = "#0000FF";
+    $COLOR[3] = "#00C0C0";
+    $COLOR[4] = "#00C000";
+    for(my $w=0;$w<=$#WORD;$w++) {
+      print "<B><FONT COLOR=".$COLOR[$CORRECT[$w]].">$WORD[$w]<SUB>".$CORRECT[$w]."</SUB></FONT></B> ";
+    }
+    print "\n<BR>";
+  }
+}
+
+###### OTHER STATS
+
+#print (in some unspecified way) the offending exception of type Error::Simple
+#arguments: the error object, a context string
+#return: none
+sub printError
+{
+	my ($err, $context) = @_;
+	warn "$context: " . $err->{'-text'} . " @ " . $err->{'-file'} . " (" .$err->{'-line'} . ")\n";
+}
+
+#compute number and percentage of unknown tokens for a given factor in foreign corpus
+#arguments: corpus object ref, factor name
+#return (unkwordCount, totalWordCount), or (-1, -1) if an error occurs
+sub calc_unknown_words
+{
+	my ($corpus, $factorName) = @_;
+	try
+	{
+		my ($unknownCount, $totalCount) = $corpus->calcUnknownTokens($factorName);
+		return ($unknownCount, $totalCount);
+	}
+	catch Error::Simple with
+	{
+		my $err = shift;
+		printError($err, 'calc_unknown_words()');
+		return (-1, -1);
+	};
+}
+
+#compute (if we have the necessary factors) info for:
+#- diff btwn wer and pwer for NNs & ADJs -- if large, many reordering errors
+#- diff btwn pwer for surface forms and pwer for lemmas -- if large, morphology errors
+#arguments: corpus object, system name
+#return (NN/ADJ (wer, pwer), surf pwer, lemma pwer), or (-1, -1, -1, -1) if an error occurs
+sub calc_misc_stats
+{
+	my ($corpus, $sysname) = @_;
+	try
+	{
+		my ($nnAdjWER, $nnAdjPWER) = $corpus->calcNounAdjWER_PWERDiff($sysname);
+		my ($surfPWER, $lemmaPWER) = ($corpus->calcOverallPWER($sysname, 'surf'), $corpus->calcOverallPWER($sysname, 'lemma'));
+		return ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER);
+	}
+	catch Error::Simple with
+	{
+		my $err = shift;
+		printError($err, 'calc_misc_stats()');
+		return (-1, -1, -1, -1);
+	};
+}
+
+###### NIST SCORE
+
+sub get_nist_score {
+  my($reference_file,$source_file,$translation_file) = @_;
+  my @STAT = stat($translation_file);
+  my $current_timestamp = $STAT[9];
+  foreach (@NIST) {
+    my ($file,$time,$nist,$bleu) = split;
+    return ($nist,$bleu) 
+      if ($file eq $translation_file && $current_timestamp == $time);
+  }
+
+  my $nist_eval = `/home/pkoehn/statmt/bin/mteval-v10.pl -c -r $reference_file -s $source_file -t $translation_file`;
+  return (0,0) unless ($nist_eval =~ /NIST score = (\d+\.\d+)  BLEU score = (\d+\.\d+)/i);
+
+  open(NIST,">>nist-memory.dat");
+  printf NIST "$translation_file $current_timestamp %f %f\n",$1,$2;
+  close(NIST);
+  return ($1,$2);
+}
+
+sub nist_normalize_text {
+    my ($norm_text) = @_;
+
+# language-independent part:
+    $norm_text =~ s/<skipped>//g; # strip "skipped" tags
+    $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
+    $norm_text =~ s/\n/ /g; # join lines
+    $norm_text =~ s/(\d)\s+(\d)/$1$2/g; #join digits
+    $norm_text =~ s/&quot;/"/g;  # convert SGML tag for quote to "
+    $norm_text =~ s/&amp;/&/g;   # convert SGML tag for ampersand to &
+    $norm_text =~ s/&lt;/</g;    # convert SGML tag for less-than to >
+    $norm_text =~ s/&gt;/>/g;    # convert SGML tag for greater-than to <
+
+# language-dependent part (assuming Western languages):
+    $norm_text = " $norm_text ";
+#    $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
+    $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g;   # tokenize punctuation
+    $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
+    $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
+    $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
+    $norm_text =~ s/\s+/ /g; # one space only between words
+    $norm_text =~ s/^\s+//;  # no leading space
+    $norm_text =~ s/\s+$//;  # no trailing space
+
+    return $norm_text;
+}
+
+###### BLEU SCORE
+
+sub get_multi_bleu_score {
+  my($foreign_file,$reference_file,$translation_file) = @_;
+  my @STAT = stat($translation_file);
+  my $current_timestamp = $STAT[9];
+  foreach (@mBLEU) {
+    my ($file,$time,$score,$g1,$g2,$g3,$g4,$bp) = split;
+    if ($file eq $translation_file && $current_timestamp == $time) {
+      return ($score,$g1*100,$g2*100,$g3*100,$g4*100,$bp);
+    }
+  }
+
+  # load reference translation from reference file
+  my @REFERENCE_SENTENCE = `cat $reference_file`; chop(@REFERENCE_SENTENCE);
+  my @TRANSLATION_SENTENCE = `cat $translation_file`; chop(@TRANSLATION_SENTENCE);
+  my %REF;
+  my @FOREIGN_SENTENCE = `cat $foreign_file`; chop(@FOREIGN_SENTENCE);
+  for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) {
+    push @{$REF{$FOREIGN_SENTENCE[$i]}},$REFERENCE_SENTENCE[$i];
+  }
+  # load reference translation from translation memory
+  foreach my $memory (keys %MEMORY) {
+    next if $MEMORY{$memory} ne 'syn_correct sem_correct';
+    my ($foreign,$english) = split(/ .o0O0o. /,$memory);
+    next unless defined($REF{$foreign});
+    push @{$REF{$foreign}},$english;
+  }
+  my(@CORRECT,@TOTAL,$length_translation,$length_reference);
+  # compute bleu
+  for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) {
+    my %REF_NGRAM = ();
+    my @WORD = split(/ /,$TRANSLATION_SENTENCE[$i]);
+    my $length_translation_this_sentence = scalar(@WORD);
+    my ($closest_diff,$closest_length) = (9999,9999);
+    foreach my $reference (@{$REF{$FOREIGN_SENTENCE[$i]}}) {
+      my @WORD = split(/ /,$reference);
+      my $length = scalar(@WORD);
+      if (abs($length_translation_this_sentence-$length) < $closest_diff) {
+	$closest_diff = abs($length_translation_this_sentence-$length);
+	$closest_length = $length;
+      }
+      for(my $n=1;$n<=4;$n++) {
+	my %REF_NGRAM_N = ();
+	for(my $start=0;$start<=$#WORD-($n-1);$start++) {
+	  my $ngram = "$n";
+	  for(my $w=0;$w<$n;$w++) {
+	    $ngram .= " ".$WORD[$start+$w];
+	  }
+	  $REF_NGRAM_N{$ngram}++;
+	}
+	foreach my $ngram (keys %REF_NGRAM_N) {
+	  if (!defined($REF_NGRAM{$ngram}) || 
+	      $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
+	    $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
+	  }
+	}
+      }
+    }
+    $length_translation += $length_translation_this_sentence;
+    $length_reference += $closest_length;
+    for(my $n=1;$n<=4;$n++) {
+      my %T_NGRAM = ();
+      for(my $start=0;$start<=$#WORD-($n-1);$start++) {
+	my $ngram = "$n";
+	for(my $w=0;$w<$n;$w++) {
+	  $ngram .= " ".$WORD[$start+$w];
+	}
+	$T_NGRAM{$ngram}++;
+      }
+      foreach my $ngram (keys %T_NGRAM) {
+	my $n = 0+$ngram;
+#	print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
+	$TOTAL[$n] += $T_NGRAM{$ngram};
+	if (defined($REF_NGRAM{$ngram})) {
+	  if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
+	    $CORRECT[$n] += $T_NGRAM{$ngram};
+#	    print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
+	  }
+	  else {
+	    $CORRECT[$n] += $REF_NGRAM{$ngram};
+#	    print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
+	  }
+	}
+      }
+    }
+  }
+  my $brevity_penalty = 1;
+  if ($length_translation<$length_reference) {
+    $brevity_penalty = exp(1-$length_reference/$length_translation);
+  }
+  my $bleu = $brevity_penalty * exp((my_log( $CORRECT[1]/$TOTAL[1] ) +
+				     my_log( $CORRECT[2]/$TOTAL[2] ) +
+				     my_log( $CORRECT[3]/$TOTAL[3] ) +
+				     my_log( $CORRECT[4]/$TOTAL[4] ) ) / 4);
+
+  open(BLEU,">>mbleu-memory.dat");
+  @STAT = stat($translation_file);
+  printf BLEU "$translation_file $STAT[9] %f %f %f %f %f %f\n",$bleu,$CORRECT[1]/$TOTAL[1],$CORRECT[2]/$TOTAL[2],$CORRECT[3]/$TOTAL[3],$CORRECT[4]/$TOTAL[4],$brevity_penalty;
+  close(BLEU);
+  
+  return ($bleu,
+	  100*$CORRECT[1]/$TOTAL[1],
+	  100*$CORRECT[2]/$TOTAL[2],
+	  100*$CORRECT[3]/$TOTAL[3],
+	  100*$CORRECT[4]/$TOTAL[4],
+	  $brevity_penalty);
+}
+
+sub my_log {
+  return -9999999999 unless $_[0];
+  return log($_[0]);
+}
+
+
+###### SCORE TRANSLATIONS
+
+################################ IN PROGRESS ###############################
+sub compare2
+{
+	&htmlhead("Compare Translations");
+	print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n";
+	print "<FORM ACTION=\"\" METHOD=POST>\n";
+	print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n";
+	print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
+	my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => \%factorIndices);
+	$corpus->writeComparisonPage(\*STDOUT, /^.*$/);
+	print "</FORM>\n";
+}
+
+sub compare {
+  &htmlhead("Compare Translations");
+  print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n";
+  print "<FORM ACTION=\"\" METHOD=POST>\n";
+  print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n";
+  print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
+
+  # get sentences
+  my %SENTENCES;
+  my $sentence_count;
+  foreach (keys %in) {
+    if (/^FILE_(.+)$/) {
+      my $file = $1;
+      print "<INPUT TYPE=HIDDEN NAME=\"$file\" VALUE=1>\n";
+      my @SENTENCES;
+      if ($file =~ /.sgm$/) {
+	  @{$SENTENCES{$file}} = `grep '<seg' $in{CORPUS}.$file`;
+	  for(my $i=0;$i<$#{$SENTENCES{$file}};$i++) {
+	      $SENTENCES{$file}[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/;
+	  }
+      }
+      else {
+	  @{$SENTENCES{$file}} = `cat $in{CORPUS}.$1`;
+	  chop(@{$SENTENCES{$file}});
+      }
+
+      $sentence_count = scalar @{$SENTENCES{$file}};
+    }
+  }
+  my %REFERENCE;
+  foreach (@SHOW) {
+    if (-e "$in{CORPUS}.$_") {
+      @{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}});
+    }
+  }
+
+  # update memory
+  foreach (keys %in) {
+    next unless /^SYN_SCORE_(.+)_(\d+)$/;
+    next unless $in{"SEM_SCORE_$1_$2"};
+    &store_in_memory($REFERENCE{$FOREIGN}[$2],
+		     $SENTENCES{$1}[$2],
+                     "syn_".$in{"SYN_SCORE_$1_$2"}." sem_".$in{"SEM_SCORE_$1_$2"});
+  }
+
+  # display sentences
+  for(my $i=0;$i<$sentence_count;$i++)
+  {
+    my $evaluation = "";
+    my $show = 0;
+    my $surface = "";
+    foreach my $file (keys %SENTENCES)
+	 {
+      if ($in{SURFACE}) {
+	$SENTENCES{$file}[$i] =~ s/ *$//;
+	$surface = $SENTENCES{$file}[$i] if ($surface eq '');
+	$show = 1 if ($SENTENCES{$file}[$i] ne $surface);
+      }
+      else {
+	my $this_ev = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES{$file}[$i]);
+	$this_ev = "syn_wrong sem_wrong" unless $this_ev;
+	$evaluation = $this_ev if ($evaluation eq '');
+	$show = 1 if ($evaluation ne $this_ev);
+      }
+    }
+    next unless $show;
+    print "<HR>Sentence ".($i+1).":<BR>\n";
+    foreach my $ref (@SHOW) {
+      if (-e "$in{CORPUS}.$ref") {
+	print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n";
+      }
+    }
+    foreach my $file (keys %SENTENCES) {
+      print "<B>$SENTENCES{$file}[$i]</B> ($file)<BR>\n";
+      &color_highlight_ngrams($i,&nist_normalize_text($SENTENCES{$file}[$i]),$REFERENCE{"e"}[$i]);
+      if (0 && $in{WITH_EVAL}) {
+	$evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES{$file}[$i]);
+	print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$file"."_$i VALUE=correct";
+	print " CHECKED" if ($evaluation =~ /syn_correct/);
+	print "> perfect English\n";
+	print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$file"."_$i VALUE=wrong";
+	print " CHECKED" if ($evaluation =~ /syn_wrong/);
+	print "> imperfect English<BR>\n";
+	print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$file"."_$i VALUE=correct";
+	print " CHECKED" if ($evaluation =~ /sem_correct/);
+	print "> correct meaning\n";
+	print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$file"."_$i VALUE=wrong";
+	print " CHECKED" if ($evaluation =~ /sem_wrong/);
+	print "> incorrect meaning<BR>\n";
+      }
+    }
+  }
+  print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n";
+  print "</FORM>\n";
+}
+
+###### MEMORY SUBS
+
+sub load_memory {
+  open(MEMORY,"evaluation-memory.dat") or return;
+  while(<MEMORY>) {
+    chop;
+    my($foreign,$translation,$evaluation) = split(/ \.o0O0o\. /);
+    $evaluation = 'syn_correct sem_correct' if ($evaluation eq 'correct');
+    $MEMORY{"$foreign .o0O0o. $translation"} = $evaluation;
+  }
+  close(MEMORY);
+}
+
+sub get_score_from_memory {
+  my($foreign_file,$translation_file) = @_;
+  my $unknown=0;
+  my $correct=0;
+  my $just_syn=0;
+  my $just_sem=0;
+  my $wrong=0;
+  my @FOREIGN = `cat $foreign_file`; chop(@FOREIGN);
+  my @TRANSLATION = `cat $translation_file`; chop(@TRANSLATION);
+  for(my $i=0;$i<=$#FOREIGN;$i++) {
+    if (my $evaluation = &get_from_memory($FOREIGN[$i],$TRANSLATION[$i])) {
+      if ($evaluation eq 'syn_correct sem_correct') { $correct++ }
+      elsif ($evaluation eq 'syn_correct sem_wrong') { $just_syn++ }
+      elsif ($evaluation eq 'syn_wrong sem_correct') { $just_sem++ }
+      elsif ($evaluation eq 'syn_wrong sem_wrong') { $wrong++ }
+      else { $unknown++; }
+    }
+    else { $unknown++; }
+  }
+  return($correct,$just_syn,$just_sem,$wrong,$unknown);
+}
+
+sub store_in_memory {
+  my($foreign,$translation,$evaluation) = @_;
+  &trim(\$translation);
+  return if $MEMORY{"$foreign .o0O0o. $translation"} eq $evaluation;
+  $MEMORY{"$foreign .o0O0o. $translation"} = $evaluation;
+  open(MEMORY,">>evaluation-memory.dat") or die "store_in_memory(): couldn't open 'evaluation-memory.dat' for append\n";
+  print MEMORY "$foreign .o0O0o. $translation .o0O0o. $evaluation\n";
+  close(MEMORY);
+}
+
+sub get_from_memory {
+  my($foreign,$translation) = @_;
+  &trim(\$translation);
+  return $MEMORY{"$foreign .o0O0o. $translation"};
+}
+
+sub trim {
+  my($translation) = @_;
+  $$translation =~ s/ +/ /g;
+  $$translation =~ s/^ +//;
+  $$translation =~ s/ +$//;
+#  $$translation =~ s/ +[\.]$//;
+}
+
+sub load_descriptions {
+  open(FD,"file-descriptions") or die "load_descriptions(): couldn't open 'file-descriptions' for read\n";
+  while(<FD>) {
+    my($file,$description) = split(/\s+/,$_,2);
+    $FILEDESC{$file} = $description;
+  }
+  close(FD);
+}
+
+#read config file giving names of corpi and fill factor-index map
+#arguments: filename to read
+#return: hash of corpus names to hashrefs of factor names to indices
+sub loadFactorIndices
+{
+	my $filename = shift;
+	my %data = ();
+	open(INFILE, "<$filename") or die "loadFactorIndices(): couldn't open '$filename' for read\n";
+	while(my $line = <INFILE>)
+	{
+		if($line =~ /^\#/) {next;} #skip comment lines
+		my @tokens = split(/\s+/, $line);
+		my $corpusName = shift @tokens;
+		$data{$corpusName} = {};
+		for(my $i = 0; $i < scalar(@tokens); $i++)
+		{
+			$data{$corpusName}->{$tokens[$i]} = $i;
+		}
+	}
+	close(INFILE);
+	return %data;
+}
+
+###### SUBS
+
+sub htmlhead {
+  print <<"___ENDHTML";
+Content-type: text/html
+
+<HTML><HEAD>
+<TITLE>MTEval: $_[0]</TITLE>
+<SCRIPT LANGUAGE="JavaScript">
+
+<!-- hide from old browsers
+
+function FieldInfo(field,description) {
+  popup = window.open("","popDialog","height=500,width=600,scrollbars=yes,resizable=yes");
+  popup.document.write("<HTML><HEAD><TITLE>"+field+"</TITLE></HEAD><BODY BGCOLOR=#FFFFCC><CENTER><B>"+field+"</B><HR SIZE=2 NOSHADE></CENTER><PRE>"+description+"</PRE><CENTER><FORM><INPUT TYPE='BUTTON' VALUE='Okay' onClick='self.close()'></FORM><CENTER></BODY></HTML>");
+  popup.focus();
+  popup.document.close();
+}
+
+<!-- done hiding -->
+
+</SCRIPT>
+</HEAD>
+<BODY BGCOLOR=white>
+<H2>Evaluation Tool for Machine Translation<BR>$_[0]</H2>
+___ENDHTML
+}
+
+
+############################# parts of cgi-lib.pl
+
+
+sub ReadParse {
+  my ($i, $key, $val);
+
+  # Read in text
+  my $in;
+  if (&MethGet) {
+    $in = $ENV{'QUERY_STRING'};
+  } elsif (&MethPost) {
+    read(STDIN,$in,$ENV{'CONTENT_LENGTH'});
+  }
+
+  my @in = split(/[&;]/,$in);
+
+  foreach $i (0 .. $#in) {
+    # Convert plus's to spaces
+    $in[$i] =~ s/\+/ /g;
+
+    # Split into key and value.
+    ($key, $val) = split(/=/,$in[$i],2); # splits on the first =.
+
+    # Convert %XX from hex numbers to alphanumeric
+    $key =~ s/%(..)/pack("c",hex($1))/ge;
+    $val =~ s/%(..)/pack("c",hex($1))/ge;
+
+    # Associate key and value
+    $in{$key} .= "\0" if (defined($in{$key})); # \0 is the multiple separator
+    $in{$key} .= $val;
+
+  }
+
+  return scalar(@in);
+}
+
+sub MethGet {
+  return ($ENV{'REQUEST_METHOD'} eq "GET");
+}
+
+sub MethPost {
+  return ($ENV{'REQUEST_METHOD'} eq "POST");
+}
author	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-01 02:05:11 +0400
committer	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-01 02:05:11 +0400
commit	5cce8336c076371dfa30c5cd37c29bebb56a558a (patch)
tree	b13af443c55c50398385b21fecb4b3e5f17c8eb1 /scripts/analysis
parent	dfe989fbd22f76045301693edfb84a36371dc5d9 (diff)