diff options
Diffstat (limited to 'scripts/analysis/smtgui')
-rw-r--r-- | scripts/analysis/smtgui/Corpus.pm | 867 | ||||
-rw-r--r-- | scripts/analysis/smtgui/Error.pm | 744 | ||||
-rw-r--r-- | scripts/analysis/smtgui/README | 31 | ||||
-rw-r--r-- | scripts/analysis/smtgui/file-descriptions | 1 | ||||
-rw-r--r-- | scripts/analysis/smtgui/file-factors | 6 | ||||
-rw-r--r-- | scripts/analysis/smtgui/filter-phrase-table.pl | 83 | ||||
-rwxr-xr-x | scripts/analysis/smtgui/newsmtgui.cgi | 935 |
7 files changed, 0 insertions, 2667 deletions
diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm deleted file mode 100644 index f191ce273..000000000 --- a/scripts/analysis/smtgui/Corpus.pm +++ /dev/null @@ -1,867 +0,0 @@ -#package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole -#Evan Herbst, 7 / 25 / 06 - -package Corpus; -BEGIN -{ - push @INC, "."; -} -use Error; - -return 1; - -########################################################################################################################### - -#'our' variables are available outside the package -our @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph'); - -#constructor -#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), hashref of factor names to indices in this corpus (-indices) -sub new -{ - my $class = shift; - my %args = @_; #turn the remainder of @_ into a hash - my ($corpusName, $refFileDescs, $factorIndices) = ($args{'-name'}, $args{'-descriptions'}, $args{'-indices'}); - my $self = {}; - $self->{'corpusName'} = $corpusName; - $self->{'truth'} = []; #arrayref of arrayrefs of factors - $self->{'input'} = []; #same; also same for any system outputs that get loaded - $self->{'truthFilename'} = ""; - $self->{'inputFilename'} = ""; - $self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename) - $self->{'phraseTableFilenames'} = {}; #factor name (from @FACTORNAMES) => filename - $self->{'factorIndices'} = {}; #factor name => index; names ought to be strings from the standard list in @FACTORNAMES - %{$self->{'factorIndices'}} = %$factorIndices; - $self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting - $self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input - $self->{'totalTokens'} = 0; #useful with counts of unknown tokens - $self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth) - $self->{'sysoutPWER'} = {}; #similarly - $self->{'fileDescriptions'} = {}; #filename associated with us => string description of file - $self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) ) - $self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here - bless $self, $class; - $self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions - $self->loadCacheFile(); - return $self; -} - -#arguments: filename -#return: description string -#throw if filename doesn't belong to this corpus -sub getFileDescription -{ - my ($self, $filename) = @_; - if(!defined($self->{'fileDescriptions'}->{$filename})) - { - throw Error::Simple(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n"); - } - return $self->{'fileDescriptions'}->{$filename}; -} - -#arguments: hashref of factor names to indices (not all factors in @FACTORNAMES must be included, but no strings not from the list should be) -#return: none -sub setFactorIndices -{ - my ($self, $refIndices) = @_; - %{$self->{'factorIndices'}} = %{$refIndices}; -} - -#calculate the number of unknown factor values for the given factor in the input file -#arguments: factor name -#return: unknown factor count, total factor count (note the total doesn't depend on the factor) -#throw if we don't have an input file or a phrase table for the given factor defined or if there's no index known for the given factor -sub calcUnknownTokens -{ - my ($self, $factorName) = @_; - #check in-memory cache first - if(defined($self->{'unknownCount'}->{$factorName})) - { - return ($self->{'unknownCount'}->{$factorName}, $self->{'totalTokens'}); - } - - $self->ensureFilenameDefined('input'); - $self->ensurePhraseTableDefined($factorName); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('input', $self->{'inputFilename'}); - $self->loadPhraseTable($factorName); - - #count unknown and total words - my ($unknownTokens, $totalTokens) = (0, 0); - my $factorIndex = $self->{'factorIndices'}->{$factorName}; - foreach my $sentence (@{$self->{'input'}}) - { - $totalTokens += scalar(@$sentence); - foreach my $word (@$sentence) - { - if(!defined($self->{'phraseTables'}->{$factorName}->{$word->[$factorIndex]})) - { - $unknownTokens++; - } - } - } - $self->{'unknownCount'}->{$factorName} = $unknownTokens; - $self->{'totalTokens'} = $totalTokens; - - return ($unknownTokens, $totalTokens); -} - -#arguments: system name -#return: (WER, PWER) for nouns and adjectives in given system wrt truth -#throw if given system or truth is not set or if index of 'surf' or 'pos' hasn't been specified -sub calcNounAdjWER_PWERDiff -{ - my ($self, $sysname) = @_; - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined('surf'); - $self->ensureFactorPosDefined('pos'); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - #find nouns and adjectives and score them - my ($werScore, $pwerScore) = (0, 0); - my $nnNadjTags = $self->getPOSTagList('nounAndAdj'); - for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++) - { -# warn "truth " . join(', ', map {join('|', @$_)} @{$self->{'truth'}->[$i]}) . "\n"; -# warn "sysout " . join(', ', map {join('|', @$_)} @{$self->{$sysname}->[$i]}) . "\n"; - my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags); - my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags); -# warn "filtered truth: " . join(' ', map {join('|', @$_)} @nnAdjEWords) . "\n"; -# warn "filtered sysout: " . join(' ', map {join('|', @$_)} @nnAdjSWords) . "\n\n\n"; - my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'}); - $werScore += $sentWer; - ($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'}); - $pwerScore += $sentWer; - } - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return ($werScore, $pwerScore); -} - -#calculate detailed WER statistics and put them into $self -#arguments: system name, factor name to consider (default 'surf', surface form) -#return: overall surface WER for given system (w/o filtering) -#throw if given system or truth is not set or if index of factor name hasn't been specified -sub calcOverallWER -{ - my ($self, $sysname, $factorName) = (shift, shift, 'surf'); - if(scalar(@_) > 0) {$factorName = shift;} - #check in-memory cache first - if(defined($self->{'sysoutWER'}->{$sysname}->{$factorName})) - { - return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0]; - } - - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - - my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName}); - $self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0]; -} - -#calculate detailed PWER statistics and put them into $self -#arguments: system name, factor name to consider (default 'surf') -#return: overall surface PWER for given system (w/o filtering) -#throw if given system or truth is not set or if index of factor name hasn't been specified -sub calcOverallPWER -{ - my ($self, $sysname, $factorName) = (shift, shift, 'surf'); - if(scalar(@_) > 0) {$factorName = shift;} - #check in-memory cache first - if(defined($self->{'sysoutPWER'}->{$sysname}->{$factorName})) - { - return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0]; - } - - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - - my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName}); - $self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0]; -} - -#arguments: system name, factor name to consider (default 'surf') -#return: BLEU score, n-gram precisions, brevity penalty -sub calcBLEU -{ - my ($self, $sysname, $factorName) = (shift, shift, 'surf'); - if(scalar(@_) > 0) {$factorName = shift;} - #check in-memory cache first - if(defined($self->{'bleuScores'}->{$sysname}->{$factorName})) - { - return $self->{'bleuScores'}->{$sysname}->{$factorName}; - } - - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - - #score structure: various total scores, arrayref of by-sentence score arrays - if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};} - if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];} - - my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - my $factorIndex = $self->{'factorIndices'}->{$factorName}; - for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++) - { - my ($truthSentence, $sysoutSentence) = ($self->{'truth'}->[$i], $self->{$sysname}->[$i]); - my ($unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength) = - $self->sentenceBLEU($truthSentence, $sysoutSentence, $factorIndex, 0); #last argument is whether to debug-print - push @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}, [$unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength]; - $good1 += $unigood; $tot1 += $unicount; - $good2 += $bigood; $tot2 += $bicount; - $good3 += $trigood; $tot3 += $tricount; - $good4 += $quadrugood; $tot4 += $quadrucount; - $totCLength += $cLength; - $totRLength += $rLength; - } - my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength); - my ($pct1, $pct2, $pct3, $pct4) = ($good1 / $tot1, $good2 / $tot2, $good3 / $tot3, $good4 / $tot4); -# warn sprintf("n-gram prec: %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf\n", -# $good1, $tot1, $pct1, $good2, $tot2, $pct2, $good3, $tot3, $pct3, $good4, $tot4, $pct4); - my $bleu = $brevity * exp((my_log($pct1) + my_log($pct2) + my_log($pct3) + my_log($pct4)) / 4); -# warn sprintf("brevity: %.3lf (%d ref, %d sys)\n", $brevity, $totRLength, $totCLength); - sleep 8; - $self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity]; - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]}; -} - -#write HTML to be displayed to compare the various versions we have of each sentence in the corpus; -#allow filtering of which versions will be displayed -#(we don't write the whole page, just the contents of the body) -#arguments: filehandleref to which to write, regex to filter filename extensions to be included -#return: none -sub writeComparisonPage -{ - my ($self, $fh, $filter) = @_; - my @filteredExtensions = grep($filter, ('e', 'f', keys %{$self->{'sysoutFilenames'}})); - my %openedFiles = $self->openFiles(@filteredExtensions); - my $id = 1; #sentence ID string - while(my %lines = $self->readLineFromFiles(%openedFiles)) - { - $self->printSingleSentenceComparison($fh, $id, %lines); - $id++; - } - $self->closeFiles(%openedFiles); -} - -########################################################################################################## -##### INTERNAL ################################################################################### -########################################################################################################## - -#destructor! -#arguments: none -#return: none -sub DESTROY -{ - my $self = shift; - $self->writeCacheFile(); -} - -#write all scores in memory to disk -#arguments: none -#return: none -sub writeCacheFile -{ - my $self = shift; - if(!open(CACHEFILE, ">" . $self->{'cacheFilename'})) - { - warn "Corpus::writeCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for write\n"; - return; - } - - #store file changetimes to disk - print CACHEFILE "File changetimes\n"; - #store bleu scores to disk - print CACHEFILE "\nBLEU scores\n"; - foreach my $sysname (keys %{$self->{'bleuScores'}}) - { - foreach my $factorName (keys %{$self->{'bleuScores'}->{$sysname}}) - { - print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]}); - foreach my $sentenceBLEU (@{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}) - { - print CACHEFILE "; " . join(' ', @$sentenceBLEU); - } - print CACHEFILE "\n"; - } - } - #store unknown-token counts to disk - print CACHEFILE "\nUnknown-token counts\n"; - foreach my $factorName (keys %{$self->{'unknownCount'}}) - { - print CACHEFILE $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'totalTokens'} . "\n"; - } - #store WER, PWER to disk - print CACHEFILE "\nWER scores\n"; - my $printWERFunc = - sub - { - my $werType = shift; - foreach my $sysname (keys %{$self->{$werType}}) - { - foreach my $factorName (keys %{$self->{$werType}->{$sysname}}) - { - my ($totalWER, $sentenceWERs, $errorWords) = @{$self->{$werType}->{$sysname}->{$factorName}}; - print CACHEFILE "$werType $sysname $factorName $totalWER " . join(' ', @$sentenceWERs); - foreach my $indices (@$errorWords) - { - print CACHEFILE ";" . join(' ', @$indices); - } - print CACHEFILE "\n"; - } - } - }; - &$printWERFunc('sysoutWER'); - &$printWERFunc('sysoutPWER'); - #store misc scores to disk - print CACHEFILE "\nMisc scores\n"; - close(CACHEFILE); -} - -#load all scores present in the cache file into the appropriate fields of $self -#arguments: none -#return: none -sub loadCacheFile -{ - my $self = shift; - if(!open(CACHEFILE, "<" . $self->{'cacheFilename'})) - { - warn "Corpus::loadCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for read\n"; - return; - } - my $mode = 'none'; - while(my $line = <CACHEFILE>) - { - next if $line =~ /^[ \t\n\r\x0a]*$/; #anyone know why char 10 (0x0a) shows up on empty lines, at least on solaris? - chop $line; - #check for start of section - if($line eq "File changetimes\n") {$mode = 'ctime';} - elsif($line eq "BLEU scores\n") {$mode = 'bleu';} - elsif($line eq "Unknown-token counts\n") {$mode = 'unk';} - elsif($line eq "WER scores") {$mode = 'wer';} - elsif($line eq "Misc scores") {$mode = 'misc';} - #get data when in a mode already - elsif($mode eq 'ctime') - { - } - elsif($mode eq 'bleu') - { - local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3); - if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};} - if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];} - my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest); - $self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = shift @stats; - $self->{'bleuScores'}->{$sysname}->{$factorName}->[1] = \@stats; - } - elsif($mode eq 'unk') - { - local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line); - if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename) - { - $self->{'unknownCount'}->{$factorName} = $unknownCount; - $self->{'totalTokens'} = $totalCount; - } - } - elsif($mode eq 'wer') - { - local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER' - $details =~ /^([^;]*);(.*)/; - my @sentenceWERs = split(/\s+/, $1); - if(!defined($self->{$werType}->{$sysname})) {$self->{$werType}->{$sysname} = {};} - $self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, []]; - my @indexLists = split(/;/, $2); - sleep 6; - for(my $i = 0; $i < scalar(@sentenceWERs); $i++) - { - my @indices = split(/\s+/, $indexLists[$i]); - $self->{$werType}->{$sysname}->{$factorName}->[2] = \@indices; - } - } - elsif($mode eq 'misc') - { - } - } - close(CACHEFILE); -} - -##### utils ##### -#arguments: a, b (scalars) -sub min -{ - my ($a, $b) = @_; - return ($a < $b) ? $a : $b; -} -#arguments: a, b (scalars) -sub max -{ - my ($a, $b) = @_; - return ($a > $b) ? $a : $b; -} -#arguments: x -sub my_log -{ - return -9999999999 unless $_[0]; - return log($_[0]); -} - -#throw if the given factor doesn't have an index defined -#arguments: factor name -#return: none -sub ensureFactorPosDefined -{ - my ($self, $factorName) = @_; - if(!defined($self->{'factorIndices'}->{$factorName})) - { - throw Error::Simple(-text => "Corpus: no index known for factor '$factorName'\n"); - } -} - -#throw if the filename field corresponding to the argument hasn't been defined -#arguments: 'truth' | 'input' | a system name -#return: none -sub ensureFilenameDefined -{ - my ($self, $sysname) = @_; - if($sysname eq 'truth' || $sysname eq 'input') - { - if(!defined($self->{"${sysname}Filename"})) - { - throw Error::Simple(-text => "Corpus: no $sysname corpus defined\n"); - } - } - else - { - if(!defined($self->{'sysoutFilenames'}->{$sysname})) - { - throw Error::Simple(-text => "Corpus: no system $sysname defined\n"); - } - } -} - -#throw if there isn't a defined phrase-table filename for the given factor -#arguments: factor name -#return: none -sub ensurePhraseTableDefined -{ - my ($self, $factorName) = @_; - if(!defined($self->{'phraseTableFilenames'}->{$factorName})) - { - throw Error::Simple(-text => "Corpus: no phrase table defined for factor '$factorName'\n"); - } -} - -#search current directory for files with our corpus name as basename and set filename fields of $self -#arguments: hashref of filenames to descriptions -#return: none -sub locateFiles -{ - my ($self, $refDescs) = @_; - open(DIR, "ls -x1 . |") or die "Corpus::locateFiles(): couldn't list current directory\n"; - my $corpusName = $self->{'corpusName'}; - while(my $filename = <DIR>) - { - chop $filename; #remove \n - if($filename =~ /^$corpusName\.(.*)$/) - { - my $ext = $1; - if($ext eq 'e') {$self->{'truthFilename'} = $filename;} - elsif($ext eq 'f') {$self->{'inputFilename'} = $filename;} - elsif($ext =~ /pt_(.*)/) {$self->{'phraseTableFilenames'}->{$1} = $filename;} - else {$self->{'sysoutFilenames'}->{$ext} = $filename;} - if(defined($refDescs->{$filename})) - { - $self->{'fileDescriptions'}->{$filename} = $refDescs->{$filename}; - } - } - } - close(DIR); -# warn "truth is set to '" . $self->{'truthFilename'} . "'\n"; -# warn "input is set to '" . $self->{'inputFilename'} . "'\n"; -# warn "sysouts set are '" . join("', '", keys %{$self->{'sysoutFilenames'}}) . "'\n"; -# warn "phrase tables set are '" . join("', '", keys %{$self->{'phraseTableFilenames'}}) . "'\n"; -} - -#arguments: type ('truth' | 'input' | a string to represent a system output), filename -#pre: filename exists -#return: none -sub loadSentences -{ - my ($self, $sysname, $filename) = @_; - $self->{$sysname} = []; - open(INFILE, "<$filename") or die "Corpus::load(): couldn't open '$filename' for read\n"; - while(my $line = <INFILE>) - { - my @words = split(/\s+/, $line); - my $refFactors = []; - foreach my $word (@words) - { - my @factors = split(/\|/, $word); - push @$refFactors, \@factors; - } - push @{$self->{$sysname}}, $refFactors; - } - close(INFILE); -} - -#free the memory used for the given corpus (but NOT any associated calculations, eg WER) -#arguments: type ('truth' | 'input' | a string to represent a system output) -#return: none -sub releaseSentences -{ - my ($self, $sysname) = @_; - $self->{$sysname} = []; -} - -#arguments: factor name -#return: none -#throw if we don't have a filename for the given phrase table -sub loadPhraseTable -{ - my ($self, $factorName) = @_; - $self->ensurePhraseTableDefined($factorName); - - my $filename = $self->{'phraseTableFilenames'}->{$factorName}; - open(PTABLE, "<$filename") or die "couldn't open '$filename' for read\n"; - $self->{'phraseTables'}->{$factorName} = {}; #create ref to phrase table (hash of strings, for source phrases, to anything whatsoever) - #assume the table is sorted so that duplicate source phrases will be consecutive - while(my $line = <PTABLE>) - { - my @phrases = split(/\s*\|\|\|\s*/, $line, 2); - $self->{'phraseTables'}->{$factorName}->{$phrases[0]} = 0; #just so that it's set to something - } - close(PTABLE); -} - -#arguments: factor name -#return: none -sub releasePhraseTable -{ - my ($self, $factorName) = @_; - $self->{'phraseTables'}->{$factorName} = {}; -} - -#arguments: name of list ('nounAndAdj' | ...) -#return: arrayref of strings (postags) -sub getPOSTagList -{ - my ($self, $listname) = @_; - ##### assume PTB tagset ##### - if($listname eq 'nounAndAdj') {return ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'];} -# if($listname eq '') {return [];} -} - -#arguments: list to be filtered (arrayref of arrayrefs of factor strings), desired factor index, arrayref of allowable values -#return: filtered list as array of arrayrefs of factor strings -sub filterFactors -{ - my ($self, $refFullList, $index, $refFactorValues) = @_; - my $valuesRegex = join("|", @$refFactorValues); - my @filteredList = (); - foreach my $factors (@$refFullList) - { - if($factors->[$index] =~ m/$valuesRegex/) - { - push @filteredList, $factors; - } - } - return @filteredList; -} - -#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words -sub corpusWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $sentenceWER, $errIndices) = (0, [], []); - for(my $i = 0; $i < scalar(@$refSysOutput); $i++) - { - my ($sentWER, $indices) = $self->sentenceWER($refSysOutput->[$i], $refTruth->[$i], $index); - $totWER += $sentWER; - push @$sentenceWER, $sentWER; - push @$errIndices, $indices; - } - return ($totWER, $sentenceWER, $errIndices); -} - -#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of arrayrefs of indices of errorful words -sub sentenceWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $indices) = (0, []); - my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); - for(my $j = 0; $j < min($sLength, $eLength); $j++) - { - if(lc $refSysOutput->[$j]->[$index] ne lc $refTruth->[$j]->[$index]) #check output word against truth word in same position - { - $totWER++; - push @$indices, $j; - } - } - $totWER += max(0, $sLength - $eLength); - return ($totWER, $indices); -} - -#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words -sub corpusPWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $sentenceWER, $errIndices) = (0, [], []); - for(my $i = 0; $i < scalar(@$refSysOutput); $i++) - { - my ($sentWER, $indices) = $self->sentencePWER($refSysOutput->[$i], $refTruth->[$i], $index); - $totWER += $sentWER; - push @$sentenceWER, $sentWER; - push @$errIndices, $indices; - } - return ($totWER, $sentenceWER, $errIndices); -} - -#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of arrayrefs of indices of errorful words -sub sentencePWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $indices) = (0, []); - my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); - my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once - for(my $j = 0; $j < $sLength; $j++) - { - my $found = 0; - for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence - { - if(lc $refSysOutput->[$j]->[$index] eq lc $refTruth->[$k]->[$index] && $truthWordUsed[$k] == 0) - { - $truthWordUsed[$k] = 1; - $found = 1; - } - } - if($found == 0) - { - $totWER++; - push @$indices, $j; - } - } - return ($totWER, $indices); -} - -#BLEU calculation for a single sentence -#arguments: truth sentence (arrayref of arrayrefs of factor strings), sysout sentence (same), factor index to use -#return: 1- through 4-gram matching and total counts (1-g match, 1-g tot, 2-g match...), candidate length, reference length -sub sentenceBLEU -{ - my ($self, $refTruth, $refSysOutput, $factorIndex, $debug) = @_; - my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput)); - my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0); - my %REF_GRAM = (); - my ($i, $gram); - for($i = 0; $i < $length_reference; $i++) - { - $gram = $refTruth->[$i]->[$factorIndex]; - $REF_GRAM{$gram}++; - next if $i<1; - $gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<2; - $gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<3; - $gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - } - for($i = 0; $i < $length_translation; $i++) - { - $gram = $refSysOutput->[$i]->[$factorIndex]; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct1 += 1; - } - next if $i<1; - $gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct2 += 1; - } - next if $i<2; - $gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct3 += 1; - } - next if $i<3; - $gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct4 += 1; - } - } - if($debug != 0) {warn "\n";} - my $total = $length_translation; - $total1 = max(1, $total); - $total2 = max(1, $total - 1); - $total3 = max(1, $total - 2); - $total4 = max(1, $total - 3); - if($debug != 0) - { - warn "BLEU($factorIndex)\n"; - warn "truth: " . join(' ', map {join('|', @$_)} @{$refTruth}) . "\n"; - warn "sysop: " . join(' ', map {join('|', @$_)} @{$refSysOutput}) . "\n"; - warn "stats: $correct1 / $total1, $correct2 / $total2, $correct3 / $total3, $correct4 / $total4\n"; - sleep 8; - } - - return ($correct1, $total1, $correct2, $total2, $correct3, $total3, $correct4, $total4, $length_translation, $length_reference); -} - -##### filesystem ##### - -#open as many given files as possible; only warn about the rest -#arguments: list of filename extensions to open (assume corpus name is file title) -#return: hash from type string to filehandleref, giving all files that were successfully opened -sub openFiles -{ - my ($self, @extensions) = @_; - my %openedFiles = (); - foreach my $ext (@extensions) - { - if(!open(FILE, "<" . $self->{'corpusName'} . $ext)) - { - warn "Corpus::openFiles(): couldn't open '" . $self->{'corpusName'} . $ext . "' for read\n"; - } - else #success - { - $openedFiles{$ext} = \*FILE; - } - } - return %openedFiles; -} - -#read one line from each given file -#arguments: hash from type string to filehandleref -#return: hash from type string to sentence (stored as arrayref of arrayrefs of factors) read from corresponding file -sub readLineFromFiles -{ - my ($self, %openedFiles) = @_; - my %lines; - foreach my $type (keys %openedFiles) - { - $lines{$type} = []; - my $sentence = <$openedFiles{$type}>; - my @words = split(/\s+/, $sentence); - foreach my $word (@words) - { - my @factors = split(/\|/, $word); - push @{$lines{$type}}, \@factors; - } - } - return %lines; -} - -#close all given files -#arguments: hash from type string to filehandleref -#return: none -sub closeFiles -{ - my ($self, %openedFiles) = @_; - foreach my $type (keys %openedFiles) - { - close($openedFiles{$type}); - } -} - -##### write HTML ##### - -#print HTML for comparing various versions of a sentence, with special processing for each version as appropriate -#arguments: filehandleref to which to write, sentence ID string, hashref of version string to sentence (stored as arrayref of arrayref of factor strings) -#return: none -sub printSingleSentenceComparison -{ - my ($self, $fh, $sentID, $sentences) = @_; - my $curFH = select; - select $fh; - #javascript to reorder rows to look nice afterward - print "<script type=\"text/javascript\"> - function reorder_$sentID() - {/* - var table = document.getElementById('div_$sentID').firstChild; - var refTransRow = table.getElementById('row_e'); - var inputRow = table.getElementById('row_f'); - table.removeRow(refTransRow); - table.removeRow(inputRow); - var newRow1 = table.insertRow(0); - var newRow2 = table.insertRow(1); - newRow1.childNodes = inputRow.childNodes; - newRow2.childNodes = refTransRow.childNodes;*/ - } - </script>"; - #html for sentences - print "<div id=\"div_$sentID\" style=\"padding: 3px; margin: 5px\">"; - print "<table border=\"1\">"; -# my $rowCount = 0; -# my @bgColors = ("#ffefbf", "#ffdf7f"); - #process all rows in order - foreach my $sentType (keys %$sentences) - { - my $bgcolor = $bgColors[$rowCount % 2]; - print "<tr id=\"row_$sentType\"><td align=right>"; - #description of sentence - if(defined($self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType})) - { - print "(" . $self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType} . ")"; - } - else - { - print "($sentType)"; - } - print "</td><td align=left>"; - #sentence with markup - if($sentType eq 'f') #input - { -# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $inputColor); - } - elsif($sentType eq 'e') #reference translation - { -# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $reftransColor); - } - else #system output - { -# $self->writeHTMLTranslationHighlightedWithFactors($fh, $sentences->{$sentType}, $sentences->{'e'}, $highlightColors); - } - print "</td></tr>"; -# $rowCount++; - } - print "</table>"; - print "</div>\n"; - select $curFH; -} diff --git a/scripts/analysis/smtgui/Error.pm b/scripts/analysis/smtgui/Error.pm deleted file mode 100644 index cc9edbb69..000000000 --- a/scripts/analysis/smtgui/Error.pm +++ /dev/null @@ -1,744 +0,0 @@ -# Error.pm -# -# Copyright (c) 1997-8 Graham Barr <gbarr@ti.com>. All rights reserved. -# This program is free software; you can redistribute it and/or -# modify it under the same terms as Perl itself. -# -# Based on my original Error.pm, and Exceptions.pm by Peter Seibel -# <peter@weblogic.com> and adapted by Jesse Glick <jglick@sig.bsh.com>. -# -# but modified ***significantly*** - -package Error; - -use strict; -use vars qw($VERSION); -use 5.004; - -$VERSION = "0.15"; - -use overload ( - '""' => 'stringify', - '0+' => 'value', - 'bool' => sub { return 1; }, - 'fallback' => 1 -); - -$Error::Depth = 0; # Depth to pass to caller() -$Error::Debug = 0; # Generate verbose stack traces -@Error::STACK = (); # Clause stack for try -$Error::THROWN = undef; # last error thrown, a workaround until die $ref works - -my $LAST; # Last error created -my %ERROR; # Last error associated with package - -# Exported subs are defined in Error::subs - -sub import { - shift; - local $Exporter::ExportLevel = $Exporter::ExportLevel + 1; - Error::subs->import(@_); -} - -# I really want to use last for the name of this method, but it is a keyword -# which prevent the syntax last Error - -sub prior { - shift; # ignore - - return $LAST unless @_; - - my $pkg = shift; - return exists $ERROR{$pkg} ? $ERROR{$pkg} : undef - unless ref($pkg); - - my $obj = $pkg; - my $err = undef; - if($obj->isa('HASH')) { - $err = $obj->{'__Error__'} - if exists $obj->{'__Error__'}; - } - elsif($obj->isa('GLOB')) { - $err = ${*$obj}{'__Error__'} - if exists ${*$obj}{'__Error__'}; - } - - $err; -} - -# Return as much information as possible about where the error -# happened. The -stacktrace element only exists if $Error::DEBUG -# was set when the error was created - -sub stacktrace { - my $self = shift; - - return $self->{'-stacktrace'} - if exists $self->{'-stacktrace'}; - - my $text = exists $self->{'-text'} ? $self->{'-text'} : "Died"; - - $text .= sprintf(" at %s line %d.\n", $self->file, $self->line) - unless($text =~ /\n$/s); - - $text; -} - -# Allow error propagation, ie -# -# $ber->encode(...) or -# return Error->prior($ber)->associate($ldap); - -sub associate { - my $err = shift; - my $obj = shift; - - return unless ref($obj); - - if($obj->isa('HASH')) { - $obj->{'__Error__'} = $err; - } - elsif($obj->isa('GLOB')) { - ${*$obj}{'__Error__'} = $err; - } - $obj = ref($obj); - $ERROR{ ref($obj) } = $err; - - return; -} - -sub new { - my $self = shift; - my($pkg,$file,$line) = caller($Error::Depth); - - my $err = bless { - '-package' => $pkg, - '-file' => $file, - '-line' => $line, - @_ - }, $self; - - $err->associate($err->{'-object'}) - if(exists $err->{'-object'}); - - # To always create a stacktrace would be very inefficient, so - # we only do it if $Error::Debug is set - - if($Error::Debug) { - require Carp; - local $Carp::CarpLevel = $Error::Depth; - my $text = defined($err->{'-text'}) ? $err->{'-text'} : "Error"; - my $trace = Carp::longmess($text); - # Remove try calls from the trace - $trace =~ s/(\n\s+\S+__ANON__[^\n]+)?\n\s+eval[^\n]+\n\s+Error::subs::try[^\n]+(?=\n)//sog; - $trace =~ s/(\n\s+\S+__ANON__[^\n]+)?\n\s+eval[^\n]+\n\s+Error::subs::run_clauses[^\n]+\n\s+Error::subs::try[^\n]+(?=\n)//sog; - $err->{'-stacktrace'} = $trace - } - - $@ = $LAST = $ERROR{$pkg} = $err; -} - -# Throw an error. this contains some very gory code. - -sub throw { - my $self = shift; - local $Error::Depth = $Error::Depth + 1; - - # if we are not rethrow-ing then create the object to throw - $self = $self->new(@_) unless ref($self); - - die $Error::THROWN = $self; -} - -# syntactic sugar for -# -# die with Error( ... ); - -sub with { - my $self = shift; - local $Error::Depth = $Error::Depth + 1; - - $self->new(@_); -} - -# syntactic sugar for -# -# record Error( ... ) and return; - -sub record { - my $self = shift; - local $Error::Depth = $Error::Depth + 1; - - $self->new(@_); -} - -# catch clause for -# -# try { ... } catch CLASS with { ... } - -sub catch { - my $pkg = shift; - my $code = shift; - my $clauses = shift || {}; - my $catch = $clauses->{'catch'} ||= []; - - unshift @$catch, $pkg, $code; - - $clauses; -} - -# Object query methods - -sub object { - my $self = shift; - exists $self->{'-object'} ? $self->{'-object'} : undef; -} - -sub file { - my $self = shift; - exists $self->{'-file'} ? $self->{'-file'} : undef; -} - -sub line { - my $self = shift; - exists $self->{'-line'} ? $self->{'-line'} : undef; -} - -sub text { - my $self = shift; - exists $self->{'-text'} ? $self->{'-text'} : undef; -} - -# overload methods - -sub stringify { - my $self = shift; - defined $self->{'-text'} ? $self->{'-text'} : "Died"; -} - -sub value { - my $self = shift; - exists $self->{'-value'} ? $self->{'-value'} : undef; -} - -package Error::Simple; - -@Error::Simple::ISA = qw(Error); - -sub new { - my $self = shift; - my $text = "" . shift; - my $value = shift; - my(@args) = (); - - local $Error::Depth = $Error::Depth + 1; - - @args = ( -file => $1, -line => $2) - if($text =~ s/ at (\S+) line (\d+)(\.\n)?$//s); - - push(@args, '-value', 0 + $value) - if defined($value); - - $self->SUPER::new(-text => $text, @args); -} - -sub stringify { - my $self = shift; - my $text = $self->SUPER::stringify; - $text .= sprintf(" at %s line %d.\n", $self->file, $self->line) - unless($text =~ /\n$/s); - $text; -} - -########################################################################## -########################################################################## - -# Inspired by code from Jesse Glick <jglick@sig.bsh.com> and -# Peter Seibel <peter@weblogic.com> - -package Error::subs; - -use Exporter (); -use vars qw(@EXPORT_OK @ISA %EXPORT_TAGS); - -@EXPORT_OK = qw(try with finally except otherwise); -%EXPORT_TAGS = (try => \@EXPORT_OK); - -@ISA = qw(Exporter); - -sub run_clauses ($$$\@) { - my($clauses,$err,$wantarray,$result) = @_; - my $code = undef; - - $err = new Error::Simple($err) unless ref($err); - - CATCH: { - - # catch - my $catch; - if(defined($catch = $clauses->{'catch'})) { - my $i = 0; - - CATCHLOOP: - for( ; $i < @$catch ; $i += 2) { - my $pkg = $catch->[$i]; - unless(defined $pkg) { - #except - splice(@$catch,$i,2,$catch->[$i+1]->()); - $i -= 2; - next CATCHLOOP; - } - elsif($err->isa($pkg)) { - $code = $catch->[$i+1]; - while(1) { - my $more = 0; - local($Error::THROWN); - my $ok = eval { - if($wantarray) { - @{$result} = $code->($err,\$more); - } - elsif(defined($wantarray)) { - @{$result} = (); - $result->[0] = $code->($err,\$more); - } - else { - $code->($err,\$more); - } - 1; - }; - if( $ok ) { - next CATCHLOOP if $more; - undef $err; - } - else { - $err = defined($Error::THROWN) - ? $Error::THROWN : $@; - $err = new Error::Simple($err) - unless ref($err); - } - last CATCH; - }; - } - } - } - - # otherwise - my $owise; - if(defined($owise = $clauses->{'otherwise'})) { - my $code = $clauses->{'otherwise'}; - my $more = 0; - my $ok = eval { - if($wantarray) { - @{$result} = $code->($err,\$more); - } - elsif(defined($wantarray)) { - @{$result} = (); - $result->[0] = $code->($err,\$more); - } - else { - $code->($err,\$more); - } - 1; - }; - if( $ok ) { - undef $err; - } - else { - $err = defined($Error::THROWN) - ? $Error::THROWN : $@; - $err = new Error::Simple($err) - unless ref($err); - } - } - } - $err; -} - -sub try (&;$) { - my $try = shift; - my $clauses = @_ ? shift : {}; - my $ok = 0; - my $err = undef; - my @result = (); - - unshift @Error::STACK, $clauses; - - my $wantarray = wantarray(); - - do { - local $Error::THROWN = undef; - - $ok = eval { - if($wantarray) { - @result = $try->(); - } - elsif(defined $wantarray) { - $result[0] = $try->(); - } - else { - $try->(); - } - 1; - }; - - $err = defined($Error::THROWN) ? $Error::THROWN : $@ - unless $ok; - }; - - shift @Error::STACK; - - $err = run_clauses($clauses,$err,wantarray,@result) - unless($ok); - - $clauses->{'finally'}->() - if(defined($clauses->{'finally'})); - - throw $err if defined($err); - - wantarray ? @result : $result[0]; -} - -# Each clause adds a sub to the list of clauses. The finally clause is -# always the last, and the otherwise clause is always added just before -# the finally clause. -# -# All clauses, except the finally clause, add a sub which takes one argument -# this argument will be the error being thrown. The sub will return a code ref -# if that clause can handle that error, otherwise undef is returned. -# -# The otherwise clause adds a sub which unconditionally returns the users -# code reference, this is why it is forced to be last. -# -# The catch clause is defined in Error.pm, as the syntax causes it to -# be called as a method - -sub with (&;$) { - @_ -} - -sub finally (&) { - my $code = shift; - my $clauses = { 'finally' => $code }; - $clauses; -} - -# The except clause is a block which returns a hashref or a list of -# key-value pairs, where the keys are the classes and the values are subs. - -sub except (&;$) { - my $code = shift; - my $clauses = shift || {}; - my $catch = $clauses->{'catch'} ||= []; - - my $sub = sub { - my $ref; - my(@array) = $code->($_[0]); - if(@array == 1 && ref($array[0])) { - $ref = $array[0]; - $ref = [ %$ref ] - if(UNIVERSAL::isa($ref,'HASH')); - } - else { - $ref = \@array; - } - @$ref - }; - - unshift @{$catch}, undef, $sub; - - $clauses; -} - -sub otherwise (&;$) { - my $code = shift; - my $clauses = shift || {}; - - if(exists $clauses->{'otherwise'}) { - require Carp; - Carp::croak("Multiple otherwise clauses"); - } - - $clauses->{'otherwise'} = $code; - - $clauses; -} - -1; -__END__ - -=head1 NAME - -Error - Error/exception handling in an OO-ish way - -=head1 SYNOPSIS - - use Error qw(:try); - - throw Error::Simple( "A simple error"); - - sub xyz { - ... - record Error::Simple("A simple error") - and return; - } - - unlink($file) or throw Error::Simple("$file: $!",$!); - - try { - do_some_stuff(); - die "error!" if $condition; - throw Error::Simple -text => "Oops!" if $other_condition; - } - catch Error::IO with { - my $E = shift; - print STDERR "File ", $E->{'-file'}, " had a problem\n"; - } - except { - my $E = shift; - my $general_handler=sub {send_message $E->{-description}}; - return { - UserException1 => $general_handler, - UserException2 => $general_handler - }; - } - otherwise { - print STDERR "Well I don't know what to say\n"; - } - finally { - close_the_garage_door_already(); # Should be reliable - }; # Don't forget the trailing ; or you might be surprised - -=head1 DESCRIPTION - -The C<Error> package provides two interfaces. Firstly C<Error> provides -a procedural interface to exception handling. Secondly C<Error> is a -base class for errors/exceptions that can either be thrown, for -subsequent catch, or can simply be recorded. - -Errors in the class C<Error> should not be thrown directly, but the -user should throw errors from a sub-class of C<Error>. - -=head1 PROCEDURAL INTERFACE - -C<Error> exports subroutines to perform exception handling. These will -be exported if the C<:try> tag is used in the C<use> line. - -=over 4 - -=item try BLOCK CLAUSES - -C<try> is the main subroutine called by the user. All other subroutines -exported are clauses to the try subroutine. - -The BLOCK will be evaluated and, if no error is throw, try will return -the result of the block. - -C<CLAUSES> are the subroutines below, which describe what to do in the -event of an error being thrown within BLOCK. - -=item catch CLASS with BLOCK - -This clauses will cause all errors that satisfy C<$err-E<gt>isa(CLASS)> -to be caught and handled by evaluating C<BLOCK>. - -C<BLOCK> will be passed two arguments. The first will be the error -being thrown. The second is a reference to a scalar variable. If this -variable is set by the catch block then, on return from the catch -block, try will continue processing as if the catch block was never -found. - -To propagate the error the catch block may call C<$err-E<gt>throw> - -If the scalar reference by the second argument is not set, and the -error is not thrown. Then the current try block will return with the -result from the catch block. - -=item except BLOCK - -When C<try> is looking for a handler, if an except clause is found -C<BLOCK> is evaluated. The return value from this block should be a -HASHREF or a list of key-value pairs, where the keys are class names -and the values are CODE references for the handler of errors of that -type. - -=item otherwise BLOCK - -Catch any error by executing the code in C<BLOCK> - -When evaluated C<BLOCK> will be passed one argument, which will be the -error being processed. - -Only one otherwise block may be specified per try block - -=item finally BLOCK - -Execute the code in C<BLOCK> either after the code in the try block has -successfully completed, or if the try block throws an error then -C<BLOCK> will be executed after the handler has completed. - -If the handler throws an error then the error will be caught, the -finally block will be executed and the error will be re-thrown. - -Only one finally block may be specified per try block - -=back - -=head1 CLASS INTERFACE - -=head2 CONSTRUCTORS - -The C<Error> object is implemented as a HASH. This HASH is initialized -with the arguments that are passed to it's constructor. The elements -that are used by, or are retrievable by the C<Error> class are listed -below, other classes may add to these. - - -file - -line - -text - -value - -object - -If C<-file> or C<-line> are not specified in the constructor arguments -then these will be initialized with the file name and line number where -the constructor was called from. - -If the error is associated with an object then the object should be -passed as the C<-object> argument. This will allow the C<Error> package -to associate the error with the object. - -The C<Error> package remembers the last error created, and also the -last error associated with a package. This could either be the last -error created by a sub in that package, or the last error which passed -an object blessed into that package as the C<-object> argument. - -=over 4 - -=item throw ( [ ARGS ] ) - -Create a new C<Error> object and throw an error, which will be caught -by a surrounding C<try> block, if there is one. Otherwise it will cause -the program to exit. - -C<throw> may also be called on an existing error to re-throw it. - -=item with ( [ ARGS ] ) - -Create a new C<Error> object and returns it. This is defined for -syntactic sugar, eg - - die with Some::Error ( ... ); - -=item record ( [ ARGS ] ) - -Create a new C<Error> object and returns it. This is defined for -syntactic sugar, eg - - record Some::Error ( ... ) - and return; - -=back - -=head2 STATIC METHODS - -=over 4 - -=item prior ( [ PACKAGE ] ) - -Return the last error created, or the last error associated with -C<PACKAGE> - -=back - -=head2 OBJECT METHODS - -=over 4 - -=item stacktrace - -If the variable C<$Error::Debug> was non-zero when the error was -created, then C<stacktrace> returns a string created by calling -C<Carp::longmess>. If the variable was zero the C<stacktrace> returns -the text of the error appended with the filename and line number of -where the error was created, providing the text does not end with a -newline. - -=item object - -The object this error was associated with - -=item file - -The file where the constructor of this error was called from - -=item line - -The line where the constructor of this error was called from - -=item text - -The text of the error - -=back - -=head2 OVERLOAD METHODS - -=over 4 - -=item stringify - -A method that converts the object into a string. This method may simply -return the same as the C<text> method, or it may append more -information. For example the file name and line number. - -By default this method returns the C<-text> argument that was passed to -the constructor, or the string C<"Died"> if none was given. - -=item value - -A method that will return a value that can be associated with the -error. For example if an error was created due to the failure of a -system call, then this may return the numeric value of C<$!> at the -time. - -By default this method returns the C<-value> argument that was passed -to the constructor. - -=back - -=head1 PRE-DEFINED ERROR CLASSES - -=over 4 - -=item Error::Simple - -This class can be used to hold simple error strings and values. It's -constructor takes two arguments. The first is a text value, the second -is a numeric value. These values are what will be returned by the -overload methods. - -If the text value ends with C<at file line 1> as $@ strings do, then -this infomation will be used to set the C<-file> and C<-line> arguments -of the error object. - -This class is used internally if an eval'd block die's with an error -that is a plain string. - -=back - -=head1 KNOWN BUGS - -None, but that does not mean there are not any. - -=head1 AUTHORS - -Graham Barr <gbarr@pobox.com> - -The code that inspired me to write this was originally written by -Peter Seibel <peter@weblogic.com> and adapted by Jesse Glick -<jglick@sig.bsh.com>. - -=head1 MAINTAINER - -Arun Kumar U <u_arunkumar@yahoo.com> - -=cut diff --git a/scripts/analysis/smtgui/README b/scripts/analysis/smtgui/README deleted file mode 100644 index c86cd9c1a..000000000 --- a/scripts/analysis/smtgui/README +++ /dev/null @@ -1,31 +0,0 @@ -Readme for SMTGUI -Philipp Koehn, Evan Herbst -7 / 31 / 06 ------------------------------------ - -SMTGUI is Philipp's and my code to analyze a decoder's output (the decoder doesn't have to be moses, but most of SMTGUI's features relate to factors, so it probably will be). You can view a list of available corpora by running <newsmtgui.cgi?ACTION=> on any web server. When you're viewing a corpus, click the checkboxes and Compare to see sentences from various sources on one screen. Currently they're in an annoying format; feel free to make the display nicer and more useful. There are per-sentence stats stored in a Corpus object; they just aren't used yet. See compare2() in newsmtgui and Corpus::printSingleSentenceComparison() for a start to better display code. For now it's mostly the view-corpus screen that's useful. - -newsmtgui.cgi is the main program. Corpus.pm is my module; Error.pm is a standard part of Perl but appears to not always be distributed. The accompanying version is Error.pm v1.15. - -The program requires file 'file-factors', which gives the list of factors included in each corpus (see the example file for details). Only corpi included in 'file-factors' are displayed. The file 'file-descriptions' is optional and associates a descriptive string with each included filename. These are used only for display. Again an example is provided. - -For the corpus with name CORPUS, there should be present the files: -- CORPUS.f, the foreign input -- CORPUS.e, the truth (aka reference translation) -- CORPUS.SYSTEM_TRANSLATION for each system to be analyzed -- CORPUS.pt_FACTORNAME for each factor that requires a phrase table (these are currently used only to count unknown source words) - -The .f, .e and system-output files should have the usual pipe-delimited format, one sentence per line. Phrase tables should also have standard three-pipe format. - -A list of standard factor names is available in @Corpus::FACTORNAMES. Feel free to add, but woe betide you if you muck with 'surf', 'pos' and 'lemma'; those are hardcoded all over the place. - -Currently the program assumes you've included factors 'surf', 'pos' and 'lemma', in whatever order; if not you'll want to edit view_corpus() in newsmtgui.cgi to not automatically display all info. To get English POS tags and lemmas from a words-only corpus and put together factors into one file: - -$ $BIN/tag-english < CORPUS.lc > CORPUS.pos-tmp (call Brill) -$ $BIN/morph < CORPUS.pos-tmp > CORPUS.morph -$ $DATA/test/factor-stem.en.perl < CORPUS.morph > CORPUS.lemma -$ cat CORPUS.pos-tmp | perl -n -e 's/_/\|/g; print;' > CORPUS.lc+pos (replace _ with |) -$ $DATA/test/combine-features.perl CORPUS lc+pos lemma > CORPUS.lc+pos+lemma -$ rm CORPUS.pos-tmp (cleanup) - -where $BIN=/export/ws06osmt/bin, $DATA=/export/ws06osmt/data. diff --git a/scripts/analysis/smtgui/file-descriptions b/scripts/analysis/smtgui/file-descriptions deleted file mode 100644 index 97ac6f31a..000000000 --- a/scripts/analysis/smtgui/file-descriptions +++ /dev/null @@ -1 +0,0 @@ -devtest2006.de-en.top100.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run diff --git a/scripts/analysis/smtgui/file-factors b/scripts/analysis/smtgui/file-factors deleted file mode 100644 index a1128eb2a..000000000 --- a/scripts/analysis/smtgui/file-factors +++ /dev/null @@ -1,6 +0,0 @@ -#corpus name, factors given (/\s+/-delimited) -#(the given factors should be present in all target-language files for the given corpus) -devtest2006.de-en surf pos lemma -devtest2006.de-en.top100 surf pos lemma -#pstem: lemmas come from the Porter stemmer (and so are really a mix of stems and lemmas) -pstem_devtest2006.de-en surf pos lemma diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl deleted file mode 100644 index a7e998794..000000000 --- a/scripts/analysis/smtgui/filter-phrase-table.pl +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/perl -w - -#by Philipp Koehn, de-augmented by Evan Herbst -#filter a phrase table for a specific input corpus -#arguments: phrasetable_filename input_filename factor_index (0...) -#outputs to phrasetable_filename.short - -#similar function to filter-model-given-input.pl, but only operates -#on the phrase table and doesn't require that any subdirectories exist - -use strict; - -my $MAX_LENGTH = 10; - -my ($file, $input, $source_factor) = @ARGV; -my $dir = "."; - - # get tables to be filtered (and modify config file) - my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS); - push @TABLE,$file; - - my $new_name = "$file.short";#"$dir/phrase-table.$source_factor"; - push @TABLE_NEW_NAME,$new_name; - - $CONSIDER_FACTORS{$source_factor} = 1; - push @TABLE_FACTORS,$source_factor; - - # get the phrase pairs appearing in the input text - my %PHRASE_USED; - die("could not find input file $input") unless -e $input; - open(INPUT,$input); - while(my $line = <INPUT>) { - chop($line); - my @WORD = split(/ +/,$line); - for(my $i=0;$i<=$#WORD;$i++) { - for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) { - foreach (keys %CONSIDER_FACTORS) { - my @FACTOR = split(/,/); - my $phrase = ""; - for(my $k=$i;$k<=$i+$j;$k++) { - my @WORD_FACTOR = split(/\|/,$WORD[$k]); - for(my $f=0;$f<=$#FACTOR;$f++) { - $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|"; - } - chop($phrase); - $phrase .= " "; - } - chop($phrase); - $PHRASE_USED{$_}{$phrase}++; - } - } - } - } - close(INPUT); - - # filter files - for(my $i=0;$i<=$#TABLE;$i++) { - my ($used,$total) = (0,0); - my $file = $TABLE[$i]; - my $factors = $TABLE_FACTORS[$i]; - my $new_file = $TABLE_NEW_NAME[$i]; - print STDERR "filtering $file -> $new_file...\n"; - - if (-e $file && $file =~ /\.gz$/) { open(FILE,"zcat $file |"); } - elsif (! -e $file && -e "$file.gz") { open(FILE,"zcat $file.gz|"); } - elsif (-e $file) { open(FILE,$file); } - else { die("could not find model file $file"); } - - open(FILE_OUT,">$new_file"); - - while(my $entry = <FILE>) { - my ($foreign,$rest) = split(/ \|\|\| /,$entry,2); - $foreign =~ s/ $//; - if (defined($PHRASE_USED{$factors}{$foreign})) { - print FILE_OUT $entry; - $used++; - } - $total++; - } - close(FILE); - close(FILE_OUT); - printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%'; - } diff --git a/scripts/analysis/smtgui/newsmtgui.cgi b/scripts/analysis/smtgui/newsmtgui.cgi deleted file mode 100755 index 0e969a788..000000000 --- a/scripts/analysis/smtgui/newsmtgui.cgi +++ /dev/null @@ -1,935 +0,0 @@ -#!/usr/bin/perl -w - -use strict; - -use CGI; -use Corpus; #Evan's code -use Error qw(:try); - -#files with extensions other than these are interpreted as system translations; see the file 'file-descriptions', if it exists, for the comments that go with them -my %FILETYPE = ('e' => 'Reference Translation', - 'f' => 'Foreign Original', - 'ref.sgm' => 'Reference Translations', - 'e.sgm' => 'Reference Translations', - 'src.sgm' => 'Foreign Originals', - 'f.sgm' => 'Foreign Originals'); -my %DONTSCORE = ('f' => 1, 'f.sgm' => 1, 'src.sgm' => 1, - 'e' => 1, 'e.sgm' => 1, 'ref.sgm' => 1); -my @SHOW = ('f', 'e', 'comm'); -my %SHOW_COLOR = ('f' => "BLUE", - 'e' => "GREEN"); -my $FOREIGN = 'f'; - -#FILEDESC: textual descriptions associated with specific filenames; to be displayed on the single-corpus view -my %FILEDESC = (); &load_descriptions(); -my %factorIndices = loadFactorIndices('file-factors'); -my %MEMORY; &load_memory(); -my (@mBLEU,@NIST); -@mBLEU=`cat mbleu-memory.dat` if -e "mbleu-memory.dat"; chop(@mBLEU); -@NIST = `cat nist-memory.dat` if -e "nist-memory.dat"; chop(@NIST); -my %in; &ReadParse(); #parse arguments - -if (scalar(@ARGV) > 0 && $ARGV[0] eq 'bleu') { - $in{CORPUS} = $ARGV[1]; - $in{ACTION} = "VIEW_CORPUS"; -} - -my %MULTI_REF; -if ($in{CORPUS} && -e "$in{CORPUS}.ref.sgm") { - my $sysid; - open(REF,"$in{CORPUS}.ref.sgm"); - while(<REF>) { - $sysid = $1 if /<DOC.+sysid=\"([^\"]+)\"/; - if (/<seg[^>]*> *(\S.+\S) *<\/seg>/) { - push @{$MULTI_REF{$sysid}}, $1; - } - } - close(REF); -} - -if ($in{ACTION} eq '') { &show_corpora(); } -elsif ($in{ACTION} eq 'VIEW_CORPUS') { &view_corpus(); } -elsif ($in{ACTION} eq 'SCORE_FILE') { &score_file(); } -elsif ($in{ACTION} eq 'RESCORE_FILE') { &score_file(); } -elsif ($in{ACTION} eq 'COMPARE') { &compare(); } -else { &htmlhead("Unknown Action $in{ACTION}"); } -print "</BODY></HTML>\n"; - -###### SHOW CORPORA IN EVALUATION DIRECTORY - -sub show_corpora { - my %CORPUS = (); - - # find corpora in evaluation directory: see the factor-index file, which was already read in - foreach my $corpusName (keys %factorIndices) - { - $CORPUS{$corpusName} = 1; - } - - # list corpora - &htmlhead("All Corpora"); - print "<UL>\n"; - foreach (sort (keys %CORPUS)) { - print "<LI><A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($_)."\">Corpus $_</A>\n"; - } - print "</UL>\n"; -} - -###### SHOW INFORMATION FOR ONE CORPUS - -sub view_corpus { - my @TABLE; - &htmlhead("View Corpus $in{CORPUS}"); - - # find corpora in evaluation directory - my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => $factorIndices{$in{CORPUS}}); - - my ($sentence_count, $lineInfo); - if(-e "$in{CORPUS}.f") - { - $lineInfo = `wc -l $in{CORPUS}.f`; - $lineInfo =~ /^\s*(\d+)\s+/; - $sentence_count = 0 + $1; - } - else - { - $lineInfo = `wc -l $in{CORPUS}.e`; - $lineInfo =~ /^\s*(\d+)\s+/; - $sentence_count = 0 + $1; - } - - print "Corpus '$in{CORPUS}' consists of $sentence_count sentences\n"; - print "(<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&mBLEU=1>with mBLEU</A>)" if ((!defined($in{mBLEU})) && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f"); - print "<P>\n"; - print "<FORM ACTION=''>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=COMPARE>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - print "<TABLE BORDER=1 CELLSPACING=0><TR> -<TD>Filename (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS}).">sort</A>)</TD> -<TD>Date (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=TIME>sort</A>)</TD>"; - if (-e "$in{CORPUS}.e") { - print "<TD>IBM BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=IBM>sort</A>)</TD>"; - } - if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm") { - print "<TD>NIST (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=NIST>sort</A>)</TD>"; - if (! -e "$in{CORPUS}.e") { - print "<TD>BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=BLEU>sort</A>)</TD>"; - } - } - if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f") { - print "<TD>mBLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=mBLEU>sort</A>)</TD>"; - } - print "<TD>Unknown Words</TD>"; #can't sort on; only applies to the input - print "<TD>Noun & adj WER-PWER</TD>"; #can't sort on; only applies to sysoutputs - print "<TD>Surface vs. lemma PWER</TD>"; #can't sort on; only applies to sysoutputs - print "<TD>Score (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=SCORE>sort</A>)</TD><TD>Actions</TD></TR>"; - - open(DIR,"ls $in{CORPUS}.*|"); - while(<DIR>) { - my $sort = ""; - chop; - my $sgm = 0; - if (/.sgm$/) - { - `grep '<seg' $_ | wc -l` =~ /^\s*(\d+)\s+/; - next unless $1 == $sentence_count; - $sgm = 1; - } - else - { - `wc -l $_` =~ /^\s*(\d+)\s+/; - next unless $1 == $sentence_count; - } - /^$in{CORPUS}.([^\/]+)$/; - my $file = $1; - # checkbox for compare - my $row = "<TR><TD><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>"; - # README - if (-e "$in{CORPUS}.$file.README") { - my $readme = `cat $in{CORPUS}.$file.README`; - $readme =~ s/([\"\'])/\\\"/g; - $readme =~ s/[\n\r]/\\n/g; - $readme =~ s/\t/\\t/g; - $row .= "<A HREF='javascript:FieldInfo(\"$in{CORPUS}.$file\",\"$readme\")'>"; - } - # filename - $row .= "$in{CORPUS}.$file</A>"; - # description (hard-coded) - my @TRANSLATION_SENTENCE = `cat $in{CORPUS}.$file`; - chop(@TRANSLATION_SENTENCE); - - #count sentences that contain null words - my $null_count = 0; - foreach (@TRANSLATION_SENTENCE) - { - $null_count++ if /^NULL$/ || /^NONE$/; - } - if ($null_count > 0) { - $row .= "$null_count NULL "; - } - - $row .= " (".$FILETYPE{$file}.")" if defined($FILETYPE{$file}); - $row .= " (".$FILEDESC{$in{CORPUS}.".".$file}.")" if defined($FILEDESC{$in{CORPUS}.".".$file}); - $row .= " (".$FILEDESC{$file}.")" if defined($FILEDESC{$file}); - # filedate - my @STAT = stat("$in{CORPUS}.$file"); - my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($STAT[8]); #STAT[8] should be last modify time - my $time = sprintf("%04d-%02d-%02d %02d:%02d:%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec); - $row .= "</TD>\n<TD>".$time."</TD>\n"; - if (defined($in{SORT}) && $in{SORT} eq 'TIME') { $sort = $time; } - # IBM BLEU score - my $no_bleu =0; - if (!$sgm && -e "$in{CORPUS}.e") { - $row .= "<TD>"; - if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e" && $file !~ /^pt/) { - my ($score,$p1,$p2,$p3,$p4,$bp) = $corpus->calcBLEU($file, 'surf'); - $row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f", $score, $p1, $p2, $p3, $p4, $bp); - if (defined($in{SORT}) && $in{SORT} eq 'IBM') { $sort = $score; } - } - $row .= "</TD>\n"; - } - else { - $no_bleu=1; - } - # NIST score - if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm" - && !$DONTSCORE{$file}) { - $row .= "<TD>"; - print "$DONTSCORE{$file}+"; - my ($nist,$nist_bleu); - if ($file =~ /sgm$/) { - ($nist,$nist_bleu) = &get_nist_score("$in{CORPUS}.ref.sgm","$in{CORPUS}.src.sgm","$in{CORPUS}.$file"); - $row .= sprintf("<B>%.04f</B>",$nist); - if ($in{SORT} eq 'NIST') { $sort = $nist; } - } - $row .= "</TD>\n"; - if ($no_bleu) { - $row .= "<TD>"; - if ($file =~ /sgm$/) { - $row .= sprintf("<B>%.04f</B>",$nist_bleu); - if ($in{SORT} eq 'BLEU') { $sort = $nist_bleu; } - } - $row .= "</TD>\n"; - } - } - # multi-bleu - if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e") { - $row .= "<TD>"; - if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e") { - my ($score,$p1,$p2,$p3,$p4,$bp) = &get_multi_bleu_score("$in{CORPUS}.f","$in{CORPUS}.e","$in{CORPUS}.$file"); - $row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f",$score,$p1,$p2,$p3,$p4,$bp); - if ($in{SORT} eq 'mBLEU') { $sort = $score; } - } - $row .= "</TD>\n"; - } - - # misc stats - $row .= "<TD align=\"center\">"; - if($file eq 'f') #input - { - try - { - my ($unknownCount, $totalCount) = calc_unknown_words($corpus, 'surf'); - $row .= sprintf("%.4lf (%d / %d)", $unknownCount / $totalCount, $unknownCount, $totalCount); - } - catch Error::Simple with {$row .= "[system error]";}; - } - $row .= "</TD>\n<TD align=\"center\">"; - my ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER); - if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output - { - try - { - ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER) = calc_misc_stats($corpus, $file); - $row .= "WER = $nnAdjWER<br>PWER = $nnAdjPWER<br><b>diff = " . ($nnAdjWER - $nnAdjPWER) . "</b>"; - } - catch Error::Simple with {$row .= "[system error]";}; - } - $row .= "</TD>\n<TD align=\"center\">"; - if($file ne 'e' && $file ne 'f' && $file !~ /^pt/) #system output - { - if($surfPWER == -1) - { - $row .= "[system error]"; - } - else - { - my ($lemmaBLEU, $p1, $p2, $p3, $p4, $brevity) = $corpus->calcBLEU($file, 'lemma'); - $row .= sprintf("surface = %d<br>lemma = %d<br><b>lemma BLEU = %.04f</b> %.01f/%.01f/%.01f/%.01f *%.03f", - $surfPWER, $lemmaPWER, $lemmaBLEU, $p1, $p2, $p3, $p4, $brevity); - } - } - $row .= "</TD>\n"; - - # correct sentence score - my($correct,$wrong,$unknown); - $row .= "<TD>"; - if (!defined($DONTSCORE{$file}) && (scalar keys %MEMORY)) { - my ($correct,$just_syn,$just_sem,$wrong,$unknown) = - &get_score_from_memory("$in{CORPUS}.$FOREIGN", - "$in{CORPUS}.$file"); - $row .= "<B><FONT COLOR=GREEN>$correct</FONT></B>"; - $row .= "/<FONT COLOR=ORANGE>$just_syn</FONT>"; - $row .= "/<FONT COLOR=ORANGE>$just_sem</FONT>"; - $row .= "/<FONT COLOR=RED>$wrong</FONT> ($unknown)</TD>\n"; - if ($in{SORT} eq 'SCORE') { - $sort = sprintf("%03d %04d",$correct,$just_syn+$just_sem); - } - } - else - { - $row .= "</TD>\n"; - } - - # score / review links - $row .= "<TD>"; - $row .= "<A HREF=\"?ACTION=SCORE_FILE&VIEW=1&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">view</A>" if (!defined($DONTSCORE{$file})); - $row .= " <A HREF=\"?ACTION=SCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">score</A>" if (!defined($DONTSCORE{$file}) && ($unknown || scalar keys %MEMORY == 0)); - $row .= " <A HREF=\"?ACTION=RESCORE_FILE&CORPUS=".CGI::escape($in{CORPUS})."&FILE=".CGI::escape($file)."\">review</A>" if (!defined($DONTSCORE{$file}) && scalar keys %MEMORY); - $row .= "</TD></TR>\n"; - push @TABLE, "<!-- $sort -->\n$row"; - } - close(DIR); - foreach (reverse sort @TABLE) { print $_; } - print "</TABLE>\n"; - print "<INPUT TYPE=SUBMIT VALUE=\"Compare\">\n"; - print "<INPUT TYPE=CHECKBOX NAME=SURFACE VALUE=1 CHECKED> Compare all different sentences (instead of just differently <I>evaluated</I> sentences) <INPUT TYPE=CHECKBOX NAME=WITH_EVAL VALUE=1 CHECKED> with evaluation</FORM><P>\n"; - print "<P>The score is to be read as: <FONT COLOR=GREEN>correct</FONT>/<FONT COLOR=ORANGE>just-syn-correct</FONT>/<FONT COLOR=ORANGE>just-sem-correct</FONT>/<FONT COLOR=RED>wrong</FONT> (unscored)\n"; - print "<BR>IBM BLEU is to be read as: <B>metric</B> unigram/bigram/trigram/quadgram *brevity-penalty\n"; - print "<P><A HREF=\"newsmtgui.cgi?action=\">All corpora</A>\n"; -} - -###### SCORE TRANSLATIONS - -sub score_file { - if ($in{VIEW}) { - &htmlhead("View Translations"); - } - else { - &htmlhead("Score Translations"); - } - print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n"; - print "<FORM ACTION=\"\" METHOD=POST>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - print "<INPUT TYPE=HIDDEN NAME=FILE VALUE=\"$in{FILE}\">\n"; - - # get sentences - my @SENTENCES; - if ($in{FILE} =~ /.sgm$/) { - @SENTENCES = `grep '<seg' $in{CORPUS}.$in{FILE}`; - for(my $i=0;$i<$#SENTENCES;$i++) { - $SENTENCES[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/; - } - } - else { - @SENTENCES = `cat $in{CORPUS}.$in{FILE}`; chop(@SENTENCES); - } - - my %REFERENCE; - foreach (@SHOW) { - if (-e "$in{CORPUS}.$_") { - @{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}}); - } - } - - # update memory - foreach (keys %in) { - next unless /^SYN_SCORE_(\d+)$/; - next unless $in{"SEM_SCORE_$1"}; - &store_in_memory($REFERENCE{$FOREIGN}[$1], - $SENTENCES[$1], - "syn_".$in{"SYN_SCORE_$1"}." sem_".$in{"SEM_SCORE_$1"}); - } - - # display sentences - for(my $i=0;$i<=$#SENTENCES;$i++) { - my $evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES[$i]); - next if ($in{ACTION} eq 'SCORE_FILE' && - ! $in{VIEW} && - $evaluation ne '' && $evaluation ne 'wrong'); - print "<P>Sentence ".($i+1).":<BR>\n"; - # color coding - &color_highlight_ngrams($i,&nist_normalize_text($SENTENCES[$i]),$REFERENCE{"e"}[$i]); - if (%MULTI_REF) { - foreach my $sysid (keys %MULTI_REF) { - print "<FONT COLOR=GREEN>".$MULTI_REF{$sysid}[$i]."</FONT> (Reference $sysid)<BR>\n"; - } - } - - # all sentences - print "$SENTENCES[$i] (System output)<BR>\n"; - foreach my $ref (@SHOW) { - if (-e "$in{CORPUS}.$ref") { - print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n" if $REFERENCE{$ref}[$i]; - } - } - if (! $in{VIEW}) { - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /syn_correct/); - print "> perfect English\n"; - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /syn_wrong/); - print "> imperfect English<BR>\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /sem_correct/); - print "> correct meaning\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /sem_wrong/); - print "> incorrect meaning\n"; - } - } - if (! $in{VIEW}) { - print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n"; - print "</FORM>\n"; - } -} - -sub color_highlight_ngrams { - my($i,$sentence,$single_reference) = @_; - my @REF = (); - my %NGRAM = (); - if (%MULTI_REF) { - foreach my $sysid (keys %MULTI_REF) { - push @REF,&nist_normalize_text($MULTI_REF{$sysid}[$i]); - } - } - elsif ($single_reference) { - @REF = ($single_reference); - } - if (@REF) { - foreach my $ref (@REF) { - my @WORD = split(/\s+/,$ref); - for(my $n=1;$n<=4;$n++) { - for(my $w=0;$w<=$#WORD-($n-1);$w++) { - my $ngram = "$n: "; - for(my $j=0;$j<$n;$j++) { - $ngram .= $WORD[$w+$j]." "; - } - $NGRAM{$ngram}++; - } - } - } - $sentence =~ s/^\s+//; - $sentence =~ s/\s+/ /; - $sentence =~ s/\s+$//; - my @WORD = split(/\s+/,$sentence); - my @CORRECT; - for(my $w=0;$w<=$#WORD;$w++) { - $CORRECT[$w] = 0; - } - for(my $n=1;$n<=4;$n++) { - for(my $w=0;$w<=$#WORD-($n-1);$w++) { - my $ngram = "$n: "; - for(my $j=0;$j<$n;$j++) { - $ngram .= $WORD[$w+$j]." "; - } - next unless defined($NGRAM{$ngram}) && $NGRAM{$ngram}>0; - $NGRAM{$ngram}--; - for(my $j=0;$j<$n;$j++) { - $CORRECT[$w+$j] = $n; - } - } - } - my @COLOR; - $COLOR[0] = "#FF0000"; - $COLOR[1] = "#C000C0"; - $COLOR[2] = "#0000FF"; - $COLOR[3] = "#00C0C0"; - $COLOR[4] = "#00C000"; - for(my $w=0;$w<=$#WORD;$w++) { - print "<B><FONT COLOR=".$COLOR[$CORRECT[$w]].">$WORD[$w]<SUB>".$CORRECT[$w]."</SUB></FONT></B> "; - } - print "\n<BR>"; - } -} - -###### OTHER STATS - -#print (in some unspecified way) the offending exception of type Error::Simple -#arguments: the error object, a context string -#return: none -sub printError -{ - my ($err, $context) = @_; - warn "$context: " . $err->{'-text'} . " @ " . $err->{'-file'} . " (" .$err->{'-line'} . ")\n"; -} - -#compute number and percentage of unknown tokens for a given factor in foreign corpus -#arguments: corpus object ref, factor name -#return (unkwordCount, totalWordCount), or (-1, -1) if an error occurs -sub calc_unknown_words -{ - my ($corpus, $factorName) = @_; - try - { - my ($unknownCount, $totalCount) = $corpus->calcUnknownTokens($factorName); - return ($unknownCount, $totalCount); - } - catch Error::Simple with - { - my $err = shift; - printError($err, 'calc_unknown_words()'); - return (-1, -1); - }; -} - -#compute (if we have the necessary factors) info for: -#- diff btwn wer and pwer for NNs & ADJs -- if large, many reordering errors -#- diff btwn pwer for surface forms and pwer for lemmas -- if large, morphology errors -#arguments: corpus object, system name -#return (NN/ADJ (wer, pwer), surf pwer, lemma pwer), or (-1, -1, -1, -1) if an error occurs -sub calc_misc_stats -{ - my ($corpus, $sysname) = @_; - try - { - my ($nnAdjWER, $nnAdjPWER) = $corpus->calcNounAdjWER_PWERDiff($sysname); - my ($surfPWER, $lemmaPWER) = ($corpus->calcOverallPWER($sysname, 'surf'), $corpus->calcOverallPWER($sysname, 'lemma')); - return ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER); - } - catch Error::Simple with - { - my $err = shift; - printError($err, 'calc_misc_stats()'); - return (-1, -1, -1, -1); - }; -} - -###### NIST SCORE - -sub get_nist_score { - my($reference_file,$source_file,$translation_file) = @_; - my @STAT = stat($translation_file); - my $current_timestamp = $STAT[9]; - foreach (@NIST) { - my ($file,$time,$nist,$bleu) = split; - return ($nist,$bleu) - if ($file eq $translation_file && $current_timestamp == $time); - } - - my $nist_eval = `/home/pkoehn/statmt/bin/mteval-v10.pl -c -r $reference_file -s $source_file -t $translation_file`; - return (0,0) unless ($nist_eval =~ /NIST score = (\d+\.\d+) BLEU score = (\d+\.\d+)/i); - - open(NIST,">>nist-memory.dat"); - printf NIST "$translation_file $current_timestamp %f %f\n",$1,$2; - close(NIST); - return ($1,$2); -} - -sub nist_normalize_text { - my ($norm_text) = @_; - -# language-independent part: - $norm_text =~ s/<skipped>//g; # strip "skipped" tags - $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines - $norm_text =~ s/\n/ /g; # join lines - $norm_text =~ s/(\d)\s+(\d)/$1$2/g; #join digits - $norm_text =~ s/"/"/g; # convert SGML tag for quote to " - $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & - $norm_text =~ s/</</g; # convert SGML tag for less-than to > - $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to < - -# language-dependent part (assuming Western languages): - $norm_text = " $norm_text "; -# $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case; - $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation - $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit - $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit - $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit - $norm_text =~ s/\s+/ /g; # one space only between words - $norm_text =~ s/^\s+//; # no leading space - $norm_text =~ s/\s+$//; # no trailing space - - return $norm_text; -} - -###### BLEU SCORE - -sub get_multi_bleu_score { - my($foreign_file,$reference_file,$translation_file) = @_; - my @STAT = stat($translation_file); - my $current_timestamp = $STAT[9]; - foreach (@mBLEU) { - my ($file,$time,$score,$g1,$g2,$g3,$g4,$bp) = split; - if ($file eq $translation_file && $current_timestamp == $time) { - return ($score,$g1*100,$g2*100,$g3*100,$g4*100,$bp); - } - } - - # load reference translation from reference file - my @REFERENCE_SENTENCE = `cat $reference_file`; chop(@REFERENCE_SENTENCE); - my @TRANSLATION_SENTENCE = `cat $translation_file`; chop(@TRANSLATION_SENTENCE); - my %REF; - my @FOREIGN_SENTENCE = `cat $foreign_file`; chop(@FOREIGN_SENTENCE); - for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) { - push @{$REF{$FOREIGN_SENTENCE[$i]}},$REFERENCE_SENTENCE[$i]; - } - # load reference translation from translation memory - foreach my $memory (keys %MEMORY) { - next if $MEMORY{$memory} ne 'syn_correct sem_correct'; - my ($foreign,$english) = split(/ .o0O0o. /,$memory); - next unless defined($REF{$foreign}); - push @{$REF{$foreign}},$english; - } - my(@CORRECT,@TOTAL,$length_translation,$length_reference); - # compute bleu - for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) { - my %REF_NGRAM = (); - my @WORD = split(/ /,$TRANSLATION_SENTENCE[$i]); - my $length_translation_this_sentence = scalar(@WORD); - my ($closest_diff,$closest_length) = (9999,9999); - foreach my $reference (@{$REF{$FOREIGN_SENTENCE[$i]}}) { - my @WORD = split(/ /,$reference); - my $length = scalar(@WORD); - if (abs($length_translation_this_sentence-$length) < $closest_diff) { - $closest_diff = abs($length_translation_this_sentence-$length); - $closest_length = $length; - } - for(my $n=1;$n<=4;$n++) { - my %REF_NGRAM_N = (); - for(my $start=0;$start<=$#WORD-($n-1);$start++) { - my $ngram = "$n"; - for(my $w=0;$w<$n;$w++) { - $ngram .= " ".$WORD[$start+$w]; - } - $REF_NGRAM_N{$ngram}++; - } - foreach my $ngram (keys %REF_NGRAM_N) { - if (!defined($REF_NGRAM{$ngram}) || - $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { - $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; - } - } - } - } - $length_translation += $length_translation_this_sentence; - $length_reference += $closest_length; - for(my $n=1;$n<=4;$n++) { - my %T_NGRAM = (); - for(my $start=0;$start<=$#WORD-($n-1);$start++) { - my $ngram = "$n"; - for(my $w=0;$w<$n;$w++) { - $ngram .= " ".$WORD[$start+$w]; - } - $T_NGRAM{$ngram}++; - } - foreach my $ngram (keys %T_NGRAM) { - my $n = 0+$ngram; -# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n"; - $TOTAL[$n] += $T_NGRAM{$ngram}; - if (defined($REF_NGRAM{$ngram})) { - if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { - $CORRECT[$n] += $T_NGRAM{$ngram}; -# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n"; - } - else { - $CORRECT[$n] += $REF_NGRAM{$ngram}; -# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n"; - } - } - } - } - } - my $brevity_penalty = 1; - if ($length_translation<$length_reference) { - $brevity_penalty = exp(1-$length_reference/$length_translation); - } - my $bleu = $brevity_penalty * exp((my_log( $CORRECT[1]/$TOTAL[1] ) + - my_log( $CORRECT[2]/$TOTAL[2] ) + - my_log( $CORRECT[3]/$TOTAL[3] ) + - my_log( $CORRECT[4]/$TOTAL[4] ) ) / 4); - - open(BLEU,">>mbleu-memory.dat"); - @STAT = stat($translation_file); - printf BLEU "$translation_file $STAT[9] %f %f %f %f %f %f\n",$bleu,$CORRECT[1]/$TOTAL[1],$CORRECT[2]/$TOTAL[2],$CORRECT[3]/$TOTAL[3],$CORRECT[4]/$TOTAL[4],$brevity_penalty; - close(BLEU); - - return ($bleu, - 100*$CORRECT[1]/$TOTAL[1], - 100*$CORRECT[2]/$TOTAL[2], - 100*$CORRECT[3]/$TOTAL[3], - 100*$CORRECT[4]/$TOTAL[4], - $brevity_penalty); -} - -sub my_log { - return -9999999999 unless $_[0]; - return log($_[0]); -} - - -###### SCORE TRANSLATIONS - -################################ IN PROGRESS ############################### -sub compare2 -{ - &htmlhead("Compare Translations"); - print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n"; - print "<FORM ACTION=\"\" METHOD=POST>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-indices' => \%factorIndices); - $corpus->writeComparisonPage(\*STDOUT, /^.*$/); - print "</FORM>\n"; -} - -sub compare { - &htmlhead("Compare Translations"); - print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n"; - print "<FORM ACTION=\"\" METHOD=POST>\n"; - print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n"; - print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; - - # get sentences - my %SENTENCES; - my $sentence_count; - foreach (keys %in) { - if (/^FILE_(.+)$/) { - my $file = $1; - print "<INPUT TYPE=HIDDEN NAME=\"$file\" VALUE=1>\n"; - my @SENTENCES; - if ($file =~ /.sgm$/) { - @{$SENTENCES{$file}} = `grep '<seg' $in{CORPUS}.$file`; - for(my $i=0;$i<$#{$SENTENCES{$file}};$i++) { - $SENTENCES{$file}[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/; - } - } - else { - @{$SENTENCES{$file}} = `cat $in{CORPUS}.$1`; - chop(@{$SENTENCES{$file}}); - } - - $sentence_count = scalar @{$SENTENCES{$file}}; - } - } - my %REFERENCE; - foreach (@SHOW) { - if (-e "$in{CORPUS}.$_") { - @{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}}); - } - } - - # update memory - foreach (keys %in) { - next unless /^SYN_SCORE_(.+)_(\d+)$/; - next unless $in{"SEM_SCORE_$1_$2"}; - &store_in_memory($REFERENCE{$FOREIGN}[$2], - $SENTENCES{$1}[$2], - "syn_".$in{"SYN_SCORE_$1_$2"}." sem_".$in{"SEM_SCORE_$1_$2"}); - } - - # display sentences - for(my $i=0;$i<$sentence_count;$i++) - { - my $evaluation = ""; - my $show = 0; - my $surface = ""; - foreach my $file (keys %SENTENCES) - { - if ($in{SURFACE}) { - $SENTENCES{$file}[$i] =~ s/ *$//; - $surface = $SENTENCES{$file}[$i] if ($surface eq ''); - $show = 1 if ($SENTENCES{$file}[$i] ne $surface); - } - else { - my $this_ev = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES{$file}[$i]); - $this_ev = "syn_wrong sem_wrong" unless $this_ev; - $evaluation = $this_ev if ($evaluation eq ''); - $show = 1 if ($evaluation ne $this_ev); - } - } - next unless $show; - print "<HR>Sentence ".($i+1).":<BR>\n"; - foreach my $ref (@SHOW) { - if (-e "$in{CORPUS}.$ref") { - print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n"; - } - } - foreach my $file (keys %SENTENCES) { - print "<B>$SENTENCES{$file}[$i]</B> ($file)<BR>\n"; - &color_highlight_ngrams($i,&nist_normalize_text($SENTENCES{$file}[$i]),$REFERENCE{"e"}[$i]); - if (0 && $in{WITH_EVAL}) { - $evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES{$file}[$i]); - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$file"."_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /syn_correct/); - print "> perfect English\n"; - print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$file"."_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /syn_wrong/); - print "> imperfect English<BR>\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$file"."_$i VALUE=correct"; - print " CHECKED" if ($evaluation =~ /sem_correct/); - print "> correct meaning\n"; - print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$file"."_$i VALUE=wrong"; - print " CHECKED" if ($evaluation =~ /sem_wrong/); - print "> incorrect meaning<BR>\n"; - } - } - } - print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n"; - print "</FORM>\n"; -} - -###### MEMORY SUBS - -sub load_memory { - open(MEMORY,"evaluation-memory.dat") or return; - while(<MEMORY>) { - chop; - my($foreign,$translation,$evaluation) = split(/ \.o0O0o\. /); - $evaluation = 'syn_correct sem_correct' if ($evaluation eq 'correct'); - $MEMORY{"$foreign .o0O0o. $translation"} = $evaluation; - } - close(MEMORY); -} - -sub get_score_from_memory { - my($foreign_file,$translation_file) = @_; - my $unknown=0; - my $correct=0; - my $just_syn=0; - my $just_sem=0; - my $wrong=0; - my @FOREIGN = `cat $foreign_file`; chop(@FOREIGN); - my @TRANSLATION = `cat $translation_file`; chop(@TRANSLATION); - for(my $i=0;$i<=$#FOREIGN;$i++) { - if (my $evaluation = &get_from_memory($FOREIGN[$i],$TRANSLATION[$i])) { - if ($evaluation eq 'syn_correct sem_correct') { $correct++ } - elsif ($evaluation eq 'syn_correct sem_wrong') { $just_syn++ } - elsif ($evaluation eq 'syn_wrong sem_correct') { $just_sem++ } - elsif ($evaluation eq 'syn_wrong sem_wrong') { $wrong++ } - else { $unknown++; } - } - else { $unknown++; } - } - return($correct,$just_syn,$just_sem,$wrong,$unknown); -} - -sub store_in_memory { - my($foreign,$translation,$evaluation) = @_; - &trim(\$translation); - return if $MEMORY{"$foreign .o0O0o. $translation"} eq $evaluation; - $MEMORY{"$foreign .o0O0o. $translation"} = $evaluation; - open(MEMORY,">>evaluation-memory.dat") or die "store_in_memory(): couldn't open 'evaluation-memory.dat' for append\n"; - print MEMORY "$foreign .o0O0o. $translation .o0O0o. $evaluation\n"; - close(MEMORY); -} - -sub get_from_memory { - my($foreign,$translation) = @_; - &trim(\$translation); - return $MEMORY{"$foreign .o0O0o. $translation"}; -} - -sub trim { - my($translation) = @_; - $$translation =~ s/ +/ /g; - $$translation =~ s/^ +//; - $$translation =~ s/ +$//; -# $$translation =~ s/ +[\.]$//; -} - -sub load_descriptions { - open(FD,"file-descriptions") or die "load_descriptions(): couldn't open 'file-descriptions' for read\n"; - while(<FD>) { - my($file,$description) = split(/\s+/,$_,2); - $FILEDESC{$file} = $description; - } - close(FD); -} - -#read config file giving names of corpi and fill factor-index map -#arguments: filename to read -#return: hash of corpus names to hashrefs of factor names to indices -sub loadFactorIndices -{ - my $filename = shift; - my %data = (); - open(INFILE, "<$filename") or die "loadFactorIndices(): couldn't open '$filename' for read\n"; - while(my $line = <INFILE>) - { - if($line =~ /^\#/) {next;} #skip comment lines - my @tokens = split(/\s+/, $line); - my $corpusName = shift @tokens; - $data{$corpusName} = {}; - for(my $i = 0; $i < scalar(@tokens); $i++) - { - $data{$corpusName}->{$tokens[$i]} = $i; - } - } - close(INFILE); - return %data; -} - -###### SUBS - -sub htmlhead { - print <<"___ENDHTML"; -Content-type: text/html - -<HTML><HEAD> -<TITLE>MTEval: $_[0]</TITLE> -<SCRIPT LANGUAGE="JavaScript"> - -<!-- hide from old browsers - -function FieldInfo(field,description) { - popup = window.open("","popDialog","height=500,width=600,scrollbars=yes,resizable=yes"); - popup.document.write("<HTML><HEAD><TITLE>"+field+"</TITLE></HEAD><BODY BGCOLOR=#FFFFCC><CENTER><B>"+field+"</B><HR SIZE=2 NOSHADE></CENTER><PRE>"+description+"</PRE><CENTER><FORM><INPUT TYPE='BUTTON' VALUE='Okay' onClick='self.close()'></FORM><CENTER></BODY></HTML>"); - popup.focus(); - popup.document.close(); -} - -<!-- done hiding --> - -</SCRIPT> -</HEAD> -<BODY BGCOLOR=white> -<H2>Evaluation Tool for Machine Translation<BR>$_[0]</H2> -___ENDHTML -} - - -############################# parts of cgi-lib.pl - - -sub ReadParse { - my ($i, $key, $val); - - # Read in text - my $in; - if (&MethGet) { - $in = $ENV{'QUERY_STRING'}; - } elsif (&MethPost) { - read(STDIN,$in,$ENV{'CONTENT_LENGTH'}); - } - - my @in = split(/[&;]/,$in); - - foreach $i (0 .. $#in) { - # Convert plus's to spaces - $in[$i] =~ s/\+/ /g; - - # Split into key and value. - ($key, $val) = split(/=/,$in[$i],2); # splits on the first =. - - # Convert %XX from hex numbers to alphanumeric - $key =~ s/%(..)/pack("c",hex($1))/ge; - $val =~ s/%(..)/pack("c",hex($1))/ge; - - # Associate key and value - $in{$key} .= "\0" if (defined($in{$key})); # \0 is the multiple separator - $in{$key} .= $val; - - } - - return scalar(@in); -} - -sub MethGet { - return ($ENV{'REQUEST_METHOD'} eq "GET"); -} - -sub MethPost { - return ($ENV{'REQUEST_METHOD'} eq "POST"); -} |