diff options
Diffstat (limited to 'scripts/analysis/smtgui/Corpus.pm')
-rw-r--r-- | scripts/analysis/smtgui/Corpus.pm | 867 |
1 files changed, 0 insertions, 867 deletions
diff --git a/scripts/analysis/smtgui/Corpus.pm b/scripts/analysis/smtgui/Corpus.pm deleted file mode 100644 index f191ce273..000000000 --- a/scripts/analysis/smtgui/Corpus.pm +++ /dev/null @@ -1,867 +0,0 @@ -#package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole -#Evan Herbst, 7 / 25 / 06 - -package Corpus; -BEGIN -{ - push @INC, "."; -} -use Error; - -return 1; - -########################################################################################################################### - -#'our' variables are available outside the package -our @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph'); - -#constructor -#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), hashref of factor names to indices in this corpus (-indices) -sub new -{ - my $class = shift; - my %args = @_; #turn the remainder of @_ into a hash - my ($corpusName, $refFileDescs, $factorIndices) = ($args{'-name'}, $args{'-descriptions'}, $args{'-indices'}); - my $self = {}; - $self->{'corpusName'} = $corpusName; - $self->{'truth'} = []; #arrayref of arrayrefs of factors - $self->{'input'} = []; #same; also same for any system outputs that get loaded - $self->{'truthFilename'} = ""; - $self->{'inputFilename'} = ""; - $self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename) - $self->{'phraseTableFilenames'} = {}; #factor name (from @FACTORNAMES) => filename - $self->{'factorIndices'} = {}; #factor name => index; names ought to be strings from the standard list in @FACTORNAMES - %{$self->{'factorIndices'}} = %$factorIndices; - $self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting - $self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input - $self->{'totalTokens'} = 0; #useful with counts of unknown tokens - $self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth) - $self->{'sysoutPWER'} = {}; #similarly - $self->{'fileDescriptions'} = {}; #filename associated with us => string description of file - $self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) ) - $self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here - bless $self, $class; - $self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions - $self->loadCacheFile(); - return $self; -} - -#arguments: filename -#return: description string -#throw if filename doesn't belong to this corpus -sub getFileDescription -{ - my ($self, $filename) = @_; - if(!defined($self->{'fileDescriptions'}->{$filename})) - { - throw Error::Simple(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n"); - } - return $self->{'fileDescriptions'}->{$filename}; -} - -#arguments: hashref of factor names to indices (not all factors in @FACTORNAMES must be included, but no strings not from the list should be) -#return: none -sub setFactorIndices -{ - my ($self, $refIndices) = @_; - %{$self->{'factorIndices'}} = %{$refIndices}; -} - -#calculate the number of unknown factor values for the given factor in the input file -#arguments: factor name -#return: unknown factor count, total factor count (note the total doesn't depend on the factor) -#throw if we don't have an input file or a phrase table for the given factor defined or if there's no index known for the given factor -sub calcUnknownTokens -{ - my ($self, $factorName) = @_; - #check in-memory cache first - if(defined($self->{'unknownCount'}->{$factorName})) - { - return ($self->{'unknownCount'}->{$factorName}, $self->{'totalTokens'}); - } - - $self->ensureFilenameDefined('input'); - $self->ensurePhraseTableDefined($factorName); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('input', $self->{'inputFilename'}); - $self->loadPhraseTable($factorName); - - #count unknown and total words - my ($unknownTokens, $totalTokens) = (0, 0); - my $factorIndex = $self->{'factorIndices'}->{$factorName}; - foreach my $sentence (@{$self->{'input'}}) - { - $totalTokens += scalar(@$sentence); - foreach my $word (@$sentence) - { - if(!defined($self->{'phraseTables'}->{$factorName}->{$word->[$factorIndex]})) - { - $unknownTokens++; - } - } - } - $self->{'unknownCount'}->{$factorName} = $unknownTokens; - $self->{'totalTokens'} = $totalTokens; - - return ($unknownTokens, $totalTokens); -} - -#arguments: system name -#return: (WER, PWER) for nouns and adjectives in given system wrt truth -#throw if given system or truth is not set or if index of 'surf' or 'pos' hasn't been specified -sub calcNounAdjWER_PWERDiff -{ - my ($self, $sysname) = @_; - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined('surf'); - $self->ensureFactorPosDefined('pos'); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - #find nouns and adjectives and score them - my ($werScore, $pwerScore) = (0, 0); - my $nnNadjTags = $self->getPOSTagList('nounAndAdj'); - for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++) - { -# warn "truth " . join(', ', map {join('|', @$_)} @{$self->{'truth'}->[$i]}) . "\n"; -# warn "sysout " . join(', ', map {join('|', @$_)} @{$self->{$sysname}->[$i]}) . "\n"; - my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags); - my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags); -# warn "filtered truth: " . join(' ', map {join('|', @$_)} @nnAdjEWords) . "\n"; -# warn "filtered sysout: " . join(' ', map {join('|', @$_)} @nnAdjSWords) . "\n\n\n"; - my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'}); - $werScore += $sentWer; - ($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'}); - $pwerScore += $sentWer; - } - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return ($werScore, $pwerScore); -} - -#calculate detailed WER statistics and put them into $self -#arguments: system name, factor name to consider (default 'surf', surface form) -#return: overall surface WER for given system (w/o filtering) -#throw if given system or truth is not set or if index of factor name hasn't been specified -sub calcOverallWER -{ - my ($self, $sysname, $factorName) = (shift, shift, 'surf'); - if(scalar(@_) > 0) {$factorName = shift;} - #check in-memory cache first - if(defined($self->{'sysoutWER'}->{$sysname}->{$factorName})) - { - return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0]; - } - - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - - my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName}); - $self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0]; -} - -#calculate detailed PWER statistics and put them into $self -#arguments: system name, factor name to consider (default 'surf') -#return: overall surface PWER for given system (w/o filtering) -#throw if given system or truth is not set or if index of factor name hasn't been specified -sub calcOverallPWER -{ - my ($self, $sysname, $factorName) = (shift, shift, 'surf'); - if(scalar(@_) > 0) {$factorName = shift;} - #check in-memory cache first - if(defined($self->{'sysoutPWER'}->{$sysname}->{$factorName})) - { - return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0]; - } - - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - - my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName}); - $self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0]; -} - -#arguments: system name, factor name to consider (default 'surf') -#return: BLEU score, n-gram precisions, brevity penalty -sub calcBLEU -{ - my ($self, $sysname, $factorName) = (shift, shift, 'surf'); - if(scalar(@_) > 0) {$factorName = shift;} - #check in-memory cache first - if(defined($self->{'bleuScores'}->{$sysname}->{$factorName})) - { - return $self->{'bleuScores'}->{$sysname}->{$factorName}; - } - - $self->ensureFilenameDefined('truth'); - $self->ensureFilenameDefined($sysname); - $self->ensureFactorPosDefined($factorName); - $self->loadSentences('truth', $self->{'truthFilename'}); - $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); - - #score structure: various total scores, arrayref of by-sentence score arrays - if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};} - if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];} - - my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - my $factorIndex = $self->{'factorIndices'}->{$factorName}; - for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++) - { - my ($truthSentence, $sysoutSentence) = ($self->{'truth'}->[$i], $self->{$sysname}->[$i]); - my ($unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength) = - $self->sentenceBLEU($truthSentence, $sysoutSentence, $factorIndex, 0); #last argument is whether to debug-print - push @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}, [$unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength]; - $good1 += $unigood; $tot1 += $unicount; - $good2 += $bigood; $tot2 += $bicount; - $good3 += $trigood; $tot3 += $tricount; - $good4 += $quadrugood; $tot4 += $quadrucount; - $totCLength += $cLength; - $totRLength += $rLength; - } - my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength); - my ($pct1, $pct2, $pct3, $pct4) = ($good1 / $tot1, $good2 / $tot2, $good3 / $tot3, $good4 / $tot4); -# warn sprintf("n-gram prec: %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf, %d/%d = %.3lf\n", -# $good1, $tot1, $pct1, $good2, $tot2, $pct2, $good3, $tot3, $pct3, $good4, $tot4, $pct4); - my $bleu = $brevity * exp((my_log($pct1) + my_log($pct2) + my_log($pct3) + my_log($pct4)) / 4); -# warn sprintf("brevity: %.3lf (%d ref, %d sys)\n", $brevity, $totRLength, $totCLength); - sleep 8; - $self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity]; - - #unhog memory - $self->releaseSentences('truth'); - $self->releaseSentences($sysname); - return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]}; -} - -#write HTML to be displayed to compare the various versions we have of each sentence in the corpus; -#allow filtering of which versions will be displayed -#(we don't write the whole page, just the contents of the body) -#arguments: filehandleref to which to write, regex to filter filename extensions to be included -#return: none -sub writeComparisonPage -{ - my ($self, $fh, $filter) = @_; - my @filteredExtensions = grep($filter, ('e', 'f', keys %{$self->{'sysoutFilenames'}})); - my %openedFiles = $self->openFiles(@filteredExtensions); - my $id = 1; #sentence ID string - while(my %lines = $self->readLineFromFiles(%openedFiles)) - { - $self->printSingleSentenceComparison($fh, $id, %lines); - $id++; - } - $self->closeFiles(%openedFiles); -} - -########################################################################################################## -##### INTERNAL ################################################################################### -########################################################################################################## - -#destructor! -#arguments: none -#return: none -sub DESTROY -{ - my $self = shift; - $self->writeCacheFile(); -} - -#write all scores in memory to disk -#arguments: none -#return: none -sub writeCacheFile -{ - my $self = shift; - if(!open(CACHEFILE, ">" . $self->{'cacheFilename'})) - { - warn "Corpus::writeCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for write\n"; - return; - } - - #store file changetimes to disk - print CACHEFILE "File changetimes\n"; - #store bleu scores to disk - print CACHEFILE "\nBLEU scores\n"; - foreach my $sysname (keys %{$self->{'bleuScores'}}) - { - foreach my $factorName (keys %{$self->{'bleuScores'}->{$sysname}}) - { - print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]}); - foreach my $sentenceBLEU (@{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}) - { - print CACHEFILE "; " . join(' ', @$sentenceBLEU); - } - print CACHEFILE "\n"; - } - } - #store unknown-token counts to disk - print CACHEFILE "\nUnknown-token counts\n"; - foreach my $factorName (keys %{$self->{'unknownCount'}}) - { - print CACHEFILE $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'totalTokens'} . "\n"; - } - #store WER, PWER to disk - print CACHEFILE "\nWER scores\n"; - my $printWERFunc = - sub - { - my $werType = shift; - foreach my $sysname (keys %{$self->{$werType}}) - { - foreach my $factorName (keys %{$self->{$werType}->{$sysname}}) - { - my ($totalWER, $sentenceWERs, $errorWords) = @{$self->{$werType}->{$sysname}->{$factorName}}; - print CACHEFILE "$werType $sysname $factorName $totalWER " . join(' ', @$sentenceWERs); - foreach my $indices (@$errorWords) - { - print CACHEFILE ";" . join(' ', @$indices); - } - print CACHEFILE "\n"; - } - } - }; - &$printWERFunc('sysoutWER'); - &$printWERFunc('sysoutPWER'); - #store misc scores to disk - print CACHEFILE "\nMisc scores\n"; - close(CACHEFILE); -} - -#load all scores present in the cache file into the appropriate fields of $self -#arguments: none -#return: none -sub loadCacheFile -{ - my $self = shift; - if(!open(CACHEFILE, "<" . $self->{'cacheFilename'})) - { - warn "Corpus::loadCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for read\n"; - return; - } - my $mode = 'none'; - while(my $line = <CACHEFILE>) - { - next if $line =~ /^[ \t\n\r\x0a]*$/; #anyone know why char 10 (0x0a) shows up on empty lines, at least on solaris? - chop $line; - #check for start of section - if($line eq "File changetimes\n") {$mode = 'ctime';} - elsif($line eq "BLEU scores\n") {$mode = 'bleu';} - elsif($line eq "Unknown-token counts\n") {$mode = 'unk';} - elsif($line eq "WER scores") {$mode = 'wer';} - elsif($line eq "Misc scores") {$mode = 'misc';} - #get data when in a mode already - elsif($mode eq 'ctime') - { - } - elsif($mode eq 'bleu') - { - local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3); - if(!defined($self->{'bleuScores'}->{$sysname})) {$self->{'bleuScores'}->{$sysname} = {};} - if(!defined($self->{'bleuScores'}->{$sysname}->{$factorName})) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];} - my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest); - $self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = shift @stats; - $self->{'bleuScores'}->{$sysname}->{$factorName}->[1] = \@stats; - } - elsif($mode eq 'unk') - { - local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line); - if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename) - { - $self->{'unknownCount'}->{$factorName} = $unknownCount; - $self->{'totalTokens'} = $totalCount; - } - } - elsif($mode eq 'wer') - { - local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER' - $details =~ /^([^;]*);(.*)/; - my @sentenceWERs = split(/\s+/, $1); - if(!defined($self->{$werType}->{$sysname})) {$self->{$werType}->{$sysname} = {};} - $self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, []]; - my @indexLists = split(/;/, $2); - sleep 6; - for(my $i = 0; $i < scalar(@sentenceWERs); $i++) - { - my @indices = split(/\s+/, $indexLists[$i]); - $self->{$werType}->{$sysname}->{$factorName}->[2] = \@indices; - } - } - elsif($mode eq 'misc') - { - } - } - close(CACHEFILE); -} - -##### utils ##### -#arguments: a, b (scalars) -sub min -{ - my ($a, $b) = @_; - return ($a < $b) ? $a : $b; -} -#arguments: a, b (scalars) -sub max -{ - my ($a, $b) = @_; - return ($a > $b) ? $a : $b; -} -#arguments: x -sub my_log -{ - return -9999999999 unless $_[0]; - return log($_[0]); -} - -#throw if the given factor doesn't have an index defined -#arguments: factor name -#return: none -sub ensureFactorPosDefined -{ - my ($self, $factorName) = @_; - if(!defined($self->{'factorIndices'}->{$factorName})) - { - throw Error::Simple(-text => "Corpus: no index known for factor '$factorName'\n"); - } -} - -#throw if the filename field corresponding to the argument hasn't been defined -#arguments: 'truth' | 'input' | a system name -#return: none -sub ensureFilenameDefined -{ - my ($self, $sysname) = @_; - if($sysname eq 'truth' || $sysname eq 'input') - { - if(!defined($self->{"${sysname}Filename"})) - { - throw Error::Simple(-text => "Corpus: no $sysname corpus defined\n"); - } - } - else - { - if(!defined($self->{'sysoutFilenames'}->{$sysname})) - { - throw Error::Simple(-text => "Corpus: no system $sysname defined\n"); - } - } -} - -#throw if there isn't a defined phrase-table filename for the given factor -#arguments: factor name -#return: none -sub ensurePhraseTableDefined -{ - my ($self, $factorName) = @_; - if(!defined($self->{'phraseTableFilenames'}->{$factorName})) - { - throw Error::Simple(-text => "Corpus: no phrase table defined for factor '$factorName'\n"); - } -} - -#search current directory for files with our corpus name as basename and set filename fields of $self -#arguments: hashref of filenames to descriptions -#return: none -sub locateFiles -{ - my ($self, $refDescs) = @_; - open(DIR, "ls -x1 . |") or die "Corpus::locateFiles(): couldn't list current directory\n"; - my $corpusName = $self->{'corpusName'}; - while(my $filename = <DIR>) - { - chop $filename; #remove \n - if($filename =~ /^$corpusName\.(.*)$/) - { - my $ext = $1; - if($ext eq 'e') {$self->{'truthFilename'} = $filename;} - elsif($ext eq 'f') {$self->{'inputFilename'} = $filename;} - elsif($ext =~ /pt_(.*)/) {$self->{'phraseTableFilenames'}->{$1} = $filename;} - else {$self->{'sysoutFilenames'}->{$ext} = $filename;} - if(defined($refDescs->{$filename})) - { - $self->{'fileDescriptions'}->{$filename} = $refDescs->{$filename}; - } - } - } - close(DIR); -# warn "truth is set to '" . $self->{'truthFilename'} . "'\n"; -# warn "input is set to '" . $self->{'inputFilename'} . "'\n"; -# warn "sysouts set are '" . join("', '", keys %{$self->{'sysoutFilenames'}}) . "'\n"; -# warn "phrase tables set are '" . join("', '", keys %{$self->{'phraseTableFilenames'}}) . "'\n"; -} - -#arguments: type ('truth' | 'input' | a string to represent a system output), filename -#pre: filename exists -#return: none -sub loadSentences -{ - my ($self, $sysname, $filename) = @_; - $self->{$sysname} = []; - open(INFILE, "<$filename") or die "Corpus::load(): couldn't open '$filename' for read\n"; - while(my $line = <INFILE>) - { - my @words = split(/\s+/, $line); - my $refFactors = []; - foreach my $word (@words) - { - my @factors = split(/\|/, $word); - push @$refFactors, \@factors; - } - push @{$self->{$sysname}}, $refFactors; - } - close(INFILE); -} - -#free the memory used for the given corpus (but NOT any associated calculations, eg WER) -#arguments: type ('truth' | 'input' | a string to represent a system output) -#return: none -sub releaseSentences -{ - my ($self, $sysname) = @_; - $self->{$sysname} = []; -} - -#arguments: factor name -#return: none -#throw if we don't have a filename for the given phrase table -sub loadPhraseTable -{ - my ($self, $factorName) = @_; - $self->ensurePhraseTableDefined($factorName); - - my $filename = $self->{'phraseTableFilenames'}->{$factorName}; - open(PTABLE, "<$filename") or die "couldn't open '$filename' for read\n"; - $self->{'phraseTables'}->{$factorName} = {}; #create ref to phrase table (hash of strings, for source phrases, to anything whatsoever) - #assume the table is sorted so that duplicate source phrases will be consecutive - while(my $line = <PTABLE>) - { - my @phrases = split(/\s*\|\|\|\s*/, $line, 2); - $self->{'phraseTables'}->{$factorName}->{$phrases[0]} = 0; #just so that it's set to something - } - close(PTABLE); -} - -#arguments: factor name -#return: none -sub releasePhraseTable -{ - my ($self, $factorName) = @_; - $self->{'phraseTables'}->{$factorName} = {}; -} - -#arguments: name of list ('nounAndAdj' | ...) -#return: arrayref of strings (postags) -sub getPOSTagList -{ - my ($self, $listname) = @_; - ##### assume PTB tagset ##### - if($listname eq 'nounAndAdj') {return ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'];} -# if($listname eq '') {return [];} -} - -#arguments: list to be filtered (arrayref of arrayrefs of factor strings), desired factor index, arrayref of allowable values -#return: filtered list as array of arrayrefs of factor strings -sub filterFactors -{ - my ($self, $refFullList, $index, $refFactorValues) = @_; - my $valuesRegex = join("|", @$refFactorValues); - my @filteredList = (); - foreach my $factors (@$refFullList) - { - if($factors->[$index] =~ m/$valuesRegex/) - { - push @filteredList, $factors; - } - } - return @filteredList; -} - -#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words -sub corpusWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $sentenceWER, $errIndices) = (0, [], []); - for(my $i = 0; $i < scalar(@$refSysOutput); $i++) - { - my ($sentWER, $indices) = $self->sentenceWER($refSysOutput->[$i], $refTruth->[$i], $index); - $totWER += $sentWER; - push @$sentenceWER, $sentWER; - push @$errIndices, $indices; - } - return ($totWER, $sentenceWER, $errIndices); -} - -#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of arrayrefs of indices of errorful words -sub sentenceWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $indices) = (0, []); - my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); - for(my $j = 0; $j < min($sLength, $eLength); $j++) - { - if(lc $refSysOutput->[$j]->[$index] ne lc $refTruth->[$j]->[$index]) #check output word against truth word in same position - { - $totWER++; - push @$indices, $j; - } - } - $totWER += max(0, $sLength - $eLength); - return ($totWER, $indices); -} - -#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words -sub corpusPWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $sentenceWER, $errIndices) = (0, [], []); - for(my $i = 0; $i < scalar(@$refSysOutput); $i++) - { - my ($sentWER, $indices) = $self->sentencePWER($refSysOutput->[$i], $refTruth->[$i], $index); - $totWER += $sentWER; - push @$sentenceWER, $sentWER; - push @$errIndices, $indices; - } - return ($totWER, $sentenceWER, $errIndices); -} - -#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use -#return: wer score, arrayref of arrayrefs of indices of errorful words -sub sentencePWER -{ - my ($self, $refSysOutput, $refTruth, $index) = @_; - my ($totWER, $indices) = (0, []); - my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); - my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once - for(my $j = 0; $j < $sLength; $j++) - { - my $found = 0; - for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence - { - if(lc $refSysOutput->[$j]->[$index] eq lc $refTruth->[$k]->[$index] && $truthWordUsed[$k] == 0) - { - $truthWordUsed[$k] = 1; - $found = 1; - } - } - if($found == 0) - { - $totWER++; - push @$indices, $j; - } - } - return ($totWER, $indices); -} - -#BLEU calculation for a single sentence -#arguments: truth sentence (arrayref of arrayrefs of factor strings), sysout sentence (same), factor index to use -#return: 1- through 4-gram matching and total counts (1-g match, 1-g tot, 2-g match...), candidate length, reference length -sub sentenceBLEU -{ - my ($self, $refTruth, $refSysOutput, $factorIndex, $debug) = @_; - my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput)); - my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0); - my %REF_GRAM = (); - my ($i, $gram); - for($i = 0; $i < $length_reference; $i++) - { - $gram = $refTruth->[$i]->[$factorIndex]; - $REF_GRAM{$gram}++; - next if $i<1; - $gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<2; - $gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<3; - $gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - } - for($i = 0; $i < $length_translation; $i++) - { - $gram = $refSysOutput->[$i]->[$factorIndex]; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct1 += 1; - } - next if $i<1; - $gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct2 += 1; - } - next if $i<2; - $gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct3 += 1; - } - next if $i<3; - $gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - if($debug != 0) {warn "'$gram' ";} - $correct4 += 1; - } - } - if($debug != 0) {warn "\n";} - my $total = $length_translation; - $total1 = max(1, $total); - $total2 = max(1, $total - 1); - $total3 = max(1, $total - 2); - $total4 = max(1, $total - 3); - if($debug != 0) - { - warn "BLEU($factorIndex)\n"; - warn "truth: " . join(' ', map {join('|', @$_)} @{$refTruth}) . "\n"; - warn "sysop: " . join(' ', map {join('|', @$_)} @{$refSysOutput}) . "\n"; - warn "stats: $correct1 / $total1, $correct2 / $total2, $correct3 / $total3, $correct4 / $total4\n"; - sleep 8; - } - - return ($correct1, $total1, $correct2, $total2, $correct3, $total3, $correct4, $total4, $length_translation, $length_reference); -} - -##### filesystem ##### - -#open as many given files as possible; only warn about the rest -#arguments: list of filename extensions to open (assume corpus name is file title) -#return: hash from type string to filehandleref, giving all files that were successfully opened -sub openFiles -{ - my ($self, @extensions) = @_; - my %openedFiles = (); - foreach my $ext (@extensions) - { - if(!open(FILE, "<" . $self->{'corpusName'} . $ext)) - { - warn "Corpus::openFiles(): couldn't open '" . $self->{'corpusName'} . $ext . "' for read\n"; - } - else #success - { - $openedFiles{$ext} = \*FILE; - } - } - return %openedFiles; -} - -#read one line from each given file -#arguments: hash from type string to filehandleref -#return: hash from type string to sentence (stored as arrayref of arrayrefs of factors) read from corresponding file -sub readLineFromFiles -{ - my ($self, %openedFiles) = @_; - my %lines; - foreach my $type (keys %openedFiles) - { - $lines{$type} = []; - my $sentence = <$openedFiles{$type}>; - my @words = split(/\s+/, $sentence); - foreach my $word (@words) - { - my @factors = split(/\|/, $word); - push @{$lines{$type}}, \@factors; - } - } - return %lines; -} - -#close all given files -#arguments: hash from type string to filehandleref -#return: none -sub closeFiles -{ - my ($self, %openedFiles) = @_; - foreach my $type (keys %openedFiles) - { - close($openedFiles{$type}); - } -} - -##### write HTML ##### - -#print HTML for comparing various versions of a sentence, with special processing for each version as appropriate -#arguments: filehandleref to which to write, sentence ID string, hashref of version string to sentence (stored as arrayref of arrayref of factor strings) -#return: none -sub printSingleSentenceComparison -{ - my ($self, $fh, $sentID, $sentences) = @_; - my $curFH = select; - select $fh; - #javascript to reorder rows to look nice afterward - print "<script type=\"text/javascript\"> - function reorder_$sentID() - {/* - var table = document.getElementById('div_$sentID').firstChild; - var refTransRow = table.getElementById('row_e'); - var inputRow = table.getElementById('row_f'); - table.removeRow(refTransRow); - table.removeRow(inputRow); - var newRow1 = table.insertRow(0); - var newRow2 = table.insertRow(1); - newRow1.childNodes = inputRow.childNodes; - newRow2.childNodes = refTransRow.childNodes;*/ - } - </script>"; - #html for sentences - print "<div id=\"div_$sentID\" style=\"padding: 3px; margin: 5px\">"; - print "<table border=\"1\">"; -# my $rowCount = 0; -# my @bgColors = ("#ffefbf", "#ffdf7f"); - #process all rows in order - foreach my $sentType (keys %$sentences) - { - my $bgcolor = $bgColors[$rowCount % 2]; - print "<tr id=\"row_$sentType\"><td align=right>"; - #description of sentence - if(defined($self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType})) - { - print "(" . $self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType} . ")"; - } - else - { - print "($sentType)"; - } - print "</td><td align=left>"; - #sentence with markup - if($sentType eq 'f') #input - { -# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $inputColor); - } - elsif($sentType eq 'e') #reference translation - { -# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $reftransColor); - } - else #system output - { -# $self->writeHTMLTranslationHighlightedWithFactors($fh, $sentences->{$sentType}, $sentences->{'e'}, $highlightColors); - } - print "</td></tr>"; -# $rowCount++; - } - print "</table>"; - print "</div>\n"; - select $curFH; -} |