$sysoutfile vs. $truthfile: Sentence-by-sentence Comparison

"; #the %%%% is a flag to be replaced - $html .= "

Sentence $i) BLEU: " . sprintf("%.4lg", $bleuData->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$bleuData->[0]}[1 .. 4]) . ")

\n"; - $html .= "

Source

" . getFactoredSentenceHTML(\@sourceFactors) . "

\n" if defined $sourcefile; - $html .= "

Reference

" . getFactoredSentenceHTML(\@eFactors) . "

\n"; + my $html = "

"; #the %%%% and other tokens like it are flags to be replaced + $html .= "

Sentence $i) BLEU: " . sprintf("%.4lg", $bleuData->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$bleuData->[0]}[1 .. 4]) . ")

\n"; + if(defined $sourcefile) + { + $html .= "\n"; + } + for(my $j = 0; $j < scalar(@truthfiles); $j++) + { + $html .= "\n"; + } my $j = 0; - $html .= "

System Output

(PWER errors marked)

" . getFactoredSentenceHTML(\@sFactors, $pwerData) . "

\n"; + $html .= "\n"; $j = 0; - $html .= "

N-grams

" . getAllNgramsHTML(\@sFactors, $bleuData->[1]) . "

\n"; - $html .= "\n"; + $html .= "\n"; + $html .= "

Source	" . getFactoredSentenceHTML($sourceFactors) . "
Ref $j	" . getFactoredSentenceHTML($eFactors[$j]) . "
Output	" . getFactoredSentenceHTML(\@sFactors, $pwerData) . "
N-grams	" . getAllNgramsHTML(\@sFactors, $bleuData->[1], scalar(@truthfiles)) . "

\n"; push @htmlSentences, $html; $i++; } close(SYSOUT); -close(TRUTH); +foreach my $truthfh (@TRUTHS) {close($truthfh);} rankSentencesByBLEU(\@bleuScores); my $stylesheet = < -h2 {font-weight: bold; font-size: large; margin-bottom: 12px} -h4 {font-weight: bold; font-size: small} -#legend {background-color: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 15px} -#legend_title {font-weight: bold; font-size: medium; text-decoration: underline} -div.sentence {background-color: #ffffee; border: 1px solid #000088; padding: 0px 8px 0px 8px} //entire composition +.legend {background-color: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 10px; margin-right: 15px} +.legend_title {font-weight: bold; font-size: medium; text-decoration: underline} +div.sentence {background-color: #ffffee; border: 1px solid #000088; padding: 0px 8px 0px 8px} //entire composition for a given sentence +div.sentence td {margin: 8px 0px 8px 0px} div.bleu_report {margin-bottom: 5px} -div.source_sentence {background-color: #ffcccc; border: 1px solid #bbb; margin: 8px 0px 8px 0px} -div.truth_sentence {background-color: #ccffcc; border: 1px solid #bbb; margin: 8px 0px 8px 0px} -div.sysout_sentence {background-color: #ccccff; border: 1px solid #bbb; margin: 8px 0px 8px 0px} +td.sent_title {font-weight: bold; font-size: medium; margin-bottom: 12px} +.source_sentence {background-color: #ffcccc; border: 1px solid #bbb} +.truth_sentence {background-color: #ccffcc; border: 1px solid #bbb} +.sysout_sentence {background-color: #ccccff; border: 1px solid #bbb} table.sentence_table {border: none} -div.sysout_ngrams {background-color: #fff; border: 1px solid #bbb; margin-top: 8px; margin-bottom: 8px} +.sysout_ngrams {background-color: #fff; border: 1px solid #bbb} table.ngram_table {} td.ngram_cell {padding: 1px} EOHTML print "\n"; print "\n"; -print "$sysoutfile vs. $truthfile: Sentence-by-sentence Comparison$stylesheet\n"; +print "$sysoutfile vs. [" . join(' ', @truthfiles) . "]: Sentence-by-sentence Comparison$stylesheet\n"; + +#javascript to sort by BLEU, by order in corpus, ... +my %rank2index = map {$bleuScores[$_]->[2] => $_} (0 .. scalar(@htmlSentences) - 1); +print "\n"; #legend for background colors my @minBLEU = (1e9) x scalar(@htmlColors); @@ -105,25 +141,35 @@ for(my $k = 0; $k < scalar(@htmlSentences); $k++) if($bleuScores[$k]->[1] < $minBLEU[$tier]) {$minBLEU[$tier] = $bleuScores[$k]->[1];} elsif($bleuScores[$k]->[1] > $maxBLEU[$tier]) {$maxBLEU[$tier] = $bleuScores[$k]->[1];} } -print "

BLEU Ranges"; +print "

BLEU Ranges (sentence backgrounds)"; for(my $k = 0; $k < scalar(@htmlColors); $k++) { print ""; } -print "

" . sprintf("%.4lg", $minBLEU[$k]) . " - " . sprintf("%.4lg", $maxBLEU[$k]) . "

\n"; +print "

N-gram Colors => Number of Matching Reference Translations"; +for(my $k = 1; $k <= scalar(@truthfiles); $k++) +{ + print ""; +} +print "

+PWER errors are marked in red on output sentence displays.

Sort by BLEU score | corpus order (default)

\n"; #sentence boxes +print "

"; my $j = 0; foreach my $sentenceHTML (@htmlSentences) { - if($j > 0) {print "

";} + print "

"; + print "

"; my $bgcolor = getSentenceBGColorHTML($bleuScores[$j], $i); #i is now # of sentences $sentenceHTML =~ s/%%%%/$bgcolor/; - print "$sentenceHTML\n"; + print "$sentenceHTML

\n"; $j++; } -print ""; +print "

"; ##################### utils ##################### @@ -139,111 +185,153 @@ sub max my ($a, $b) = @_; return ($a > $b) ? $a : $b; } +#arguments: a list of elements +#return undef for an empty list, the max element otherwise +sub maxN +{ + if(scalar @_ == 0) {return undef;} + my $val = shift @_; + foreach my $e (@_) {if($e > $val) {$val = $e;}} + return $val; +} #arguments: x sub my_log { return -9999999999 unless $_[0]; return log($_[0]); } +#arguments: x +sub round +{ + my $x = shift; + return ($x - int($x) < .5) ? int($x) : int($x) + 1; +} ############################################################################################################################################################### -#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentence (same), factor index to use -#return: arrayref of [arrayref of [overall BLEU score, n-gram precisions], arrayref of matching n-gram [start index, length]] +#arguments: line read from corpus file, (optionally) string to die with if line isn't defined (default die-msg is empty) +#return: sentence (arrayref of arrayrefs of factor strings) taken from line +sub extractFactorArrays +{ + my ($line, $msg) = (shift, ''); + $msg = shift if scalar(@_); + die $msg if !defined $line; + chomp $line; + $line =~ s/^\s*|\s*$//g; #added by Ondrej to handle moses-mert-parallel output + my @words = split(/\s+/, $line); + my @factors = map {my @f = split(/\|/, $_); \@f;} @words; + return \@factors; +} + +#can handle multiple reference translations; assume at least one +#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use +#return: arrayref of [arrayref of [overall BLEU score, n-gram precisions], arrayref of matching n-gram [start index, length, arrayref of indices of matching references]] sub getBLEUSentenceDetails { - my ($refSysOutput, $refTruth, $factorIndex) = @_; - my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput)); - my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0); + my $maxNgramOrder = 4; + my ($refSysOutput, $refTruths, $factorIndex) = @_; + my $length_translation = scalar(@$refSysOutput); #length of sysout sentence + my @length_references = map {scalar(@$_)} @$refTruths; + my $closestTruthLength = (sort(map {abs($_ - $length_translation)} @length_references))[0]; + my @correct = (0) x $maxNgramOrder; #n-gram counts + my @total = (0) x $maxNgramOrder; #n-gram counts my $returnData = [[], []]; - my %REF_GRAM = (); + my %REF_GRAM; #hash from n-gram to arrayref with # of times found in each reference my $ngramMatches = []; #arrayref of n-gram [start index, length] - my ($i, $gram); - for($i = 0; $i < $length_reference; $i++) + for(my $j = 0; $j < scalar(@$refTruths); $j++) { - $gram = $refTruth->[$i]->[$factorIndex]; - $REF_GRAM{$gram}++; - next if $i<1; - $gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<2; - $gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; - next if $i<3; - $gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram; - $REF_GRAM{$gram}++; + for(my $i = 0; $i < $length_references[$j]; $i++) + { + my $gram = ''; + for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders + { + $gram = $refTruths->[$j]->[$i - $k]->[$factorIndex] . " " . $gram; + #increment the count for the given n-gram and given reference number + if(!exists $REF_GRAM{$gram}) + { + my @tmp = (0) x scalar @$refTruths; + $tmp[$j] = 1; + $REF_GRAM{$gram} = \@tmp; + } + else + { + $REF_GRAM{$gram}->[$j]++; + } + } + } } - for($i = 0; $i < $length_translation; $i++) + for(my $i = 0; $i < $length_translation; $i++) { - $gram = $refSysOutput->[$i]->[$factorIndex]; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct1++; - push @$ngramMatches, [$i, 1]; - } - next if $i<1; - $gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct2++; - push @$ngramMatches, [$i - 1, 2]; - } - next if $i<2; - $gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct3++; - push @$ngramMatches, [$i - 2, 3]; - } - next if $i<3; - $gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram; - if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { - $REF_GRAM{$gram}--; - $correct4++; - push @$ngramMatches, [$i - 3, 4]; - } + my $gram = ''; + for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders + { + $gram = $refSysOutput->[$i - $k]->[$factorIndex] . " " . $gram; + if(exists $REF_GRAM{$gram}) #this n-gram was found in at least one reference + { + $correct[$k]++; + my @indices = (); + for(my $m = 0; $m < scalar(@{$REF_GRAM{$gram}}); $m++) + { + if($REF_GRAM{$gram}->[$m] > 0) + { + push @indices, $m; + $REF_GRAM{$gram}->[$m]--; + } + } + push @$ngramMatches, [$i - $k, $k + 1, \@indices]; + } + } } - my $total = $length_translation; - $total1 = max(1, $total); - $total2 = max(1, $total - 1); - $total3 = max(1, $total - 2); - $total4 = max(1, $total - 3); - my $brevity = ($length_translation > $length_reference || $length_translation == 0) ? 1 : exp(1 - $length_reference / $length_translation); - my ($pct1, $pct2, $pct3, $pct4) = ($total1 == 0 ? -1 : $correct1 / $total1, $total2 == 0 ? -1 : $correct2 / $total2, - $total3 == 0 ? -1 : $correct3 / $total3, $total4 == 0 ? -1 : $correct4 / $total4); + my $brevity = ($length_translation > $closestTruthLength || $length_translation == 0) ? 1 : exp(1 - $closestTruthLength / $length_translation); + my @pct; my ($logsum, $logcount) = (0, 0); - if($total1 > 0) {$logsum += my_log($pct1); $logcount++;} - if($total2 > 0) {$logsum += my_log($pct2); $logcount++;} - if($total3 > 0) {$logsum += my_log($pct3); $logcount++;} - if($total4 > 0) {$logsum += my_log($pct4); $logcount++;} + for(my $i = 0; $i < $maxNgramOrder; $i++) + { + $total[$i] = max(1, $length_translation - $i); + push @pct, ($total[$i] == 0) ? -1 : $correct[$i] / $total[$i]; + if($total[$i] > 0) + { + $logsum += my_log($pct[$i]); + $logcount++; + } + } my $bleu = $brevity * exp($logsum / $logcount); - $returnData->[0] = [$bleu, $pct1, $pct2, $pct3, $pct4]; + $returnData->[0] = [$bleu, @pct]; $returnData->[1] = $ngramMatches; return $returnData; } -#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentence (same), factor index to use +#can handle multiple sentence translations; assume at least one +#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use #return: hashref of sysout word index => whether word matches sub getPWERSentenceDetails { - my ($refSysOutput, $refTruth, $factorIndex) = @_; - my $indices = {}; - my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); - my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once - for(my $j = 0; $j < $sLength; $j++) + my ($refSysOutput, $refTruths, $factorIndex) = @_; + my $matches = {}; + my %truthWords; #hash from word to arrayref with number of times seen in each reference (but later holds only the max) + for(my $i = 0; $i < scalar(@$refTruths); $i++) { - $indices->{$j} = 0; - for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence + foreach my $eWord (@{$refTruths->[$i]}) { - if(lc $refSysOutput->[$j]->[$factorIndex] eq lc $refTruth->[$k]->[$factorIndex] && $truthWordUsed[$k] == 0) - { - $truthWordUsed[$k] = 1; - $indices->{$j} = 1; - last; - } + my $factor = $eWord->[$factorIndex]; + if(exists $truthWords{$factor}) {$truthWords{$factor}->[$i]++;} + else {my @tmp = (0) x scalar(@$refTruths); $tmp[$i] = 1; $truthWords{$factor} = \@tmp;} } } - return $indices; + %truthWords = map {$_ => maxN(@{$truthWords{$_}})} (keys %truthWords); #save only the max times each word is seen in a reference + for(my $j = 0; $j < scalar(@$refSysOutput); $j++) + { + if(exists $truthWords{$refSysOutput->[$j]->[$factorIndex]} && $truthWords{$refSysOutput->[$j]->[$factorIndex]} > 0) + { + $truthWords{$refSysOutput->[$j]->[$factorIndex]}--; + $matches->{$j} = 1; + } + else + { + $matches->{$j} = 0; + } + } + return $matches; } #assign ranks to sentences by BLEU score @@ -287,11 +375,12 @@ sub getSentenceBGColorHTML #display all matching n-grams in the given sentence, with all 1-grams on one line, then arranged by picking, for each, the first line on which it fits, # where a given word position can only be filled by one n-gram per line, so that all n-grams can be shown -#arguments: sentence (arrayref of arrayrefs of factor strings), arrayref of arrayrefs of matching n-gram [start, length] +#arguments: sentence (arrayref of arrayrefs of factor strings), arrayref of arrayrefs of matching n-gram [start, length, arrayref of matching reference indices], +# number of reference translations #return: HTML string sub getAllNgramsHTML { - my ($sentence, $ngrams) = @_; + my ($sentence, $ngrams, $numTruths) = @_; my $factorIndex = 0; my @table = (); #array or arrayrefs each of which represents a line; each position has the index of the occupying n-gram, or -1 if none my $n = 0; #n-gram index @@ -325,14 +414,12 @@ sub getAllNgramsHTML my $html = ""; my $numWords = scalar(@$sentence); - my ($curRow, $curCol) = (0, 0); - my $colorIndex = 0; + my ($curRow, $curCol) = (0, 0); #address in table $html .= ""; - foreach my $ngram (sort {my $c = $a->[2] <=> $b->[2]; if($c == 0) {$a->[0] <=> $b->[0]} else {$c}} @$ngrams) #sort by row, then word num + foreach my $ngram (sort {my $c = $a->[3] <=> $b->[3]; if($c == 0) {$a->[0] <=> $b->[0]} else {$c}} @$ngrams) #sort by row, then word num { - while($ngram->[0] > $curCol || $ngram->[2] > $curRow) {$html .= ""; $curCol = ($curCol + 1) % $numWords; if($curCol == 0) {$html .= ""; $curRow++;}} - $html .= ""; - $colorIndex = ($colorIndex + 1) % scalar(@ngramColors); + while($ngram->[0] > $curCol || $ngram->[3] > $curRow) {$html .= ""; $curCol = ($curCol + 1) % $numWords; if($curCol == 0) {$html .= ""; $curRow++;}} + $html .= ""; $curCol = ($curCol + $ngram->[1]) % $numWords; if($curCol == 0) {$html .= ""; $curRow++;} } $html .= ""; @@ -349,15 +436,12 @@ sub rowIsClear return (maxN(@{$row}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) == -1) ? 1 : 0; } -#arguments: array of numeric values -#return: max value, or empty list if input list is empty -sub maxN +#auxiliary to getAllNgramsHTML() +#arguments: number of reference translations matching the n-gram, total number of references +#return: HTML color string +sub getNgramColorHTML { - if(scalar(@_) == 0) {return ();} - my $max = $_[0]; - for(my $i = 1; $i < scalar(@_); $i++) - { - if($_[$i] > $max) {$max = $_[$i];} - } - return $max; + my ($matches, $total) = @_; + if($total == 1) {return $ngramSingleRefColor;} + return $ngramMultirefColors[round($matches / $total * (scalar(@ngramMultirefColors) - 1))]; } diff --git a/scripts/analysis/smtgui/README b/scripts/analysis/smtgui/README index c86cd9c1a..e6bcabb2e 100644 --- a/scripts/analysis/smtgui/README +++ b/scripts/analysis/smtgui/README @@ -29,3 +29,14 @@ $ $DATA/test/combine-features.perl CORPUS lc+pos lemma > CORPUS.lc+pos+lemma $ rm CORPUS.pos-tmp (cleanup) where $BIN=/export/ws06osmt/bin, $DATA=/export/ws06osmt/data. + +To get German POS tags and lemmas from a words-only corpus (the first step must be run on linux): + +$ $BIN/recase.perl --in CORPUS.lc --model $MODELS/en-de/recaser/pharaoh.ini > CORPUS.recased (call pharaoh with a lowercase->uppercase model) +$ $BIN/run-lopar-tagger-lowercase.perl CORPUS.recased CORPUS.recased.lopar (call LOPAR) +$ $DATA/test/factor-stem.de.perl < CORPUS.recased.lopar > CORPUS.stem +$ $BIN/lowercase.latin1.perl < CORPUS.stem > CORPUS.lcstem (as you might guess, assumes latin-1 encoding) +$ $DATA/test/factor-pos.de.perl < CORPUS.recased.lopar > CORPUS.pos +$ $DATA/test/combine-features.perl CORPUS lc pos lcstem > CORPUS.lc+pos+lcstem + +where $MODELS=/export/ws06osmt/models. diff --git a/scripts/analysis/smtgui/file-descriptions b/scripts/analysis/smtgui/file-descriptions index caf1507e6..5c1f9153f 100644 --- a/scripts/analysis/smtgui/file-descriptions +++ b/scripts/analysis/smtgui/file-descriptions @@ -1,3 +1,4 @@ devtest2006.de-en.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run devtest2006.de-en.matrix05-baseline.moses-2006-07-20 Moses baseline run devtest2006.en-de.matrix05-baseline.pharaoh Pharaoh JHUWS baseline run +devtest2006.en-de.matrix05-moses.2006-08-02 Moses baseline run diff --git a/scripts/analysis/smtgui/file-factors b/scripts/analysis/smtgui/file-factors index fad031ec4..7938e9297 100644 --- a/scripts/analysis/smtgui/file-factors +++ b/scripts/analysis/smtgui/file-factors @@ -4,3 +4,5 @@ devtest2006.de-en : surf pos lemma : surf europarl.de.srilm.gz : surf europarl.e devtest2006.en-de : surf pos lemma : surf europarl.en.srilm.gz : surf europarl.de.srilm.gz #pstem: lemmas come from the Porter stemmer (and so are really a mix of stems and lemmas) pstem_devtest2006.de-en : surf pos lemma : : surf europarl.en.srilm.gz +#replace esset with ss in German text +ss_devtest2006.en-de : surf pos lemma : surf europarl.en.srilm.gz : surf ss_europarl.de.srilm.gz -- cgit v1.2.3

" . join("	", map {$_->[$factorIndex]} @$sentence) . "

" . join(' ', map {$_->[$factorIndex]} @{$sentence}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) . "
[2]}), $numTruths) . "\">" . join(' ', map {$_->[$factorIndex]} @{$sentence}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) . "