From 1f6e9b488be31c79c7bf4977646e1b21c380b848 Mon Sep 17 00:00:00 2001
From: mphi
Date: Tue, 22 Jun 2010 20:17:42 +0000
Subject: the script now calculates the p-value and confidence intervals using not only BLEU but also the NIST score; improved confidence interval representation (avg+-stddev); fixed bugs

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3345 1f5c12ca-751b-0410-a591-d2e778427230
---
 ...bootstrap-hypothesis-difference-significance.pl | 338 ++++++++++++++++-----
 1 file changed, 257 insertions(+), 81 deletions(-)

diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
index 9592941e9..c8d47a011 100755
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@@ -8,7 +8,9 @@
 #
 # Author: Mark Fishel, fishel@ut.ee
 #
-# 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
+# 22.10.2008: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
+#
+# 23.01.2010: added NIST p-value and interval computation
 ###############################################
 
 use strict;
@@ -16,7 +18,7 @@ use strict;
 #constants
 my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
 my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
-my $MAX_NGRAMS_FOR_BLEU = 4;
+my $MAX_NGRAMS = 4;
 
 #checking cmdline argument consistency
 if (@ARGV < 3) {
@@ -34,6 +36,7 @@ print "reading data; " . `date`;
 
 #read all data
 my $data = readAllData(@ARGV);
+my $verbose = $ARGV[3];
 
 #calculate each sentence's contribution to BP and ngram precision
 print "performing preliminary calculations (hypothesis 1); " . `date`;
@@ -43,74 +46,113 @@ print "performing preliminary calculations (hypothesis 2); " . `date`;
 preEvalHypo($data, "hyp2");
 
 #start comparing
-print "comparing hypotheses; " . `date`;
+print "comparing hypotheses -- this may take some time; " . `date`;
 
-my @subSampleBleuDiffArr;
-my @subSampleBleu1Arr;
-my @subSampleBleu2Arr;
+bootstrap_report("BLEU", \&getBleu);
+bootstrap_report("NIST", \&getNist);
 
-#applying sampling
-for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
-    my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
-
-    my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
-    my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
-
-    push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
-    push @subSampleBleu1Arr, $bleu1;
-    push @subSampleBleu2Arr, $bleu2;
+#####
+# runs one full bootstrap comparison for the given score function and prints the report
+#####
+sub bootstrap_report {
+    my $title = shift;
+    my $proc = shift;
 
-    if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
-        print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
-    }
-}
 
-#get subsample bleu difference mean
-my $averageSubSampleBleuDiff = 0;
+    my ($subSampleScoreDiffArr, $subSampleScore1Arr, $subSampleScore2Arr) = bootstrap_pass($proc);
 
-for my $subSampleDiff (@subSampleBleuDiffArr) {
-    $averageSubSampleBleuDiff += $subSampleDiff;
-}
+    my $realScore1 = &$proc($data->{refs}, $data->{hyp1});
+    my $realScore2 = &$proc($data->{refs}, $data->{hyp2});
 
-$averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
+    my $scorePValue = bootstrap_pvalue($subSampleScoreDiffArr, $realScore1, $realScore2);
 
-print "average subsample bleu: $averageSubSampleBleuDiff " . `date`;
+    my ($scoreAvg1, $scoreVar1) = bootstrap_interval($subSampleScore1Arr);
+    my ($scoreAvg2, $scoreVar2) = bootstrap_interval($subSampleScore2Arr);
 
-#calculating p-value
-my $count = 0;
-
-my $realBleu1 = getBleu($data->{refs}, $data->{hyp1});
-my $realBleu2 = getBleu($data->{refs}, $data->{hyp2});
-
-print "actual BLEU of hypothesis 1: $realBleu1\n";
-print "actual BLEU of hypothesis 1: $realBleu2\n";
-
-my $realBleuDiff = abs($realBleu2 - $realBleu1);
+    print "\n---=== $title score ===---\n";
 
+    print "actual score of hypothesis 1: $realScore1\n";
+    print "95% confidence interval for hypothesis 1 score: $scoreAvg1 +- $scoreVar1\n-----\n";
+    print "actual score of hypothesis 2: $realScore2\n";
+    print "95% confidence interval for hypothesis 2 score: $scoreAvg2 +- $scoreVar2\n-----\n";
+    print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
+    print "the probability of actually getting them (p-value) is: $scorePValue.\n";
+}
 
-for my $subSampleDiff (@subSampleBleuDiffArr) {
-    my $op;
+#####
+# draws the subsamples, scoring both hypotheses on each of them
+#####
+sub bootstrap_pass {
+    my $scoreFunc = shift;
 
-    if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
-        $count++;
-        $op = ">=";
-    }
-    else {
-        $op = "< ";
+    my @subSampleDiffArr;
+    my @subSample1Arr;
+    my @subSample2Arr;
+
+    #applying sampling
+    for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
+        my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
+
+        my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices);
+        my $score2 = &$scoreFunc($data->{refs}, $data->{hyp2}, $subSampleIndices);
+
+        push @subSampleDiffArr, abs($score2 - $score1);
+        push @subSample1Arr, $score1;
+        push @subSample2Arr, $score2;
     }
 
-    #print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
+    return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr);
 }
 
-my $result = $count / $TIMES_TO_REPEAT_SUBSAMPLING;
+#####
+# computes the p-value from the mean-centered subsample score differences
+#####
+sub bootstrap_pvalue {
+    my $subSampleDiffArr = shift;
+    my $realScore1 = shift;
+    my $realScore2 = shift;
+
+    my $realDiff = abs($realScore2 - $realScore1);
+
+    #get subsample difference mean
+    my $averageSubSampleDiff = 0;
+
+    for my $subSampleDiff (@$subSampleDiffArr) {
+        $averageSubSampleDiff += $subSampleDiff;
+    }
+
+    $averageSubSampleDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
 
-print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
-print "the probability of actually getting them (p-value) is: $result.\n";
+    #calculating p-value
+    my $count = 0;
+
+    for my $subSampleDiff (@$subSampleDiffArr) {
+        if ($subSampleDiff - $averageSubSampleDiff >= $realDiff) {
+            $count++;
+        }
+    }
 
-my @sorted1 = sort @subSampleBleu1Arr;
-my @sorted2 = sort @subSampleBleu2Arr;
+    return $count / $TIMES_TO_REPEAT_SUBSAMPLING;
+}
 
-print "95% confidence interval for hypothesis 1: " . $sorted1[25] . " -- " . $sorted1[924] . "\n";
-print "95% confidence interval for hypothesis 2: " . $sorted2[25] . " -- " . $sorted2[924] . "\n";
+#####
+# derives a 95% confidence interval (midpoint +- half-width) from the sorted subsample scores
+#####
+sub bootstrap_interval {
+    my $subSampleArr = shift;
+
+    my @sorted = sort {$a <=> $b} @$subSampleArr; #numeric sort -- the default string sort would misorder the scores
+
+    my $lowerIdx = int($TIMES_TO_REPEAT_SUBSAMPLING / 40);
+    my $higherIdx = $TIMES_TO_REPEAT_SUBSAMPLING - $lowerIdx - 1;
+
+    my $lower = $sorted[$lowerIdx];
+    my $higher = $sorted[$higherIdx];
+    my $diff = $higher - $lower;
+
+    return ($lower + 0.5 * $diff, 0.5 * $diff);
+}
 
 #####
 # read 2 hyp and 1 to \infty ref data files
@@ -131,6 +173,7 @@ sub readAllData {
 
     #reading reference(s) and checking for matching sizes
     $result{refs} = [];
+    $result{ngramCounts} = { };
    my $i = 0;
 
    for my $refFile (@refFiles) {
@@ -141,12 +184,49 @@ sub readAllData {
            die ("ERROR: ref set $i size doesn't match the size of hyp sets");
        }
 
+        updateCounts($result{ngramCounts}, $refDataX);
+
        push @{$result{refs}}, $refDataX;
    }
 
    return \%result;
 }
 
+#####
+# updates the reference n-gram counts used for the NIST information weights
+#####
+sub updateCounts {
+    my ($countHash, $refData) = @_;
+
+    for my $snt(@$refData) {
+        my $size = scalar @{$snt->{words}};
+        $countHash->{""} += $size;
+
+        for my $order(1..$MAX_NGRAMS) {
+            my $ngram;
+
+            for my $i (0..($size-$order)) {
+                $ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]);
+
+                $countHash->{$ngram}++;
+            }
+        }
+    }
+}
+
+#####
+# NIST information weight of an n-gram: log2(count of its (n-1)-gram prefix / its own count)
+#####
+sub ngramInfo {
+    my ($data, $ngram) = @_;
+
+    my @nwords = split(/ /, $ngram);
+    pop @nwords;
+    my $smallGram = join(" ", @nwords);
+
+    return log($data->{ngramCounts}->{$smallGram} / $data->{ngramCounts}->{$ngram}) / log(2.0);
+}
+
 #####
 # read sentences from file
 #####
@@ -172,41 +252,64 @@ sub preEvalHypo {
    my $data = shift;
    my $hypId = shift;
 
+    for my $lineIdx (0..($data->{size} - 1)) {
+        preEvalHypoSnt($data, $hypId, $lineIdx);
+    }
+}
+
+#####
+# precomputes the per-sentence n-gram statistics of one hypothesis sentence
+#####
+sub preEvalHypoSnt {
+    my ($data, $hypId, $lineIdx) = @_;
+
    my ($correctNgramCounts, $totalNgramCounts);
    my ($refNgramCounts, $hypNgramCounts);
+    my ($coocNgramInfoSum, $totalNgramAmt);
 
-    for my $lineIdx (0..($data->{size} - 1)) {
-        my $hypSnt = $data->{$hypId}->[$lineIdx];
-
-        #update total hyp len
-        $hypSnt->{hyplen} = scalar @{$hypSnt->{words}};
+    my $hypSnt = $data->{$hypId}->[$lineIdx];
+
+    #update total hyp len
+    $hypSnt->{hyplen} = scalar @{$hypSnt->{words}};
+
+    #update total ref len with closest current ref len
+    $hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
+    $hypSnt->{avgreflen} = getAvgLength($data->{refs}, $lineIdx);
+
+    $hypSnt->{correctNgrams} = [];
+    $hypSnt->{totalNgrams} = [];
+
+    #update ngram precision for each n-gram order
+    for my $order (1..$MAX_NGRAMS) {
+        #hyp ngrams
+        $hypNgramCounts = groupNgrams($hypSnt, $order);
 
-        #update total ref len with closest current ref len
-        $hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
+        #ref ngrams
+        $refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order);
 
-        $hypSnt->{correctNgrams} = [];
-        $hypSnt->{totalNgrams} = [];
+        $correctNgramCounts = 0;
+        $totalNgramCounts = 0;
+        $coocNgramInfoSum = 0;
+        $totalNgramAmt = 0;
+        my $coocUpd;
 
-        #update ngram precision for each n-gram order
-        for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
-            #hyp ngrams
-            $hypNgramCounts = groupNgrams($hypSnt, $order);
-
-            #ref ngrams
-            $refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order);
+        #correct, total
+        for my $ngram (keys %$hypNgramCounts) {
+            $coocUpd = min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
+            $correctNgramCounts += $coocUpd;
+            $totalNgramCounts += $hypNgramCounts->{$ngram};
 
-            $correctNgramCounts = 0;
-            $totalNgramCounts = 0;
-
-            #correct, total
-            for my $ngram (keys %$hypNgramCounts) {
-                $correctNgramCounts += min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
-                $totalNgramCounts += $hypNgramCounts->{$ngram};
+            if ($coocUpd > 0) {
+                $coocNgramInfoSum += ngramInfo($data, $ngram);
             }
 
-            $hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
-            $hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
+            $totalNgramAmt++;
         }
+
+        $hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
+        $hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
+        $hypSnt->{ngramNistInfoSum}->[$order] = $coocNgramInfoSum;
+        $hypSnt->{ngramNistCount}->[$order] = $totalNgramAmt;
    }
 }
 
@@ -225,6 +328,56 @@ sub drawWithReplacement {
    return \@result;
 }
 
+#####
+# computes the NIST score of a hypothesis over the given subsample (all sentences by default)
+#####
+sub getNist {
+    my ($refs, $hyp, $idxs) = @_;
+
+    #default value for $idxs
+    unless (defined($idxs)) {
+        $idxs = [0..((scalar @$hyp) - 1)];
+    }
+
+    #vars
+    my ($hypothesisLength, $referenceLength) = (0, 0);
+    my (@infosum, @totalamt);
+
+    #gather info from each line
+    for my $lineIdx (@$idxs) {
+
+        my $hypSnt = $hyp->[$lineIdx];
+
+        #update total hyp len
+        $hypothesisLength += $hypSnt->{hyplen};
+
+        #update total ref len with the average ref len
+        $referenceLength += $hypSnt->{avgreflen};
+
+        #accumulate the NIST info sums and n-gram counts for each n-gram order
+        for my $order (1..$MAX_NGRAMS) {
+            $infosum[$order] += $hypSnt->{ngramNistInfoSum}->[$order];
+            $totalamt[$order] += $hypSnt->{ngramNistCount}->[$order];
+        }
+    }
+
+    my $toplog = log($hypothesisLength / $referenceLength);
+    my $btmlog = log(2.0/3.0);
+
+    #compose nist score
+    my $brevityPenalty = ($hypothesisLength > $referenceLength)? 1.0: exp(log(0.5) * $toplog * $toplog / ($btmlog * $btmlog));
+
+    my $sum = 0;
+
+    for my $order (1..$MAX_NGRAMS) {
+        $sum += $infosum[$order]/$totalamt[$order];
+    }
+
+    my $result = $sum * $brevityPenalty;
+
+    return $result;
+}
+
 #####
 # refs: arrayref of different references, reference = array of lines, line = array of words, word = string
 # hyp: arrayref of lines of hypothesis translation, line = array of words, word = string
@@ -254,7 +407,7 @@ sub getBleu {
        $referenceLength += $hypSnt->{reflen};
 
        #update ngram precision for each n-gram order
-        for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+        for my $order (1..$MAX_NGRAMS) {
            $correctNgramCounts[$order] += $hypSnt->{correctNgrams}->[$order];
            $totalNgramCounts[$order] += $hypSnt->{totalNgrams}->[$order];
        }
@@ -265,11 +418,28 @@ sub getBleu {
 
    my $logsum = 0;
 
-    for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+    for my $order (1..$MAX_NGRAMS) {
        $logsum += safeLog($correctNgramCounts[$order] / $totalNgramCounts[$order]);
    }
 
-    return $brevityPenalty * exp($logsum / $MAX_NGRAMS_FOR_BLEU);
+    return $brevityPenalty * exp($logsum / $MAX_NGRAMS);
+}
+
+#####
+# average reference length of the sentence with the given index
+#####
+sub getAvgLength {
+    my ($refs, $lineIdx) = @_;
+
+    my $result = 0;
+    my $count = 0;
+
+    for my $ref (@$refs) {
+        $result += scalar @{$ref->[$lineIdx]->{words}};
+        $count++;
+    }
+
+    return $result / $count;
 }
 
 #####
@@ -366,3 +536,9 @@ sub max {
 
    return ($a > $b)? $a: $b;
 }
+
+sub poww {
+    my ($a, $b) = @_;
+
+    return exp($b * log($a));
+}
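
For reference, the test that bootstrap_report() drives is the paired bootstrap of (Riezler and Maxwell 2005): draw sentence indices with replacement, rescore both hypotheses on every resample, and count how often the mean-centered subsample score difference is at least as large as the actually observed one. The following self-contained Perl sketch illustrates that logic on toy per-sentence scores; the variable names and the plain average standing in for the corpus-level BLEU/NIST computation are illustrative assumptions, not code from the patch.

    #!/usr/bin/perl
    # Minimal sketch of the paired bootstrap significance test
    # (Riezler and Maxwell 2005). Toy data and a simple average
    # replace the corpus-level metrics of the real script.
    use strict;
    use warnings;

    # made-up per-sentence scores for the two systems
    my @score1 = (0.30, 0.25, 0.40, 0.35, 0.28);
    my @score2 = (0.33, 0.27, 0.38, 0.41, 0.30);

    my $B = 1000;            # resamples, like $TIMES_TO_REPEAT_SUBSAMPLING
    my $n = scalar @score1;

    # observed score difference on the full test set
    my $realDiff = abs(avg(\@score2) - avg(\@score1));

    # paired bootstrap: the same resampled indices score both systems
    my @diffs;
    for (1 .. $B) {
        my @idx = map { int(rand($n)) } 1 .. $n;
        push @diffs, abs(avg([ @score2[@idx] ]) - avg([ @score1[@idx] ]));
    }

    # center the bootstrap differences at zero, then count how often the
    # centered difference reaches the actually observed one
    my $mean = avg(\@diffs);
    my $count = grep { $_ - $mean >= $realDiff } @diffs;

    printf "p-value: %.4f\n", $count / $B;

    sub avg {
        my $arr = shift;
        my $sum = 0;
        $sum += $_ for @$arr;
        return $sum / scalar @$arr;
    }

The patched script applies the same counting loop in bootstrap_pvalue(), and reuses the per-resample scores to report each hypothesis's 95% confidence interval as midpoint +- half-width of the 2.5%..97.5% percentile range.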