github.com/moses-smt/mosesdecoder.git
author    mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230>  2010-06-23 00:17:42 +0400
committer mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230>  2010-06-23 00:17:42 +0400
commit    1f6e9b488be31c79c7bf4977646e1b21c380b848 (patch)
tree      9f59601be534a82ed335e5b96c9eee90118731ea /scripts/analysis
parent    a21c9bff6805206e33e10b11fed1ad6ceb85a2fb (diff)
The script now calculates the p-value and confidence intervals not only with BLEU but also with the NIST score; improved the confidence interval representation (avg +- stddev); fixed bugs.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3345 1f5c12ca-751b-0410-a591-d2e778427230
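
For orientation, the script expects the two hypothesis files followed by one or more reference files, and it aborts if fewer than three arguments are given. A typical invocation might look like the following; the file names are hypothetical, and the argument order is inferred from the script's own header comment ("read 2 hyp and 1 to \infty ref data files"):

perl scripts/analysis/bootstrap-hypothesis-difference-significance.pl hyp1.txt hyp2.txt ref1.txt [ref2.txt ...]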
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x  scripts/analysis/bootstrap-hypothesis-difference-significance.pl | 338
1 file changed, 257 insertions(+), 81 deletions(-)
diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
index 9592941e9..c8d47a011 100755
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@@ -8,7 +8,9 @@
#
# Author: Mark Fishel, fishel@ut.ee
#
-# 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
+# 22.10.2008: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
+#
+# 23.01.2010: added NIST p-value and interval computation
###############################################
use strict;
@@ -16,7 +18,7 @@ use strict;
#constants
my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
-my $MAX_NGRAMS_FOR_BLEU = 4;
+my $MAX_NGRAMS = 4;
#checking cmdline argument consistency
if (@ARGV < 3) {
@@ -34,6 +36,7 @@ print "reading data; " . `date`;
#read all data
my $data = readAllData(@ARGV);
+my $verbose = $ARGV[3];
#calculate each sentence's contribution to BP and ngram precision
print "performing preliminary calculations (hypothesis 1); " . `date`;
@@ -43,74 +46,113 @@ print "performing preliminary calculations (hypothesis 2); " . `date`;
preEvalHypo($data, "hyp2");
#start comparing
-print "comparing hypotheses; " . `date`;
+print "comparing hypotheses -- this may take some time; " . `date`;
-my @subSampleBleuDiffArr;
-my @subSampleBleu1Arr;
-my @subSampleBleu2Arr;
+bootstrap_report("BLEU", \&getBleu);
+bootstrap_report("NIST", \&getNist);
-#applying sampling
-for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
- my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
-
- my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
- my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
-
- push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
- push @subSampleBleu1Arr, $bleu1;
- push @subSampleBleu2Arr, $bleu2;
+#####
+# draw bootstrap subsamples with the given score function, then print real scores, confidence intervals and the p-value
+#####
+sub bootstrap_report {
+ my $title = shift;
+ my $proc = shift;
- if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
- print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
- }
-}
+ my ($subSampleScoreDiffArr, $subSampleScore1Arr, $subSampleScore2Arr) = bootstrap_pass($proc);
-#get subsample bleu difference mean
-my $averageSubSampleBleuDiff = 0;
+ my $realScore1 = &$proc($data->{refs}, $data->{hyp1});
+ my $realScore2 = &$proc($data->{refs}, $data->{hyp2});
-for my $subSampleDiff (@subSampleBleuDiffArr) {
- $averageSubSampleBleuDiff += $subSampleDiff;
-}
+ my $scorePValue = bootstrap_pvalue($subSampleScoreDiffArr, $realScore1, $realScore2);
-$averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
+ my ($scoreAvg1, $scoreVar1) = bootstrap_interval($subSampleScore1Arr);
+ my ($scoreAvg2, $scoreVar2) = bootstrap_interval($subSampleScore2Arr);
-print "average subsample bleu: $averageSubSampleBleuDiff " . `date`;
+ print "\n---=== $title score ===---\n";
-#calculating p-value
-my $count = 0;
-
-my $realBleu1 = getBleu($data->{refs}, $data->{hyp1});
-my $realBleu2 = getBleu($data->{refs}, $data->{hyp2});
-
-print "actual BLEU of hypothesis 1: $realBleu1\n";
-print "actual BLEU of hypothesis 1: $realBleu2\n";
-
-my $realBleuDiff = abs($realBleu2 - $realBleu1);
+ print "actual score of hypothesis 1: $realScore1\n";
+ print "95% confidence interval for hypothesis 1 score: $scoreAvg1 +- $scoreVar1\n-----\n";
+ print "actual score of hypothesis 1: $realScore2\n";
+ print "95% confidence interval for hypothesis 2 score: $scoreAvg2 +- $scoreVar2\n-----\n";
+ print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
+ print "the probability of actually getting them (p-value) is: $scorePValue.\n";
+}
-for my $subSampleDiff (@subSampleBleuDiffArr) {
- my $op;
+#####
+# repeatedly draw subsamples with replacement, collecting each system's score and the absolute score difference
+#####
+sub bootstrap_pass {
+ my $scoreFunc = shift;
- if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
- $count++;
- $op = ">=";
- }
- else {
- $op = "< ";
+ my @subSampleDiffArr;
+ my @subSample1Arr;
+ my @subSample2Arr;
+
+ #applying sampling
+ for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
+ my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
+
+ my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices);
+ my $score2 = &$scoreFunc($data->{refs}, $data->{hyp2}, $subSampleIndices);
+
+ push @subSampleDiffArr, abs($score2 - $score1);
+ push @subSample1Arr, $score1;
+ push @subSample2Arr, $score2;
}
- #print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
+ return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr);
}
-my $result = $count / $TIMES_TO_REPEAT_SUBSAMPLING;
+#####
+# p-value: the share of mean-centered subsample score differences at least as large as the real score difference
+#####
+sub bootstrap_pvalue {
+ my $subSampleDiffArr = shift;
+ my $realScore1 = shift;
+ my $realScore2 = shift;
+
+ my $realDiff = abs($realScore2 - $realScore1);
+
+ #get subsample difference mean
+ my $averageSubSampleDiff = 0;
+
+ for my $subSampleDiff (@$subSampleDiffArr) {
+ $averageSubSampleDiff += $subSampleDiff;
+ }
+
+ $averageSubSampleDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
-print "Assuming that essentially the same system generated the two hypothesis translations (null-hypothesis),\n";
-print "the probability of actually getting them (p-value) is: $result.\n";
+ #calculating p-value
+ my $count = 0;
+
+ for my $subSampleDiff (@$subSampleDiffArr) {
+ if ($subSampleDiff - $averageSubSampleDiff >= $realDiff) {
+ $count++;
+ }
+ }
-my @sorted1 = sort @subSampleBleu1Arr;
-my @sorted2 = sort @subSampleBleu2Arr;
+ return $count / $TIMES_TO_REPEAT_SUBSAMPLING;
+}
-print "95% confidence interval for hypothesis 1: " . $sorted1[25] . " -- " . $sorted1[924] . "\n";
-print "95% confidence interval for hypothesis 2: " . $sorted2[25] . " -- " . $sorted2[924] . "\n";
+#####
+# 95% confidence interval over the sorted subsample scores, returned as (midpoint, half-width)
+#####
+sub bootstrap_interval {
+ my $subSampleArr = shift;
+
+ my @sorted = sort { $a <=> $b } @$subSampleArr; # numeric sort; the default string sort would misorder scores like 10.2 vs 9.8
+
+ my $lowerIdx = int($TIMES_TO_REPEAT_SUBSAMPLING / 40); # 2.5th percentile (25 of 1000)
+ my $higherIdx = $TIMES_TO_REPEAT_SUBSAMPLING - $lowerIdx - 1; # 97.5th percentile (974 of 1000)
+
+ my $lower = $sorted[$lowerIdx];
+ my $higher = $sorted[$higherIdx];
+ my $diff = $higher - $lower;
+
+ return ($lower + 0.5 * $diff, 0.5 * $diff);
+}
#####
# read 2 hyp and 1 to \infty ref data files
@@ -131,6 +173,7 @@ sub readAllData {
#reading reference(s) and checking for matching sizes
$result{refs} = [];
+ $result{ngramCounts} = { };
my $i = 0;
for my $refFile (@refFiles) {
@@ -141,6 +184,8 @@ sub readAllData {
die ("ERROR: ref set $i size doesn't match the size of hyp sets");
}
+ updateCounts($result{ngramCounts}, $refDataX);
+
push @{$result{refs}}, $refDataX;
}
@@ -148,6 +193,41 @@ sub readAllData {
}
#####
+# accumulate reference n-gram counts (orders 1..$MAX_NGRAMS) for the NIST information weights
+#####
+sub updateCounts {
+ my ($countHash, $refData) = @_;
+
+ for my $snt(@$refData) {
+ my $size = scalar @{$snt->{words}};
+ $countHash->{""} += $size;
+
+ for my $order(1..$MAX_NGRAMS) {
+ my $ngram;
+
+ for my $i (0..($size-$order)) {
+ $ngram = join(" ", @{$snt->{words}}[$i..($i + $order - 1)]);
+
+ $countHash->{$ngram}++;
+ }
+ }
+ }
+}
+
+#####
+# NIST information weight of an n-gram: log2(count of its (n-1)-gram prefix / count of the n-gram)
+#####
+sub ngramInfo {
+ my ($data, $ngram) = @_;
+
+ my @nwords = split(/ /, $ngram);
+ pop @nwords;
+ my $smallGram = join(" ", @nwords);
+
+ return log($data->{ngramCounts}->{$smallGram} / $data->{ngramCounts}->{$ngram}) / log(2.0);
+}
+
+#####
# read sentences from file
#####
sub readData {
@@ -172,41 +252,64 @@ sub preEvalHypo {
my $data = shift;
my $hypId = shift;
+ for my $lineIdx (0..($data->{size} - 1)) {
+ preEvalHypoSnt($data, $hypId, $lineIdx);
+ }
+}
+
+#####
+# precompute one sentence's statistics: hyp/ref lengths, matching n-gram counts, NIST info sums
+#####
+sub preEvalHypoSnt {
+ my ($data, $hypId, $lineIdx) = @_;
+
my ($correctNgramCounts, $totalNgramCounts);
my ($refNgramCounts, $hypNgramCounts);
+ my ($coocNgramInfoSum, $totalNgramAmt);
- for my $lineIdx (0..($data->{size} - 1)) {
- my $hypSnt = $data->{$hypId}->[$lineIdx];
-
- #update total hyp len
- $hypSnt->{hyplen} = scalar @{$hypSnt->{words}};
+ my $hypSnt = $data->{$hypId}->[$lineIdx];
+
+ #update total hyp len
+ $hypSnt->{hyplen} = scalar @{$hypSnt->{words}};
+
+ #update total ref len with closest current ref len
+ $hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
+ $hypSnt->{avgreflen} = getAvgLength($data->{refs}, $lineIdx);
+
+ $hypSnt->{correctNgrams} = [];
+ $hypSnt->{totalNgrams} = [];
+
+ #update ngram precision for each n-gram order
+ for my $order (1..$MAX_NGRAMS) {
+ #hyp ngrams
+ $hypNgramCounts = groupNgrams($hypSnt, $order);
- #update total ref len with closest current ref len
- $hypSnt->{reflen} = getClosestLength($data->{refs}, $lineIdx, $hypSnt->{hyplen});
+ #ref ngrams
+ $refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order);
- $hypSnt->{correctNgrams} = [];
- $hypSnt->{totalNgrams} = [];
+ $correctNgramCounts = 0;
+ $totalNgramCounts = 0;
+ $coocNgramInfoSum = 0;
+ $totalNgramAmt = 0;
+ my $coocUpd;
- #update ngram precision for each n-gram order
- for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
- #hyp ngrams
- $hypNgramCounts = groupNgrams($hypSnt, $order);
-
- #ref ngrams
- $refNgramCounts = groupNgramsMultiSrc($data->{refs}, $lineIdx, $order);
+ #correct, total
+ for my $ngram (keys %$hypNgramCounts) {
+ $coocUpd = min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
+ $correctNgramCounts += $coocUpd;
+ $totalNgramCounts += $hypNgramCounts->{$ngram};
- $correctNgramCounts = 0;
- $totalNgramCounts = 0;
-
- #correct, total
- for my $ngram (keys %$hypNgramCounts) {
- $correctNgramCounts += min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
- $totalNgramCounts += $hypNgramCounts->{$ngram};
+ if ($coocUpd > 0) {
+ $coocNgramInfoSum += ngramInfo($data, $ngram);
}
- $hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
- $hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
+ $totalNgramAmt++;
}
+
+ $hypSnt->{correctNgrams}->[$order] = $correctNgramCounts;
+ $hypSnt->{totalNgrams}->[$order] = $totalNgramCounts;
+ $hypSnt->{ngramNistInfoSum}->[$order] = $coocNgramInfoSum;
+ $hypSnt->{ngramNistCount}->[$order] = $totalNgramAmt;
}
}
@@ -226,6 +329,56 @@ sub drawWithReplacement {
}
#####
+# NIST score over the given line indices (defaults to the whole set)
+#####
+sub getNist {
+ my ($refs, $hyp, $idxs) = @_;
+
+ #default value for $idxs
+ unless (defined($idxs)) {
+ $idxs = [0..((scalar @$hyp) - 1)];
+ }
+
+ #vars
+ my ($hypothesisLength, $referenceLength) = (0, 0);
+ my (@infosum, @totalamt);
+
+ #gather info from each line
+ for my $lineIdx (@$idxs) {
+
+ my $hypSnt = $hyp->[$lineIdx];
+
+ #update total hyp len
+ $hypothesisLength += $hypSnt->{hyplen};
+
+ #update total ref len with closest current ref len
+ $referenceLength += $hypSnt->{avgreflen};
+
+ #update ngram precision for each n-gram order
+ for my $order (1..$MAX_NGRAMS) {
+ $infosum[$order] += $hypSnt->{ngramNistInfoSum}->[$order];
+ $totalamt[$order] += $hypSnt->{ngramNistCount}->[$order];
+ }
+ }
+
+ my $toplog = log($hypothesisLength / $referenceLength);
+ my $btmlog = log(2.0/3.0); # chosen so the brevity penalty equals 0.5 at a hyp/ref length ratio of 2/3
+
+ #compose nist score
+ my $brevityPenalty = ($hypothesisLength > $referenceLength)? 1.0: exp(log(0.5) * $toplog * $toplog / ($btmlog * $btmlog));
+
+ my $sum = 0;
+
+ for my $order (1..$MAX_NGRAMS) {
+ $sum += $infosum[$order]/$totalamt[$order];
+ }
+
+ my $result = $sum * $brevityPenalty;
+
+ return $result;
+}
+
+#####
# refs: arrayref of different references, reference = array of lines, line = array of words, word = string
# hyp: arrayref of lines of hypothesis translation, line = array of words, word = string
# idxs: indices of lines to include; default value - full set (0..size_of_hyp-1)
@@ -254,7 +407,7 @@ sub getBleu {
$referenceLength += $hypSnt->{reflen};
#update ngram precision for each n-gram order
- for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+ for my $order (1..$MAX_NGRAMS) {
$correctNgramCounts[$order] += $hypSnt->{correctNgrams}->[$order];
$totalNgramCounts[$order] += $hypSnt->{totalNgrams}->[$order];
}
@@ -265,11 +418,28 @@ sub getBleu {
my $logsum = 0;
- for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+ for my $order (1..$MAX_NGRAMS) {
$logsum += safeLog($correctNgramCounts[$order] / $totalNgramCounts[$order]);
}
- return $brevityPenalty * exp($logsum / $MAX_NGRAMS_FOR_BLEU);
+ return $brevityPenalty * exp($logsum / $MAX_NGRAMS);
+}
+
+#####
+# average reference length for one line, used by the NIST brevity penalty
+#####
+sub getAvgLength {
+ my ($refs, $lineIdx) = @_;
+
+ my $result = 0;
+ my $count = 0;
+
+ for my $ref (@$refs) {
+ $result += scalar @{$ref->[$lineIdx]->{words}};
+ $count++;
+ }
+
+ return $result / $count;
}
#####
@@ -366,3 +536,9 @@ sub max {
return ($a > $b)? $a: $b;
}
+
+
+# $a raised to the power $b (exp/log form of $a ** $b)
+sub poww {
+
+ return exp($b * log($a));
+}
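
As a companion to the patch, here is a minimal self-contained sketch of the centered paired-bootstrap test that bootstrap_pass() and bootstrap_pvalue() implement (after Riezler and Maxwell 2005). The toy per-sentence scores and the averaging score() function are hypothetical stand-ins for the corpus-level getBleu/getNist; only the resampling and p-value logic mirrors the script.

#!/usr/bin/perl
use strict;
use warnings;
use List::Util qw(sum);

my $N      = 1000;                                  # bootstrap iterations
my @score1 = map { 0.30 + rand(0.02) } 1 .. 500;    # toy per-sentence scores, system 1
my @score2 = map { 0.31 + rand(0.02) } 1 .. 500;    # toy per-sentence scores, system 2
my $size   = scalar @score1;

# toy corpus-level score: plain average over the chosen indices
sub score {
    my ($scores, $idx) = @_;
    return sum(map { $scores->[$_] } @$idx) / scalar @$idx;
}

my @all       = 0 .. $size - 1;
my $real_diff = abs(score(\@score2, \@all) - score(\@score1, \@all));

my @diffs;
for (1 .. $N) {
    my @idx = map { int(rand($size)) } 1 .. $size;  # draw sentence indices with replacement
    push @diffs, abs(score(\@score2, \@idx) - score(\@score1, \@idx));
}

# center the bootstrap differences on their mean, then count exceedances
my $mean  = sum(@diffs) / $N;
my $count = grep { $_ - $mean >= $real_diff } @diffs;
printf "p-value: %.4f\n", $count / $N;

A small p-value here means the observed score difference is unlikely under the null hypothesis that both hypothesis sets were produced by essentially the same system.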