From 8a4c6a2c6380eee06ada7c932473bb9d2045e770 Mon Sep 17 00:00:00 2001
From: mphi
Date: Thu, 23 Oct 2008 09:16:33 +0000
Subject: put significance test into proper location

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1915 1f5c12ca-751b-0410-a591-d2e778427230
---
 ...bootstrap-hypothesis-difference-significance.pl | 309 +++++++++++++++++++++
 1 file changed, 309 insertions(+)
 create mode 100755 scripts/analysis/bootstrap-hypothesis-difference-significance.pl

diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
new file mode 100755
index 000000000..b3ce89f99
--- /dev/null
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@@ -0,0 +1,309 @@
+#!/usr/bin/perl
+
+###############################################
+# An implementation of paired bootstrap resampling for testing the statistical
+# significance of the difference between two systems from (Koehn 2004 @ EMNLP)
+#
+# Usage: ./bootstrap-hypothesis-difference-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]
+#
+# Author: Mark Fishel, fishel@ut.ee
+#
+# 22.10: altered algorithm according to (Riezler and Maxwell 2005 @ MTSE'05), now computes p-value
+###############################################
+
+use strict;
+
+#constants
+my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
+my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
+my $TMP_PREFIX = "/tmp/significance_test_file_";
+my $MAX_NGRAMS_FOR_BLEU = 4;
+
+#checking cmdline argument consistency
+if (@ARGV < 3) {
+    print STDERR "Usage: ./bootstrap-hypothesis-difference-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ]\n";
+
+    unless ($ARGV[0] =~ /^(--help|-help|-h|-\?|\/\?|--usage|-usage)$/) {
+        die("\nERROR: not enough arguments");
+    }
+
+    exit 1;
+}
+
+print "reading data; " . `date`;
+
+#read all data
+my $data = readAllData(@ARGV);
+
+#start comparing
+print "comparing hypotheses; " . `date`;
+
+my @subSampleBleuDiffArr;
+
+#applying sampling
+for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
+    my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
+
+    my $bleu1 = getBleu($data->{refs}, $data->{hyp1}, $subSampleIndices);
+    my $bleu2 = getBleu($data->{refs}, $data->{hyp2}, $subSampleIndices);
+
+    push @subSampleBleuDiffArr, abs($bleu2 - $bleu1);
+
+    if ($_ % int($TIMES_TO_REPEAT_SUBSAMPLING / 100) == 0) {
+        print "$_ / $TIMES_TO_REPEAT_SUBSAMPLING " . `date`;
+    }
+}
+
+#get subsample bleu difference mean
+my $averageSubSampleBleuDiff = 0;
+
+for my $subSampleDiff (@subSampleBleuDiffArr) {
+    $averageSubSampleBleuDiff += $subSampleDiff;
+}
+
+$averageSubSampleBleuDiff /= $TIMES_TO_REPEAT_SUBSAMPLING;
+
+print "average subsample bleu difference: $averageSubSampleBleuDiff " .
+    `date`;
+
+#calculating p-value
+my $count = 0;
+
+my $realBleuDiff = abs(getBleu($data->{refs}, $data->{hyp2}) - getBleu($data->{refs}, $data->{hyp1}));
+
+for my $subSampleDiff (@subSampleBleuDiffArr) {
+#   my $op;
+
+    if ($subSampleDiff - $averageSubSampleBleuDiff >= $realBleuDiff) {
+        $count++;
+#       $op = ">=";
+    }
+    else {
+#       $op = "< ";
+    }
+
+#   print "$subSampleDiff - $averageSubSampleBleuDiff $op $realBleuDiff\n";
+}
+
+my $result = ($count + 1) / $TIMES_TO_REPEAT_SUBSAMPLING;
+
+print "Assuming that essentially the same system generated the two hypothesis translations (null hypothesis),\n";
+print "the probability of observing a BLEU difference at least this large by chance (p-value) is: $result.\n";
+
+#####
+# read 2 hyp and 1 to \infty ref data files
+#####
+sub readAllData {
+    my ($hypFile1, $hypFile2, @refFiles) = @_;
+
+    my %result;
+
+    #reading hypotheses and checking for matching sizes
+    $result{hyp1} = readData($hypFile1);
+    $result{size} = scalar @{$result{hyp1}};
+
+    $result{hyp2} = readData($hypFile2);
+    unless (scalar @{$result{hyp2}} == $result{size}) {
+        die ("ERROR: sizes of hypothesis sets 1 and 2 don't match");
+    }
+
+    #reading reference(s) and checking for matching sizes
+    $result{refs} = [];
+    my $i = 0;
+
+    for my $refFile (@refFiles) {
+        $i++;
+        my $refDataX = readData($refFile);
+
+        unless (scalar @$refDataX == $result{size}) {
+            die ("ERROR: ref set $i size doesn't match the size of hyp sets");
+        }
+
+        push @{$result{refs}}, $refDataX;
+    }
+
+    return \%result;
+}
+
+#####
+# read sentences from file
+#####
+sub readData {
+    my $file = shift;
+    my @result;
+
+    open (FILE, $file) or die ("Failed to open `$file' for reading");
+
+    while (<FILE>) {
+        push @result, [split(/\s+/, $_)];
+    }
+
+    close (FILE);
+
+    return \@result;
+}
+
+#####
+# draw a subsample of size $subSize from set (0..$setSize) with replacement
+#####
+sub drawWithReplacement {
+    my ($setSize, $subSize) = @_;
+
+    my @result;
+
+    for (1..$subSize) {
+        push @result, int(rand($setSize));
+    }
+
+    return \@result;
+}
+
+#####
+# refs: arrayref of different references, reference = array of lines, line = array of words, word = string
+# hyp: arrayref of lines of hypothesis translation, line = array of words, word = string
+# idxs: indices of lines to include; default value - full set (0..size_of_hyp-1)
+#####
+sub getBleu {
+    my ($refs, $hyp, $idxs) = @_;
+
+    #default value for $idxs
+    unless (defined($idxs)) {
+        $idxs = [0..((scalar @$hyp) - 1)];
+    }
+
+    #vars
+    my ($hypothesisLength, $referenceLength) = (0, 0);
+    my (@correctNgramCounts, @totalNgramCounts);
+    my ($refNgramCounts, $hypNgramCounts);
+
+    #gather info from each line
+    for my $lineIdx (@$idxs) {
+        my $hypSnt = $hyp->[$lineIdx];
+
+        #update total hyp len
+        $hypothesisLength += scalar @$hypSnt;
+
+        #update total ref len with the ref len closest to the current hyp sentence len
+        $referenceLength += getClosestLength($refs, $lineIdx, scalar @$hypSnt);
+
+        #update ngram precision for each n-gram order
+        for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+            #hyp ngrams
+            $hypNgramCounts = groupNgrams($hypSnt, $order);
+
+            #ref ngrams
+            $refNgramCounts = groupNgramsMultiSrc($refs, $lineIdx, $order);
+
+            #correct, total
+            for my $ngram (keys %$hypNgramCounts) {
+                $correctNgramCounts[$order] += min($hypNgramCounts->{$ngram}, $refNgramCounts->{$ngram});
+                $totalNgramCounts[$order] += $hypNgramCounts->{$ngram};
+            }
+        }
+    }
+
+    #compose bleu score
+    my $brevityPenalty = ($hypothesisLength < $referenceLength)?
+        exp(1 - $referenceLength/$hypothesisLength): 1;
+
+    my $logsum = 0;
+
+    for my $order (1..$MAX_NGRAMS_FOR_BLEU) {
+        $logsum += safeLog($correctNgramCounts[$order] / $totalNgramCounts[$order]);
+    }
+
+    return $brevityPenalty * exp($logsum / $MAX_NGRAMS_FOR_BLEU);
+}
+
+#####
+# of all references for this line, return the length closest to the hypothesis length (ties go to the shorter reference)
+#####
+sub getClosestLength {
+    my ($refs, $lineIdx, $hypothesisLength) = @_;
+
+    my $bestDiff = infty();
+    my $bestLen = infty();
+
+    my ($currLen, $currDiff);
+
+    for my $ref (@$refs) {
+        $currLen = scalar @{$ref->[$lineIdx]};
+        $currDiff = abs($currLen - $hypothesisLength);
+
+        if ($currDiff < $bestDiff or ($currDiff == $bestDiff and $currLen < $bestLen)) {
+            $bestDiff = $currDiff;
+            $bestLen = $currLen;
+        }
+    }
+
+    return $bestLen;
+}
+
+#####
+# count the n-grams of the given order in a sentence
+#####
+sub groupNgrams {
+    my ($snt, $order) = @_;
+    my %result;
+
+    my $size = scalar @$snt;
+    my $ngram;
+
+    for my $i (0..($size-$order)) {
+        $ngram = join(" ", @$snt[$i..($i + $order - 1)]);
+
+        $result{$ngram}++;
+    }
+
+    return \%result;
+}
+
+#####
+# for every n-gram of the given order, take the maximum count over all references for this line
+#####
+sub groupNgramsMultiSrc {
+    my ($refs, $lineIdx, $order) = @_;
+    my %result;
+
+    for my $ref (@$refs) {
+        my $currNgramCounts = groupNgrams($ref->[$lineIdx], $order);
+
+        for my $currNgram (keys %$currNgramCounts) {
+            $result{$currNgram} = max($result{$currNgram}, $currNgramCounts->{$currNgram});
+        }
+    }
+
+    return \%result;
+}
+
+#####
+# logarithm that returns minus infinity for non-positive input instead of dying
+#####
+sub safeLog {
+    my $x = shift;
+
+    return ($x > 0)? log($x): -infty();
+}
+
+#####
+# a constant that overflows to positive infinity
+#####
+sub infty {
+    return 1e6000;
+}
+
+#####
+# minimum of two numbers
+#####
+sub min {
+    my ($a, $b) = @_;
+
+    return ($a < $b)? $a: $b;
+}
+
+#####
+# maximum of two numbers
+#####
+sub max {
+    my ($a, $b) = @_;
+
+    return ($a > $b)? $a: $b;
+}
-- 
cgit v1.2.3
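The script is invoked as ./bootstrap-hypothesis-difference-significance.pl hypothesis_1 hypothesis_2 reference_1 [ reference_2 ... ] and reports the bootstrap p-value for the BLEU difference between the two hypothesis files. For readers who want the core of the test without the BLEU bookkeeping, below is a minimal, self-contained Perl sketch of the same resampling and p-value computation. The per-sentence scores, the iteration count and the mean() helper are illustrative stand-ins only; the real script recomputes corpus-level BLEU for every subsample rather than averaging per-sentence scores.

#!/usr/bin/perl
# Illustrative sketch only: paired bootstrap significance test over
# hypothetical per-sentence scores, mirroring the centred comparison
# used by the script above (Riezler and Maxwell 2005 variant).
use strict;
use warnings;
use List::Util qw(sum);

# Hypothetical per-sentence scores of two systems on the same test set.
my @scoresA = (0.31, 0.45, 0.27, 0.52, 0.40, 0.38, 0.49, 0.33);
my @scoresB = (0.35, 0.47, 0.25, 0.58, 0.44, 0.41, 0.50, 0.36);

my $iterations = 1000;
my $n = scalar @scoresA;

# Observed absolute difference of the two systems' mean scores.
my $realDiff = abs(mean(\@scoresB) - mean(\@scoresA));

# Resample sentence indices with replacement (the same indices for both
# systems, keeping the comparison paired) and record each subsample's
# absolute score difference.
my @subDiffs;
for (1 .. $iterations) {
    my @idx = map { int(rand($n)) } 1 .. $n;
    push @subDiffs, abs(mean([ @scoresB[@idx] ]) - mean([ @scoresA[@idx] ]));
}

# Centre the subsample differences on their mean and count how often the
# centred value is at least as large as the observed difference.
my $avgSubDiff = mean(\@subDiffs);
my $count = grep { $_ - $avgSubDiff >= $realDiff } @subDiffs;
my $pValue = ($count + 1) / $iterations;

printf "observed difference: %.4f, p-value: %.4f\n", $realDiff, $pValue;

sub mean {
    my $arr = shift;
    return sum(@$arr) / scalar @$arr;
}

A small p-value (conventionally below 0.05) indicates that the observed difference is unlikely to be an artifact of the particular test set, which is the same interpretation the script above attaches to its BLEU difference.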