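Added support for arbitrary encodings via the $IO_ENCODING global variable (line 23 of the script), which is set to "utf8" by default and applied to the input files via binmode. Status messages are now printed to STDERR instead of STDOUT, and the bootstrap resampling loop reports its progress on STDERR.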

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3739 1f5c12ca-751b-0410-a591-d2e778427230
author: mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-29 12:04:44 +0300
committer: mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-29 12:04:44 +0300
commit: ddabdf6b1b4acb5a01cc11b56fcd2799a11ef693 (patch)
tree: 339767e6230732bf617a85cce6efd0a6cb729c4c /scripts/analysis
parent: eabc1373060ea3f52f3b02eead9052eb6289043c (diff)
1 files changed, 19 insertions, 5 deletions
diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
index c8d47a011..50492cad0 100755
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@@ -1,4 +1,5 @@
 #!/usr/bin/perl
+use utf8;
 
 ###############################################
 # An implementation of paired bootstrap resampling for testing the statistical
@@ -19,6 +20,7 @@ use strict;
 my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
 my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
 my $MAX_NGRAMS = 4;
+my $IO_ENCODING = "utf8"; # can be replaced with e.g. "encoding(iso-8859-13)" or alike
 
 #checking cmdline argument consistency
 if (@ARGV < 3) {
@@ -31,7 +33,7 @@ if (@ARGV < 3) {
 	exit 1;
 }
 
-print "reading data; " . `date`;
+print STDERR "reading data; " . `date`;
 
 #read all data
 my $data = readAllData(@ARGV);
@@ -39,14 +41,14 @@ my $data = readAllData(@ARGV);
 my $verbose = $ARGV[3];
 
 #calculate each sentence's contribution to BP and ngram precision
-print "performing preliminary calculations (hypothesis 1); " . `date`;
+print STDERR "performing preliminary calculations (hypothesis 1); " . `date`;
 preEvalHypo($data, "hyp1");
 
-print "performing preliminary calculations (hypothesis 2); " . `date`;
+print STDERR "performing preliminary calculations (hypothesis 2); " . `date`;
 preEvalHypo($data, "hyp2");
 
 #start comparing
-print "comparing hypotheses -- this may take some time; " . `date`;
+print STDERR "comparing hypotheses -- this may take some time; " . `date`;
 
 bootstrap_report("BLEU", \&getBleu);
 bootstrap_report("NIST", \&getNist);
@@ -89,7 +91,7 @@ sub bootstrap_pass {
 	my @subSample2Arr;
 
 	#applying sampling
-	for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
+	for my $idx (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
 		my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
 		
 		my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices);
@@ -98,6 +100,17 @@ sub bootstrap_pass {
 		push @subSampleDiffArr, abs($score2 - $score1);
 		push @subSample1Arr, $score1;
 		push @subSample2Arr, $score2;
+		
+		if ($idx % 10 == 0) {
+			print STDERR ".";
+		}
+		if ($idx % 100 == 0) {
+			print STDERR "$idx\n";
+		}
+	}
+	
+	if ($TIMES_TO_REPEAT_SUBSAMPLING % 100 != 0) {
+		print STDERR ".$TIMES_TO_REPEAT_SUBSAMPLING\n";
 	}
 	
 	return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr);
@@ -235,6 +248,7 @@ sub readData {
 	my @result;
 	
 	open (FILE, $file) or die ("Failed to open `$file' for reading");
+	binmode (FILE, ":$IO_ENCODING");
 	
 	while (<FILE>) {
 		push @result, { words => [split(/\s+/, $_)] };
author	mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230>	2010-11-29 12:04:44 +0300
committer	mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230>	2010-11-29 12:04:44 +0300
commit	ddabdf6b1b4acb5a01cc11b56fcd2799a11ef693 (patch)
tree	339767e6230732bf617a85cce6efd0a6cb729c4c /scripts/analysis
parent	eabc1373060ea3f52f3b02eead9052eb6289043c (diff)