Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-29 12:04:44 +0300
committer: mphi <mphi@1f5c12ca-751b-0410-a591-d2e778427230> 2010-11-29 12:04:44 +0300
commit: ddabdf6b1b4acb5a01cc11b56fcd2799a11ef693 (patch)
tree: 339767e6230732bf617a85cce6efd0a6cb729c4c /scripts/analysis
parent: eabc1373060ea3f52f3b02eead9052eb6289043c (diff)
added support for arbitrary encodings via the $IO_ENCODING global variable on line 23; set to UTF8 by default
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3739 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis')
-rwxr-xr-x scripts/analysis/bootstrap-hypothesis-difference-significance.pl | 24
1 files changed, 19 insertions, 5 deletions
diff --git a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
index c8d47a011..50492cad0 100755
--- a/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
+++ b/scripts/analysis/bootstrap-hypothesis-difference-significance.pl
@@ -1,4 +1,5 @@
#!/usr/bin/perl
+use utf8;
###############################################
# An implementation of paired bootstrap resampling for testing the statistical
@@ -19,6 +20,7 @@ use strict;
my $TIMES_TO_REPEAT_SUBSAMPLING = 1000;
my $SUBSAMPLE_SIZE = 0; # if 0 then subsample size is equal to the whole set
my $MAX_NGRAMS = 4;
+my $IO_ENCODING = "utf8"; # can be replaced with e.g. "encoding(iso-8859-13)" or alike
#checking cmdline argument consistency
if (@ARGV < 3) {
@@ -31,7 +33,7 @@ if (@ARGV < 3) {
exit 1;
}
-print "reading data; " . `date`;
+print STDERR "reading data; " . `date`;
#read all data
my $data = readAllData(@ARGV);
@@ -39,14 +41,14 @@ my $data = readAllData(@ARGV);
my $verbose = $ARGV[3];
#calculate each sentence's contribution to BP and ngram precision
-print "performing preliminary calculations (hypothesis 1); " . `date`;
+print STDERR "performing preliminary calculations (hypothesis 1); " . `date`;
preEvalHypo($data, "hyp1");
-print "performing preliminary calculations (hypothesis 2); " . `date`;
+print STDERR "performing preliminary calculations (hypothesis 2); " . `date`;
preEvalHypo($data, "hyp2");
#start comparing
-print "comparing hypotheses -- this may take some time; " . `date`;
+print STDERR "comparing hypotheses -- this may take some time; " . `date`;
bootstrap_report("BLEU", \&getBleu);
bootstrap_report("NIST", \&getNist);
@@ -89,7 +91,7 @@ sub bootstrap_pass {
my @subSample2Arr;
#applying sampling
- for (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
+ for my $idx (1..$TIMES_TO_REPEAT_SUBSAMPLING) {
my $subSampleIndices = drawWithReplacement($data->{size}, ($SUBSAMPLE_SIZE? $SUBSAMPLE_SIZE: $data->{size}));
my $score1 = &$scoreFunc($data->{refs}, $data->{hyp1}, $subSampleIndices);
@@ -98,6 +100,17 @@ sub bootstrap_pass {
push @subSampleDiffArr, abs($score2 - $score1);
push @subSample1Arr, $score1;
push @subSample2Arr, $score2;
+
+ if ($idx % 10 == 0) {
+ print STDERR ".";
+ }
+ if ($idx % 100 == 0) {
+ print STDERR "$idx\n";
+ }
+ }
+
+ if ($TIMES_TO_REPEAT_SUBSAMPLING % 100 != 0) {
+ print STDERR ".$TIMES_TO_REPEAT_SUBSAMPLING\n";
}
return (\@subSampleDiffArr, \@subSample1Arr, \@subSample2Arr);
@@ -235,6 +248,7 @@ sub readData {
my @result;
open (FILE, $file) or die ("Failed to open `$file' for reading");
+ binmode (FILE, ":$IO_ENCODING");
while (<FILE>) {
push @result, { words => [split(/\s+/, $_)] };