Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorphkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>2011-06-17 03:43:29 +0400
committerphkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>2011-06-17 03:43:29 +0400
commit4285a8e236b3bc91aace803876b62df01b4186b8 (patch)
tree0054e5893e49e23ab4fb015cd6c66d6bc4eaa67a /scripts/ems/support/analysis.perl
parentab3460591c546534a57c771ce8864c7da4e3c6a5 (diff)
various experiment.perl improvements: split filter and decode/tune; extensions to analysis.perl, especially precision by coverage graphs
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4018 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/ems/support/analysis.perl')
-rwxr-xr-xscripts/ems/support/analysis.perl33
1 files changed, 24 insertions, 9 deletions
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index 1ab18330f..9a9de9327 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -5,12 +5,14 @@ use Getopt::Long "GetOptions";
my $MAX_LENGTH = 4;
-my ($system,$system_alignment,$segmentation,$reference,$dir,$input,$corpus,$ttable,@FACTORED_TTABLE,$score_options,$hierarchical,$output_corpus,$alignment,$biconcor,$input_factors,$precision_by_coverage,$precision_by_coverage_factor);
+my ($system,$system_alignment,$segmentation,$reference,$dir,$input,$corpus,$ttable,@FACTORED_TTABLE,$score_options,$hierarchical,$output_corpus,$alignment,$biconcor,$input_factors,$input_factor_names,$output_factor_names,$precision_by_coverage,$precision_by_coverage_factor,$coverage_dir);
if (!&GetOptions('system=s' => \$system, # raw output from decoder
'system-alignment=s' => \$system_alignment, # word alignment of system output
'reference=s' => \$reference, # tokenized reference
'dir=s' => \$dir, # directory for storing results
'input-factors=i' => \$input_factors, # list of input factors
+ 'input-factor-names=s' => \$input_factor_names,
+ 'output-factor-names=s' => \$output_factor_names,
'precision-by-coverage' => \$precision_by_coverage, # added report for input words
'precision-by-coverage-factor=i' => \$precision_by_coverage_factor, # sub-reports
'input=s' => \$input, # tokenized input (as for decoder)
@@ -21,6 +23,7 @@ if (!&GetOptions('system=s' => \$system, # raw output from decoder
'score-options=s' => \$score_options, # score options to detect p(e|f) score
'output-corpus=s' => \$output_corpus, # output side of parallel training corpus
'alignment-file=s' => \$alignment, # alignment of parallel corpus
+ 'coverage=s' => \$coverage_dir, # already computed coverage, stored in this dir
'biconcor=s' => \$biconcor, # binary for bilingual concordancer
'hierarchical' => \$hierarchical) || # hierarchical model?
!defined($dir)) {
@@ -29,6 +32,14 @@ if (!&GetOptions('system=s' => \$system, # raw output from decoder
`mkdir -p $dir`;
+# factor names
+if (defined($input_factor_names) && defined($output_factor_names)) {
+ open(FACTOR,">$dir/factor-names");
+ print FACTOR $input_factor_names."\n";
+ print FACTOR $output_factor_names."\n";
+ close(FACTOR);
+}
+
# compare system output against reference translation
my(@SYSTEM,@REFERENCE);
my (%PRECISION_CORRECT,%PRECISION_TOTAL,
@@ -82,13 +93,13 @@ if (defined($segmentation)) {
# coverage analysis
my (%INPUT_PHRASE,%CORPUS_COVERED,%TTABLE_COVERED,%TTABLE_ENTROPY);
-if (!defined($system_alignment) && (defined($ttable) || defined($corpus))) {
+if (!defined($coverage_dir) && (defined($ttable) || defined($corpus))) {
if (!defined($input)) {
die("ERROR: when specifying either ttable or input-corpus, please also specify input\n");
}
$MAX_LENGTH = 7;
&input_phrases();
- &ttable_coverage(0,$ttable) if defined($ttable);
+ &ttable_coverage("0",$ttable) if defined($ttable);
&corpus_coverage() if defined($corpus);
&input_annotation();
@@ -101,11 +112,11 @@ if (!defined($system_alignment) && (defined($ttable) || defined($corpus))) {
}
# factored ttable coverage
- foreach my $ttable (@FACTORED_TTABLE) {
+ foreach my $factored_ttable (@FACTORED_TTABLE) {
die("factored ttable must be specified as factor:file -- $ttable")
- unless $ttable =~ /^(\d+)\:(.+)/; # factor:ttable
+ unless $factored_ttable =~ /^([\d,]+)\:(.+)/; # factor:ttable
my ($factor,$file) = ($1,$2);
- next unless $file eq $ttable; # no need to do this twice
+ next if defined($ttable) && $file eq $ttable; # no need to do this twice
&input_phrases($factor);
&ttable_coverage($factor,$file);
}
@@ -166,7 +177,7 @@ sub get_factor_phrase {
$line =~ s/ $//;
# only surface? delete remaining factors
- if (!defined($factor) || $factor == 0) {
+ if (!defined($factor) || $factor eq "0") {
$line =~ s/\|\S+//g;
return $line;
}
@@ -196,7 +207,7 @@ sub get_factor_word {
sub factor_ext {
my ($factor) = @_;
- return "" if !defined($factor) || $factor == 0;
+ return "" if !defined($factor) || $factor eq "0";
return ".".$factor;
}
@@ -327,6 +338,7 @@ sub ttable_coverage {
# handling hierarchical
$in =~ s/ \[[^ \]]+\]$//; # remove lhs nt
next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules
+ $in = &get_factor_phrase($factor,$in) unless !defined($factor) || $factor eq "0";
$scores = $COLUMN[4] if defined($hierarchical); #scalar @COLUMN == 5;
my @IN = split(/ /,$in);
$size = scalar @IN;
@@ -536,7 +548,10 @@ sub precision_by_coverage {
# get coverage statistics
my %COVERAGE;
- open(COVERAGE,"$dir/$coverage_type-coverage-by-phrase");
+ print STDERR "".(defined($coverage_dir)?$coverage_dir:$dir)
+ ."/$coverage_type-coverage-by-phrase";
+ open(COVERAGE,(defined($coverage_dir)?$coverage_dir:$dir)
+ ."/$coverage_type-coverage-by-phrase");
while(<COVERAGE>) {
chop;
my ($phrase,$count) = split(/\t/);