Merge branch 'master' of github.com:moses-smt/mosesdecoder

author: Barry Haddow <barry.haddow@gmail.com> 2012-07-06 00:02:15 +0400
committer: Barry Haddow <barry.haddow@gmail.com> 2012-07-06 00:02:15 +0400
commit: afd654eb74974ecdc0e1da62a6d21ee1f47bd8b5 (patch)
tree: 75bb37c8c000c22695bfc24a94a98fbd9a793d37 /scripts
parent: 278ec106039089e49392c5e9f99cdbc9b720a4e0 (diff)
parent: a8b7e40cecb3655671b06a5beadc56b6d73acb91 (diff)
5 files changed, 154 insertions, 14 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index b294fde9d..45f689736 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -395,8 +395,14 @@ build-generation-custom
 	rerun-on-change: generation-factors generation-type training-options script generation-corpus
 	ignore-unless: AND generation-factors generation-corpus
 	default-name: model/generation-table
+build-sparse-lexical
+	in: corpus
+        out: sparse-lexical
+        ignore-unless: sparse-lexical-features
+        default-name: model/most-frequent-words
+	template: $moses-script-dir/ems/support/build-sparse-lexical-features.perl IN $input-extension $output-extension OUT "$sparse-lexical-features"
 create-config
-	in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm LM:binlm
+	in: reordering-table phrase-translation-table generation-table sparse-lexical INTERPOLATED-LM:binlm LM:binlm
 	out: config
 	ignore-if: use-hiero
 	rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 0f6d93d8b..aae4f7754 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -6,6 +6,14 @@ use strict;
 use Getopt::Long "GetOptions";
 use FindBin qw($RealBin);
 
+sub trim($)
+{
+	my ($text) = @_;	# work on a copy; the caller's value is left untouched
+	# strip whitespace from the left edge, then from the right edge
+	for ($text) { s/^\s+//; s/\s+$//; }
+	return $text;
+}
+
 my $host = `hostname`; chop($host);
 print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
 
@@ -1770,7 +1778,7 @@ sub define_training_build_custom_generation {
 sub define_training_create_config {
     my ($step_id) = @_;
 
-    my ($config,$reordering_table,$phrase_translation_table,$generation_table,@LM)
+    my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,@LM)
 			= &get_output_and_input($step_id);
 
     my $cmd = &get_training_setting(9);
@@ -1794,9 +1802,10 @@ sub define_training_create_config {
 		}
 		
     # additional settings for factored models
-    my $ptCmd = "$phrase_translation_table:$ptImpl";
+    my $ptCmd = $phrase_translation_table;
+    $ptCmd .= ":$ptImpl" if $ptImpl>0;
     $ptCmd .= ":$numFF" if defined($numFF);
-    $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table",$ptCmd);
+    $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
     $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)	if $reordering_table;
     $cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table)	if $generation_table;
     $cmd .= "-config $config ";
@@ -1894,6 +1903,9 @@ sub define_training_create_config {
     my $additional_ini = &get("TRAINING:additional-ini");
     $cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
 
+    # sparse lexical features provide additional content for config file
+    $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
+
     &create_step($step_id,$cmd);
 }
 
@@ -2223,10 +2235,10 @@ sub define_tuningevaluation_filter {
     my $config = $tuning_flag ? "$dir/tuning/moses.table.ini.$VERSION" : "$dir/evaluation/$set.moses.table.ini.$VERSION";
     my $cmd = &get_training_setting(9);
     
-    my $ptCmd = "$phrase_translation_table:$ptImpl";
+    my $ptCmd = $phrase_translation_table;
+    $ptCmd .= ":$ptImpl" if $ptImpl>0;
     $ptCmd .= ":$numFF" if defined($numFF);
     $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
-    
     $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)
 	if $reordering_table;
     # additional settings for hierarchical models
@@ -2243,7 +2255,6 @@ sub define_tuningevaluation_filter {
     $cmd .= "-config $config\n";
     
     # filter command
- 		my $sa_exec_dir = &get("TRAINING:suffix-array");
 		if ($sa_exec_dir) {
 			# suffix array
 			$cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir \n";
diff --git a/scripts/ems/support/build-sparse-lexical-features.perl b/scripts/ems/support/build-sparse-lexical-features.perl
new file mode 100755
index 000000000..81156f2f3
--- /dev/null
+++ b/scripts/ems/support/build-sparse-lexical-features.perl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+# Build necessary files for sparse lexical features
+# * target word insertion
+# * source word deletion
+# * word translation
+
+my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV;
+die("ERROR: syntax: build-sparse-lexical-features.perl CORPUS IN-EXT OUT-EXT OUT-PREFIX SPECIFICATION\n")
+  unless defined($specification);
+my $ini = "";     # feature sections collected for the generated .ini file
+my $report = "";  # body of the [report-sparse-features] section
+my %ALREADY;      # word lists already written, keyed by "extension,count"
+
+# The specification is a comma-separated list of features, each of the form
+# "<feature-type> top <N>" (word-translation takes two counts: "top <N-in> <N-out>").
+foreach my $feature_spec (split(/,\s*/,$specification)) {
+  my @SPEC = split(' ',$feature_spec); # split(' ') also skips leading whitespace
+  my $type = defined($SPEC[0]) ? $SPEC[0] : "";
+  # check the token count first so that a truncated spec dies with the error
+  # message below instead of triggering "uninitialized value" warnings
+  my $top_ok = (@SPEC >= 3 && $SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/);
+  if ($type eq 'target-word-insertion') {
+    die("ERROR: Unknown parameter specification in '$feature_spec'\n") unless $top_ok;
+    my $file = &create_top_words($output_extension, $SPEC[2]);
+    $ini .= "[target-word-insertion-feature]\n0 $file\n\n";
+    $report .= "twi\n";
+  }
+  elsif ($type eq 'source-word-deletion') {
+    die("ERROR: Unknown parameter specification in '$feature_spec'\n") unless $top_ok;
+    my $file = &create_top_words($input_extension, $SPEC[2]);
+    $ini .= "[source-word-deletion-feature]\n0 $file\n\n";
+    $report .= "swd\n";
+  }
+  elsif ($type eq 'word-translation') {
+    die("ERROR: Unknown parameter specification in '$feature_spec'\n")
+      unless $top_ok && @SPEC >= 4 && $SPEC[3] =~ /^\d+$/;
+    my $file_in  = &create_top_words($input_extension,  $SPEC[2]);
+    my $file_out = &create_top_words($output_extension, $SPEC[3]);
+    $ini .= "[word-translation-feature]\n0 0 $file_in $file_out\n\n";
+    $report .= "wt\n";
+  }
+  else {
+    die("ERROR: Unknown feature type '$type' in specification '$feature_spec'\nfull spec: '$specification'\n");
+  }
+}
+
+# write the collected sections to <prefix>.ini; a failure here must not pass silently
+open(my $ini_fh, '>', "$outfile_prefix.ini")
+  or die("ERROR: could not write to '$outfile_prefix.ini': $!\n");
+print $ini_fh $ini;
+print $ini_fh "\n[report-sparse-features]\n$report\n";
+print $ini_fh "\n[use-alignment-info]\ntrue\n\n";
+close($ini_fh)
+  or die("ERROR: could not close '$outfile_prefix.ini': $!\n");
+
+# Write the most frequent words of corpus side $extension (at most $count of
+# them, one per line) to "$outfile_prefix.$extension.top$count"; returns that name.
+sub create_top_words {
+  my ($extension, $count) = @_;
+  my $file = "$outfile_prefix.$extension.top$count";
+  return $file if defined($ALREADY{"$extension,$count"}); # list already written
+  $ALREADY{"$extension,$count"}++;
+
+  # get counts
+  my %COUNT;
+  open(my $corpus_fh, '<', "$corpus.$extension")
+    or die("ERROR: could not open corpus file '$corpus.$extension': $!\n");
+  while(my $line = <$corpus_fh>) {
+    # no chop needed: split(' ') drops the line end; chop would eat the last
+    # character of a final line that has no newline
+    foreach my $word (split(' ',$line)) {
+      $word =~ s/\|.+//; # only surface factor at this point
+      $COUNT{$word}++ unless $word eq "";
+    }
+  }
+  close($corpus_fh);
+
+  # sort by descending count; ties in descending string order
+  my @SORTED = sort { $COUNT{$b} <=> $COUNT{$a} or $b cmp $a }
+               grep { $COUNT{$_} > 3   # avoid large tail
+                      && $_ !~ /:/ }   # avoid colon bug
+               keys %COUNT;
+
+  # write top n to file
+  open(my $top_fh, '>', $file)
+    or die("ERROR: could not write to '$file': $!\n");
+  for(my $i=0; $i<$count && $i<scalar(@SORTED); $i++) {
+    print $top_fh "$SORTED[$i]\n";
+  }
+  close($top_fh) or die("ERROR: could not close '$file': $!\n");
+  return $file;
+}
diff --git a/scripts/ems/support/reuse-weights.perl b/scripts/ems/support/reuse-weights.perl
index b51f23236..1bff4bbd0 100755
--- a/scripts/ems/support/reuse-weights.perl
+++ b/scripts/ems/support/reuse-weights.perl
@@ -10,17 +10,27 @@ my ($weight_file) = @ARGV;
 
 my %WEIGHT;
 my $current_weight = "";
+my $weights_file_spec = "";
+my $weights_file_flag = 0;
 open(WEIGHT,$weight_file)
     || die("ERROR: could not open weight file: $weight_file");
 while(<WEIGHT>) {
-    if (/^\[weight\-(\S+)\]/) {
+    if (/^\[weight-file\]/) {
+      $weights_file_spec = "\n".$_;
+      $weights_file_flag = 1;
+    }
+    elsif (/^\[weight\-(\S+)\]/) {
 	$current_weight = $1;
     }
   elsif ($current_weight && /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/) {
 	push @{$WEIGHT{$current_weight}},$1;
     }
+    elsif ($weights_file_flag && !/^\[/ && !/^\s*$/) {
+      $weights_file_spec .= $_;
+    }
     elsif (/^\[/) {
-	$current_weight = "";
+      $current_weight = "";
+      $weights_file_flag = 0;
     }
 }
 close(WEIGHT);
@@ -67,3 +77,6 @@ foreach my $weight (keys %WEIGHT) {
 	}
     }
 }
+
+print $weights_file_spec;
+
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index d402599f4..6e18581af 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -36,7 +36,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
    $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
    $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
-   $_ADDITIONAL_INI,
+   $_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
    $_DICTIONARY, $_EPPEX, $IGNORE);
 my $_CORES = 1;
 
@@ -121,8 +121,9 @@ $_HELP = 1
 		       'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
 		       'dictionary=s' => \$_DICTIONARY,
 		       'eppex:s' => \$_EPPEX,
-           'additional-ini=s' => \$_ADDITIONAL_INI, 
-           'cores=i' => \$_CORES
+		       'additional-ini=s' => \$_ADDITIONAL_INI, 
+		       'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE, 
+		       'cores=i' => \$_CORES
                );
 
 if ($_HELP) {
@@ -1482,6 +1483,10 @@ sub score_phrase_phrase_extract {
     my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
     my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
     my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
+    my $COUNT_BIN = "";
+    if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /CountBinFeature ([\s\d]*\d)/) {
+      $COUNT_BIN = $1;
+    }
     my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
     my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
     if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
@@ -1580,6 +1585,7 @@ sub score_phrase_phrase_extract {
     $cmd .= " --OnlyDirect" if $ONLY_DIRECT;
     $cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
     $cmd .= " --LowCountFeature" if $LOW_COUNT;
+    $cmd .= " --CountBinFeature $COUNT_BIN" if $COUNT_BIN;
     $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
     $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
     
@@ -1837,6 +1843,9 @@ sub create_ini {
    $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
    $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
    $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
+   if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /(CountBinFeature [\s\d]*\d)/) {
+     $basic_weight_count += scalar split(/\s+/,$1);
+   }
    $basic_weight_count++ if $_PCFG;
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
   	$num_of_ttables++;
@@ -1849,8 +1858,9 @@ sub create_ini {
 	    $file = shift @SPECIFIED_TABLE;
 	    my @toks = split(/:/,$file);
 			$file = $toks[0];
-			$phrase_table_impl = $toks[1];
-			
+      if (@toks > 1) {
+			  $phrase_table_impl = $toks[1];
+			}
 			if (@toks == 3) {
 				$basic_weight_count = $toks[2];
 			}
@@ -1989,10 +1999,15 @@ sub create_ini {
     print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
   }
 
+  # get additional content for config file from switch or file
   if ($_ADDITIONAL_INI) {
     print INI "\n# additional settings\n\n";
     foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
   }
+  if ($_ADDITIONAL_INI_FILE) {
+    print INI "\n# additional settings\n\n";
+    print INI `cat $_ADDITIONAL_INI_FILE`;
+  }
 
   close(INI);
 }
author	Barry Haddow <barry.haddow@gmail.com>	2012-07-06 00:02:15 +0400
committer	Barry Haddow <barry.haddow@gmail.com>	2012-07-06 00:02:15 +0400
commit	afd654eb74974ecdc0e1da62a6d21ee1f47bd8b5 (patch)
tree	75bb37c8c000c22695bfc24a94a98fbd9a793d37 /scripts
parent	278ec106039089e49392c5e9f99cdbc9b720a4e0 (diff)
parent	a8b7e40cecb3655671b06a5beadc56b6d73acb91 (diff)