fixes for sparse feature handling

author: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-17 11:37:29 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-17 11:37:29 +0400
commit: 4cdffc8a891a3004a0102aa77ae26ceb91a1e881 (patch)
tree: 815350e8007c863a098d04689bd35177cc61569b
parent: 13991fc88fc6184139db46aa306789d855ef54cd (diff)
7 files changed, 68 insertions, 49 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 3f91c1376..1efa080a2 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -212,7 +212,7 @@ void Data::InitFeatureMap(const string& str) {
   while (!buf.empty()) {
     getNextPound(buf, substr);
 
-    // string ending with ":" are skipped, because they are the names of the features
+    // strings ending with "=" are skipped, because they are the names of the features
     if (!EndsWith(substr, "=")) {
       stringstream ss;
       ss << tmp_name << "_" << tmp_index << " ";
diff --git a/mert/FeatureDataIterator.cpp b/mert/FeatureDataIterator.cpp
index a22112bb4..471da07ee 100644
--- a/mert/FeatureDataIterator.cpp
+++ b/mert/FeatureDataIterator.cpp
@@ -89,7 +89,7 @@ void FeatureDataIterator::readNext() {
       StringPiece line = m_in->ReadLine();
       m_next.push_back(FeatureDataItem());
       for (TokenIter<AnyCharacter, true> token(line, AnyCharacter(" \t")); token; ++token) {
-        TokenIter<AnyCharacterLast,false> value(*token,AnyCharacterLast(":"));
+        TokenIter<AnyCharacterLast,false> value(*token,AnyCharacterLast("="));
         if (!value) throw FileFormatException(m_in->FileName(), line.as_string());
         StringPiece first = *value;
         ++value;
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index 22f62e234..242d3fbd0 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -220,12 +220,12 @@ void FeatureStats::set(string &theString, const SparseVector& sparseWeights )
   while (!theString.empty()) {
     getNextPound(theString, substring);
     // regular feature
-    if (substring.find(":") == string::npos) {
+    if (substring.find("=") == string::npos) {
       add(ConvertStringToFeatureStatsType(substring));
     }
     // sparse feature
     else {
-      size_t separator = substring.find_last_of(":");
+      size_t separator = substring.find_last_of("=");
       addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
     }
   }
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index c82195b1a..d10bf2dbe 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -844,13 +844,13 @@ apply-filter
 	default-name: tuning/moses.filtered.ini
 	pass-if: TRAINING:binarize-all
 	ignore-if: use-hiero
-	template: cp IN1/moses.ini OUT
+	template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
 apply-filter-devtest
 	in: TRAINING:config filtered-dir-devtest
 	out: filtered-config-devtest
 	default-name: tuning/moses.filtered.devtest.ini
 	pass-if: TRAINING:binarize-all
-  ignore-unless: use-mira
+	ignore-unless: use-mira
 	template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
 tune
 	in: filtered-config input reference filtered-config-devtest input-devtest reference-devtest
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index b4ebe161a..a4d94b00e 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -2100,15 +2100,12 @@ sub define_training_sigtest_filter {
     &create_step($step_id,$cmd);
 }
 
-sub define_training_create_config {
-    my ($step_id) = @_;
-
-    my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
-			= &get_output_and_input($step_id);
+sub get_config_tables {
+    my ($config,$reordering_table,$phrase_translation_table,$generation_table,$domains) = @_;
 
     my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
-    #my $cmd = "$moses_src_dir/bin/create-ini ";
     my $cmd = &backoff_and_get("TRAINING:create-ini");
+    $cmd = "$moses_src_dir/bin/create-ini" unless defined($cmd);
 
     my %IN;
     my %OUT;
@@ -2188,8 +2185,29 @@ sub define_training_create_config {
 	my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""),$extract_version);
 	$cmd .= "-unknown-word-label $unknown_word_label ";
     }
+    my $additional_ini = &get("TRAINING:additional-ini");
+    if (&get("TRAINING:score-settings") && 
+        &get("TRAINING:score-settings") =~ /SparseCountBinFeature/) {
+      $additional_ini .= "<br>[report-sparse-features]<br>stm<br><br>";
+      $cmd .= "-sparse-translation-table ";
+    }
+    $cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
+    $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
+
+    return $cmd;
+}
+
+sub define_training_create_config {
+    my ($step_id) = @_;
+
+    my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
+			= &get_output_and_input($step_id);
+
+    my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
+
+    # sparse lexical features provide additional content for config file
+    $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
 
-    # find out which language model files have been built
     my @LM_SETS = &get_sets("LM");
     my %INTERPOLATED_AWAY;
     my %OUTPUT_FACTORS;
@@ -2257,18 +2275,6 @@ sub define_training_create_config {
 	    $cmd .= "-lm $factor:$order:$lm_file:$type ";
     }
 
-    my $additional_ini = &get("TRAINING:additional-ini");
-    if (&get("TRAINING:score-settings") && 
-        &get("TRAINING:score-settings") =~ /SparseCountBinFeature/) {
-      $additional_ini .= "<br>[report-sparse-features]<br>stm<br><br>";
-      $cmd .= "-sparse-translation-table ";
-    }
-    $cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
-
-    # sparse lexical features provide additional content for config file
-    $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
-    $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
-
     &create_step($step_id,$cmd);
 }
 
@@ -2608,9 +2614,12 @@ sub define_tuningevaluation_filter {
     # create pseudo-config file
     else {
       $config = $tuning_flag ? "$dir/tuning/moses.table.ini.$VERSION" : "$dir/evaluation/$set.moses.table.ini.$VERSION";
+      $cmd = "touch $config\n";
       $delete_config = 1;
       
-      $cmd = "cp $dir/model/moses.ini.$VERSION $config \n";
+      $cmd .= &get_config_tables($config,$reordering_table,$phrase_translation_table,undef,$domains);
+
+      $cmd .= "-lm 0:3:$config:8\n"; # dummy kenlm 3-gram model on factor 0
     }
 
     # filter command
diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl
index 530130aa8..3efb243d7 100755
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@@ -10,30 +10,39 @@ if (scalar @ARGV < 1 || ! -e $ARGV[0]) {
 # read config sections about filtered tables
 my @arr;
 open(FILTERED, $ARGV[0]) or die "Cannot open: $!";
+my $feature_section = 0;
 while(my $line = <FILTERED>) {
   chomp($line);
-  if ($line =~ /PhraseModel /) {
-   print STDERR "pt:$line \n";
-   push(@arr, $line);
+  if ($line =~ /^\[(.+)\]/) {
+    $feature_section = ($1 eq "feature");
+  }
+  next unless $feature_section;
+  if ($line =~ /PhraseDictionary/) {
+    print STDERR "pt:$line \n";
+    push(@arr, $line);
+  }
+  elsif ($line =~ /LexicalReordering/) {
+    print STDERR "ro:$line \n";
+    push(@arr, $line);
   }
-  elsif ($line =~ /LexicalReordering /) {
-   print STDERR "ro:$line \n";
-   push(@arr, $line);
-  }  
 }
 close(FILTERED);
 
 # pass through master config file and replace table sections
 my $ind = 0;
+$feature_section = 0;
 while(my $line = <STDIN>) {
   chomp($line);
-  if ($line =~ /PhraseModel /) {
-   print $arr[$ind]."\n";
-   ++$ind;
+  if ($line =~ /^\[(.+)\]/) {
+    $feature_section = ($1 eq "feature");
+  }
+  if ($feature_section && $line =~ /PhraseDictionary/) {
+    print $arr[$ind]."\n";
+    ++$ind;
   }
-  elsif ($line =~ /LexicalReordering /) {
-   print $arr[$ind]."\n";
-   ++$ind;
+  elsif ($feature_section && $line =~ /LexicalReordering/) {
+    print $arr[$ind]."\n";
+    ++$ind;
   }  
   else {
     print "$line\n";
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index f4797f868..e9f46de26 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1284,16 +1284,17 @@ sub get_featlist_from_file {
   while (<$fh>) {
     $nr++;
     chomp;
-    /^(\S+)= (.+)$/;
-    my ($longname, $valuesStr) = ($1, $2);
-    next if (!defined($valuesStr));
+    if (/^(\S+)= (.+)$/) { # only for feature functions with dense features
+      my ($longname, $valuesStr) = ($1, $2);
+      next if (!defined($valuesStr));
     
-    my @values = split(/ /, $valuesStr);
-		foreach my $value (@values) {
-			push @errs, "$featlistfn:$nr:Bad initial value of $longname: $value\n"
-				if $value !~ /^[+-]?[0-9.\-e]+$/;
-			push @names, $longname;
-			push @startvalues, $value;
+      my @values = split(/ /, $valuesStr);
+		  foreach my $value (@values) {
+			  push @errs, "$featlistfn:$nr:Bad initial value of $longname: $value\n"
+				  if $value !~ /^[+-]?[0-9.\-e]+$/;
+			  push @names, $longname;
+			  push @startvalues, $value;
+      }
     }
   }
   close $fh;
@@ -1323,7 +1324,7 @@ sub get_order_of_scores_from_nbestlist {
   my $label = undef;
   my $sparse = 0; # we ignore sparse features here
   foreach my $tok (split /\s+/, $scores) {
-    if ($tok =~ /.+_.+:/) {
+    if ($tok =~ /.+_.+=/) {
       $sparse = 1;
     } elsif ($tok =~ /^([a-z][0-9a-z]*)=/i) {
       $label = $1;
author	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-17 11:37:29 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-17 11:37:29 +0400
commit	4cdffc8a891a3004a0102aa77ae26ceb91a1e881 (patch)
tree	815350e8007c863a098d04689bd35177cc61569b
parent	13991fc88fc6184139db46aa306789d855ef54cd (diff)