moved config creation back into train-model.perl

author: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-19 06:28:02 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-19 06:28:02 +0400
commit: 542cd72c63f359e3d0c4c0e04dabb4e8138a66f2 (patch)
tree: bf01447ba7bc8ba79f564c2f5923a93f1e7b2629 /scripts/training
parent: 668bda8623ae6186550696286a67eb769854b9cf (diff)
1 files changed, 128 insertions, 126 deletions
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index da4590932..2f15d8d44 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -38,7 +38,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
    $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
    @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
-   $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
+   @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
    $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $_NUM_LATTICE_FEATURES, $IGNORE);
 my $_BASELINE_CORPUS = "";
 my $_CORES = 1;
@@ -60,10 +60,10 @@ $_HELP = 1
 		       'write-lexical-counts' => \$_LEXICAL_COUNTS,
 		       'model-dir=s' => \$_MODEL_DIR,
 		       'temp-dir=s' => \$_TEMP_DIR,
-           'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
-           'sort-batch-size=i' => \$_SORT_BATCH_SIZE,
-           'sort-compress=s' => \$_SORT_COMPRESS,
-           'sort-parallel=i' => \$_SORT_PARALLEL,
+		       'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
+		       'sort-batch-size=i' => \$_SORT_BATCH_SIZE,
+		       'sort-compress=s' => \$_SORT_COMPRESS,
+		       'sort-parallel=i' => \$_SORT_PARALLEL,
 		       'extract-file=s' => \$_EXTRACT_FILE,
 		       'alignment=s' => \$_ALIGNMENT,
 		       'alignment-file=s' => \$_ALIGNMENT_FILE,
@@ -95,7 +95,7 @@ $_HELP = 1
 		       'generation-factors=s' => \$_GENERATION_FACTORS,
 		       'decoding-steps=s' => \$_DECODING_STEPS,
 		       'decoding-graph-backoff=s' => \$_DECODING_GRAPH_BACKOFF,
-    			 'bin-dir=s' => \$IGNORE,
+		       'bin-dir=s' => \$IGNORE,
 		       'scripts-root-dir=s' => \$IGNORE,
 		       'factor-delimiter=s' => \$_FACTOR_DELIMITER,
 		       'phrase-translation-table=s' => \@_PHRASE_TABLE,
@@ -124,19 +124,18 @@ $_HELP = 1
 		       'memscore:s' => \$_MEMSCORE,
 		       'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
 		       'dictionary=s' => \$_DICTIONARY,
-           'sparse-phrase-features' => \$_SPARSE_PHRASE_FEATURES,
+		       'sparse-phrase-features' => \$_SPARSE_PHRASE_FEATURES,
 		       'eppex:s' => \$_EPPEX,
 		       'additional-ini=s' => \@_ADDITIONAL_INI, 
 		       'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE, 
-		       'sparse-translation-table' => \$_SPARSE_TRANSLATION_TABLE,
 		       'baseline-alignment-model=s{8}' => \@_BASELINE_ALIGNMENT_MODEL,
 		       'baseline-extract=s' => \$_BASELINE_EXTRACT,
 		       'baseline-corpus=s' => \$_BASELINE_CORPUS,
 		       'baseline-alignment=s' => \$_BASELINE_ALIGNMENT,
 		       'cores=i' => \$_CORES,
-           'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
-           'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
-           'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
+		       'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
+		       'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
+		       'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
                );
 
 if ($_HELP) {
@@ -1838,14 +1837,15 @@ sub create_ini {
       die "ERROR: No translation steps defined, cannot prepare [input-factors] section\n";
     }
 
+    # mapping steps
     my %stepsused;
     print INI "\n# mapping steps
 [mapping]\n";
-   my $path = 0;
-   my %FIRST_TTABLE;
-   foreach (split(/:/,$___DECODING_STEPS)) {
-     my $first_ttable_flag = 1;
-     foreach (split(/,/,$_)) {
+    my $path = 0;
+    my %FIRST_TTABLE;
+    foreach (split(/:/,$___DECODING_STEPS)) {
+      my $first_ttable_flag = 1;
+      foreach (split(/,/,$_)) {
        s/t/T /g; 
        s/g/G /g;
        my ($type, $num) = split /\s+/;
@@ -1868,13 +1868,14 @@ sub create_ini {
        print INI "$_\n";
      }
    }
-   print INI "\n# translation tables: table type (hierarchical(0), textual (0), binary (1)), source-factors, target-factors, number of scores, file 
-# OLD FORMAT is still handled for back-compatibility
-# OLD FORMAT translation tables: source-factors, target-factors, number of scores, file 
-# OLD FORMAT a binary table type (1) is assumed 
-[ttable-file]\n";
-   my $num_of_ttables = 0;
+
+   my $feature_spec = "";
+   my $weight_spec = "";
+   # translation tables
+   my $i=0;
    my @SPECIFIED_TABLE = @_PHRASE_TABLE;
+
+   # number of weights
    my $basic_weight_count = 4; # both directions, lex and phrase
    $basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
    $basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
@@ -1892,142 +1893,147 @@ sub create_ini {
      $basic_weight_count += 2**$count-1 if $method eq "Subset";
    }     
    $basic_weight_count++ if $_PCFG;
+
+   # go over each table
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
-  	$num_of_ttables++;
-    my $ff = $f;
-    $ff =~ s/\-/ /;
-    my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
-    my $phrase_table_impl = ($_HIERARCHICAL? 6 : 0);
-
-		if (scalar(@SPECIFIED_TABLE)) {
-	    $file = shift @SPECIFIED_TABLE;
-	    my @toks = split(/:/,$file);
-			$file = $toks[0];
-      if (@toks > 1) {
-			  $phrase_table_impl = $toks[1];
-			}
-			if (@toks == 3) {
-				$basic_weight_count = $toks[2];
-			}
-		}
-		else {
+     my ($input_factor,$output_factor) = split(/\-/,$f);
+     my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
+     my $phrase_table_impl = ($_HIERARCHICAL? 6 : 0);
+
+     # specified file name?
+     if (scalar(@SPECIFIED_TABLE)) {
+       $file = shift @SPECIFIED_TABLE;
+       my @toks = split(/:/,$file);
+       $file = $toks[0];
+       if (@toks > 1) {
+         $phrase_table_impl = $toks[1];
+       }
+       if (@toks == 3) {
+         $basic_weight_count = $toks[2];
+       }
+     }
 
-		}
-		
-    print INI "$phrase_table_impl $ff $basic_weight_count $file";
-    print INI " sparse" if defined($_SPARSE_TRANSLATION_TABLE);
-    print INI "\n";
+     # name of type
+     my $phrase_table_impl_name = "UnknownPtImplementation";
+     $phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==0;
+     $phrase_table_impl_name = "PhraseDictionaryBinary" if $phrase_table_impl==1;
+     $phrase_table_impl_name = "PhraseDictionaryOnDisk" if $phrase_table_impl==2;
+     $phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==6;
+
+     # sum up...
+     $feature_spec .= "$phrase_table_impl_name name=TranslationModel$i num-features=$basic_weight_count path=$file input-factor=$input_factor output-factor=$output_factor\n";
+     $weight_spec .= "TranslationModel$i=";
+     for(my $j=0;$j<$basic_weight_count;$j++) { $weight_spec .= " 0.2"; }
+     $weight_spec .= "\n";
+
+     $i++;
+   }
+
+   if ($i != $stepsused{"T"}) {
+     print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$i\n";
+     exit 1 if $i < $stepsused{"T"}; # fatal to define less
    }
+
+   # glue grammar
    if ($_GLUE_GRAMMAR) {
      &full_path(\$___GLUE_GRAMMAR_FILE);
-     print INI "6 0 0 1 $___GLUE_GRAMMAR_FILE\n";
+     $feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i num-features=1 path=$___GLUE_GRAMMAR_FILE input-factor=0 output-factor=0\n";
+     $weight_spec .= "TranslationModel$i= 1.0\n";
    }
-   if ($num_of_ttables != $stepsused{"T"}) {
-     print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$num_of_ttables\n";
-     exit 1 if $num_of_ttables < $stepsused{"T"}; # fatal to define less
+
+   # ttable limit
+   print INI "\n\# limit on how many phrase translations e for each phrase f are loaded\n# 0 = all elements loaded\n[ttable-limit]\n20\n";
+   foreach(1 .. ($i-1)) {
+     print INI (defined($FIRST_TTABLE{$_})?"20":"0")."\n";
    }
 
-    if (defined $___GENERATION_FACTORS) {
+   # generation model
+   if (defined $___GENERATION_FACTORS) {
       my @TYPE = @_GENERATION_TYPE;
-      print INI "\n# generation models: source-factors, target-factors, number-of-weights, filename\n";
-      print INI "[generation-file]\n";
-      my $cnt = 0;
+      my $i=0;
       my @SPECIFIED_TABLE = @_GENERATION_TABLE;
       foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
         my $weights_per_generation_model = 2;
         $weights_per_generation_model = 1 if scalar(@TYPE) && (shift @TYPE) eq 'single';
-        $cnt++;
-        my $ff = $f;
-        $ff =~ s/\-/ /;
+        my ($input_factor,$output_factor) = split(/\-/,$f);
 	my $file = "$___MODEL_DIR/generation.$f";
 	$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
-        print INI "$ff $weights_per_generation_model $file\n";
+        $file .= ".gz" if ! -e $file && -e $file.".gz";
+        $feature_spec .= "Generation name=GenerationModel$i num-features=$weights_per_generation_model path=$file input-factor=$input_factor output-factor=$output_factor\n";
+        $weight_spec .= "GenerationModel$i= 0.3".($weights_per_generation_model==2?" 0":"")."\n";
+        $i++;
       }
-      if ($cnt != $stepsused{"G"}) {
-        print STDERR "WARNING: Your [mapping-steps] require generation steps up to id $stepsused{G} but you defined generation steps 0..$cnt\n";
-        exit 1 if $cnt < $stepsused{"G"}; # fatal to define less
+      if ($i != $stepsused{"G"}) {
+        print STDERR "WARNING: Your [mapping-steps] require generation steps up to id $stepsused{G} but you defined generation steps 0..$i\n";
+        exit 1 if $i < $stepsused{"G"}; # fatal to define less
       }
-    } else {
-      print INI "\n# no generation models, no generation-file section\n";
-    }
-
-  print INI "\n# language models: type(srilm/irstlm), factors, order, file\n[lmodel-file]\n";
-  foreach my $lm (@___LM) {
-    my ($f, $o, $fn, $type) = @{$lm};
-    if ($fn !~ /^\//) {
-      my $path = `pwd`; chop($path);
-      $fn = $path."/".$fn;
-    }
-    $type = 0 unless $type;
-    print INI "$type $f $o $fn\n";
-  }
-
-  print INI "\n\n\# limit on how many phrase translations e for each phrase f are loaded\n# 0 = all elements loaded\n[ttable-limit]\n20\n";
-  foreach(1 .. ($num_of_ttables-1)) {
-    print INI (defined($FIRST_TTABLE{$_})?"20":"0")."\n";
-  }
-  print INI "\n";
+   }
 
-  my $weight_d_count = 1;
+  # lexicalized reordering model
   if ($___REORDERING ne "distance") {
-    my $file = "# distortion (reordering) files\n\[distortion-file]\n";
-    my $factor_i = 0;
+    my $i = 0;
  
     my @SPECIFIED_TABLE = @_REORDERING_TABLE;
     foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
+        my ($input_factor,$output_factor) = split(/\-/,$factor);
 	foreach my $model (@REORDERING_MODELS) {
-	    $weight_d_count += $model->{"numfeatures"};
 	    my $table_file = "$___MODEL_DIR/reordering-table";
 	    $table_file .= ".$factor" unless $___NOT_FACTORED;
 	    $table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
 	    $table_file .= ".";
 	    $table_file .= $model->{"filename"};
 	    $table_file .= ".gz";
-	    $file .= "$factor ".$model->{"config"}." ".$model->{"numfeatures"}." $table_file\n";
+            $feature_spec .= "LexicalReordering name=LexicalReordering$i num-features=".$model->{"numfeatures"}." type=".$model->{"config"}." input-factor=$input_factor output-factor=$output_factor path=$table_file\n"; 
+            $weight_spec .= "LexicalReordering$i=";
+            for(my $j=0;$j<$model->{"numfeatures"};$j++) { $weight_spec .= " 0.3"; }
+            $weight_spec .= "\n";
 	}
-        $factor_i++;
+        $i++;
       }
-      print INI $file."\n";
-  }
-  else {
-    $weight_d_count = 1;
   }
-  
+
+  # distance-based reordering
   if (!$_HIERARCHICAL) {
-    print INI "# distortion (reordering) weight\n[weight-d]\n";
-    for(my $i=0;$i<$weight_d_count;$i++) { 
-      print INI "".(0.6/(scalar @REORDERING_MODELS+1))."\n";
-    }
+    $feature_spec .= "Distortion\n";
+    $weight_spec .= "Distortion0= 0.3\n";
   }
-  print INI "\n# language model weights\n[weight-l]\n";
-  my $lmweighttotal = 0.5;
-  my $lmoovweighttotal = 0.1;
-  foreach(1..scalar @___LM) {
-    printf INI "%.4f\n", $lmweighttotal / scalar @___LM;
-    if ($_LMODEL_OOV_FEATURE) {
-      printf INI "%.4f\n", $lmoovweighttotal / scalar @___LM;
+
+  # language model
+  $i=0;
+  foreach my $lm (@___LM) {
+    my ($f, $o, $fn, $type) = @{$lm};
+    if ($fn !~ /^\//) {
+      my $path = `pwd`; chop($path);
+      $fn = $path."/".$fn;
     }
+    $type = 0 unless $type;
+    my $type_name = "UnknownLM";
+    $type_name = "SRILM" if $type == 0;
+    $type_name = "IRSTLM" if $type == 1;
+    $type_name = "KENLM lazyken=0" if $type == 8;
+    $type_name = "KENLM lazyken=1" if $type == 9;
+    
+    $feature_spec .= "$type_name name=LM$i factor=$f path=$fn order=$o\n";
+    $weight_spec .= "LM$i= 0.5".($_LMODEL_OOV_FEATURE?" 0.1":"")."\n";
+    $i++;
   }
-
-  print INI "\n\n# translation model weights\n[weight-t]\n";
-  foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
-     for(1..$basic_weight_count) {
-       printf INI "%.2f\n", 1/$basic_weight_count;
-     }
+  if ($_LMODEL_OOV_FEATURE) {
+    print INI "\n# language model OOV feature enabled\n[lmodel-oov-feature]\n1\n\n";
   }
-  print INI "1.0\n" if $_HIERARCHICAL; # glue grammar
 
-    if (defined $___GENERATION_FACTORS) {
-      print INI "\n# generation model weights\n";
-      print INI "[weight-generation]\n";
-      my @TYPE = @_GENERATION_TYPE;
-      foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
-        print INI "0.3\n";
-        print INI "0\n" unless scalar(@TYPE) && (shift @TYPE) eq 'single';
-      }
-    } else {
-      print INI "\n# no generation models, no weight-generation section\n";
-    }
+  
+  # feature functions and weights
+  print INI "\n# feature functions\n";
+  print INI "[feature]\n";
+  print INI "UnknownWordPenalty\n";
+  print INI "WordPenalty\n";
+  print INI $feature_spec;
+
+  print INI "\n# dense weights for feature functions\n";
+  print INI "[weight]\n";
+  print INI "UnknownWordPenalty0= 1\n";
+  print INI "WordPenalty0= -1\n";
+  print INI $weight_spec;
 
   if ($_NUM_LATTICE_FEATURES) {
     print INI "\n\n#lattice or confusion net weights\n[weight-i]\n";
@@ -2037,8 +2043,8 @@ sub create_ini {
     print "\n";
   }
 
-  print INI "\n# word penalty\n[weight-w]\n-1\n\n";
-
+  # hierarchical model settings
+  print INI "\n";
   if ($_HIERARCHICAL) {
     print INI "[unknown-lhs]\n$_UNKNOWN_WORD_LABEL_FILE\n\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
     print INI "[cube-pruning-pop-limit]\n1000\n\n";
@@ -2058,10 +2064,6 @@ sub create_ini {
     print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
   }
 
-  if ($_LMODEL_OOV_FEATURE) {
-    print INI "\n# language model OOV feature enabled\n[lmodel-oov-feature]\n1\n\n";
-  }
-
   # get addititional content for config file from switch or file
   if ($_ADDITIONAL_INI) {
     print INI "\n# additional settings\n\n";
author	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-19 06:28:02 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-19 06:28:02 +0400
commit	542cd72c63f359e3d0c4c0e04dabb4e8138a66f2 (patch)
tree	bf01447ba7bc8ba79f564c2f5923a93f1e7b2629 /scripts/training
parent	668bda8623ae6186550696286a67eb769854b9cf (diff)