Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-19 06:28:02 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-19 06:28:02 +0400
commit: 542cd72c63f359e3d0c4c0e04dabb4e8138a66f2 (patch)
tree: bf01447ba7bc8ba79f564c2f5923a93f1e7b2629 /scripts/training
parent: 668bda8623ae6186550696286a67eb769854b9cf (diff)
moved config creation back into train-model.perl
Diffstat (limited to 'scripts/training')
-rwxr-xr-x scripts/training/train-model.perl 254
1 files changed, 128 insertions, 126 deletions
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index da4590932..2f15d8d44 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -38,7 +38,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
@_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
- $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
+ @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
$_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $_NUM_LATTICE_FEATURES, $IGNORE);
my $_BASELINE_CORPUS = "";
my $_CORES = 1;
@@ -60,10 +60,10 @@ $_HELP = 1
'write-lexical-counts' => \$_LEXICAL_COUNTS,
'model-dir=s' => \$_MODEL_DIR,
'temp-dir=s' => \$_TEMP_DIR,
- 'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
- 'sort-batch-size=i' => \$_SORT_BATCH_SIZE,
- 'sort-compress=s' => \$_SORT_COMPRESS,
- 'sort-parallel=i' => \$_SORT_PARALLEL,
+ 'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
+ 'sort-batch-size=i' => \$_SORT_BATCH_SIZE,
+ 'sort-compress=s' => \$_SORT_COMPRESS,
+ 'sort-parallel=i' => \$_SORT_PARALLEL,
'extract-file=s' => \$_EXTRACT_FILE,
'alignment=s' => \$_ALIGNMENT,
'alignment-file=s' => \$_ALIGNMENT_FILE,
@@ -95,7 +95,7 @@ $_HELP = 1
'generation-factors=s' => \$_GENERATION_FACTORS,
'decoding-steps=s' => \$_DECODING_STEPS,
'decoding-graph-backoff=s' => \$_DECODING_GRAPH_BACKOFF,
- 'bin-dir=s' => \$IGNORE,
+ 'bin-dir=s' => \$IGNORE,
'scripts-root-dir=s' => \$IGNORE,
'factor-delimiter=s' => \$_FACTOR_DELIMITER,
'phrase-translation-table=s' => \@_PHRASE_TABLE,
@@ -124,19 +124,18 @@ $_HELP = 1
'memscore:s' => \$_MEMSCORE,
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
- 'sparse-phrase-features' => \$_SPARSE_PHRASE_FEATURES,
+ 'sparse-phrase-features' => \$_SPARSE_PHRASE_FEATURES,
'eppex:s' => \$_EPPEX,
'additional-ini=s' => \@_ADDITIONAL_INI,
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
- 'sparse-translation-table' => \$_SPARSE_TRANSLATION_TABLE,
'baseline-alignment-model=s{8}' => \@_BASELINE_ALIGNMENT_MODEL,
'baseline-extract=s' => \$_BASELINE_EXTRACT,
'baseline-corpus=s' => \$_BASELINE_CORPUS,
'baseline-alignment=s' => \$_BASELINE_ALIGNMENT,
'cores=i' => \$_CORES,
- 'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
- 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
- 'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
+ 'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
+ 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
+ 'num-lattice-features=i' => \$_NUM_LATTICE_FEATURES,
);
if ($_HELP) {
@@ -1838,14 +1837,15 @@ sub create_ini {
die "ERROR: No translation steps defined, cannot prepare [input-factors] section\n";
}
+ # mapping steps
my %stepsused;
print INI "\n# mapping steps
[mapping]\n";
- my $path = 0;
- my %FIRST_TTABLE;
- foreach (split(/:/,$___DECODING_STEPS)) {
- my $first_ttable_flag = 1;
- foreach (split(/,/,$_)) {
+ my $path = 0;
+ my %FIRST_TTABLE;
+ foreach (split(/:/,$___DECODING_STEPS)) {
+ my $first_ttable_flag = 1;
+ foreach (split(/,/,$_)) {
s/t/T /g;
s/g/G /g;
my ($type, $num) = split /\s+/;
@@ -1868,13 +1868,14 @@ sub create_ini {
print INI "$_\n";
}
}
- print INI "\n# translation tables: table type (hierarchical(0), textual (0), binary (1)), source-factors, target-factors, number of scores, file
-# OLD FORMAT is still handled for back-compatibility
-# OLD FORMAT translation tables: source-factors, target-factors, number of scores, file
-# OLD FORMAT a binary table type (1) is assumed
-[ttable-file]\n";
- my $num_of_ttables = 0;
+
+ my $feature_spec = "";
+ my $weight_spec = "";
+ # translation tables
+ my $i=0;
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
+
+ # number of weights
my $basic_weight_count = 4; # both directions, lex and phrase
$basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
@@ -1892,142 +1893,147 @@ sub create_ini {
$basic_weight_count += 2**$count-1 if $method eq "Subset";
}
$basic_weight_count++ if $_PCFG;
+
+ # go over each table
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
- $num_of_ttables++;
- my $ff = $f;
- $ff =~ s/\-/ /;
- my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
- my $phrase_table_impl = ($_HIERARCHICAL? 6 : 0);
-
- if (scalar(@SPECIFIED_TABLE)) {
- $file = shift @SPECIFIED_TABLE;
- my @toks = split(/:/,$file);
- $file = $toks[0];
- if (@toks > 1) {
- $phrase_table_impl = $toks[1];
- }
- if (@toks == 3) {
- $basic_weight_count = $toks[2];
- }
- }
- else {
+ my ($input_factor,$output_factor) = split(/\-/,$f);
+ my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
+ my $phrase_table_impl = ($_HIERARCHICAL? 6 : 0);
+
+ # specified file name?
+ if (scalar(@SPECIFIED_TABLE)) {
+ $file = shift @SPECIFIED_TABLE;
+ my @toks = split(/:/,$file);
+ $file = $toks[0];
+ if (@toks > 1) {
+ $phrase_table_impl = $toks[1];
+ }
+ if (@toks == 3) {
+ $basic_weight_count = $toks[2];
+ }
+ }
- }
-
- print INI "$phrase_table_impl $ff $basic_weight_count $file";
- print INI " sparse" if defined($_SPARSE_TRANSLATION_TABLE);
- print INI "\n";
+ # name of type
+ my $phrase_table_impl_name = "UnknownPtImplementation";
+ $phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==0;
+ $phrase_table_impl_name = "PhraseDictionaryBinary" if $phrase_table_impl==1;
+ $phrase_table_impl_name = "PhraseDictionaryOnDisk" if $phrase_table_impl==2;
+ $phrase_table_impl_name = "PhraseDictionaryMemory" if $phrase_table_impl==6;
+
+ # sum up...
+ $feature_spec .= "$phrase_table_impl_name name=TranslationModel$i num-features=$basic_weight_count path=$file input-factor=$input_factor output-factor=$output_factor\n";
+ $weight_spec .= "TranslationModel$i=";
+ for(my $j=0;$j<$basic_weight_count;$j++) { $weight_spec .= " 0.2"; }
+ $weight_spec .= "\n";
+
+ $i++;
+ }
+
+ if ($i != $stepsused{"T"}) {
+ print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$i\n";
+ exit 1 if $i < $stepsused{"T"}; # fatal to define less
}
+
+ # glue grammar
if ($_GLUE_GRAMMAR) {
&full_path(\$___GLUE_GRAMMAR_FILE);
- print INI "6 0 0 1 $___GLUE_GRAMMAR_FILE\n";
+ $feature_spec .= "PhraseDictionaryMemory name=TranslationModel$i num-features=1 path=$___GLUE_GRAMMAR_FILE input-factor=0 output-factor=0\n";
+ $weight_spec .= "TranslationModel$i= 1.0\n";
}
- if ($num_of_ttables != $stepsused{"T"}) {
- print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$num_of_ttables\n";
- exit 1 if $num_of_ttables < $stepsused{"T"}; # fatal to define less
+
+ # ttable limit
+ print INI "\n\# limit on how many phrase translations e for each phrase f are loaded\n# 0 = all elements loaded\n[ttable-limit]\n20\n";
+ foreach(1 .. ($i-1)) {
+ print INI (defined($FIRST_TTABLE{$_})?"20":"0")."\n";
}
- if (defined $___GENERATION_FACTORS) {
+ # generation model
+ if (defined $___GENERATION_FACTORS) {
my @TYPE = @_GENERATION_TYPE;
- print INI "\n# generation models: source-factors, target-factors, number-of-weights, filename\n";
- print INI "[generation-file]\n";
- my $cnt = 0;
+ my $i=0;
my @SPECIFIED_TABLE = @_GENERATION_TABLE;
foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
my $weights_per_generation_model = 2;
$weights_per_generation_model = 1 if scalar(@TYPE) && (shift @TYPE) eq 'single';
- $cnt++;
- my $ff = $f;
- $ff =~ s/\-/ /;
+ my ($input_factor,$output_factor) = split(/\-/,$f);
my $file = "$___MODEL_DIR/generation.$f";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
- print INI "$ff $weights_per_generation_model $file\n";
+ $file .= ".gz" if ! -e $file && -e $file.".gz";
+ $feature_spec .= "Generation name=GenerationModel$i num-features=$weights_per_generation_model path=$file input-factor=$input_factor output-factor=$output_factor\n";
+ $weight_spec .= "GenerationModel$i= 0.3".($weights_per_generation_model==2?" 0":"")."\n";
+ $i++;
}
- if ($cnt != $stepsused{"G"}) {
- print STDERR "WARNING: Your [mapping-steps] require generation steps up to id $stepsused{G} but you defined generation steps 0..$cnt\n";
- exit 1 if $cnt < $stepsused{"G"}; # fatal to define less
+ if ($i != $stepsused{"G"}) {
+ print STDERR "WARNING: Your [mapping-steps] require generation steps up to id $stepsused{G} but you defined generation steps 0..$i\n";
+ exit 1 if $i < $stepsused{"G"}; # fatal to define less
}
- } else {
- print INI "\n# no generation models, no generation-file section\n";
- }
-
- print INI "\n# language models: type(srilm/irstlm), factors, order, file\n[lmodel-file]\n";
- foreach my $lm (@___LM) {
- my ($f, $o, $fn, $type) = @{$lm};
- if ($fn !~ /^\//) {
- my $path = `pwd`; chop($path);
- $fn = $path."/".$fn;
- }
- $type = 0 unless $type;
- print INI "$type $f $o $fn\n";
- }
-
- print INI "\n\n\# limit on how many phrase translations e for each phrase f are loaded\n# 0 = all elements loaded\n[ttable-limit]\n20\n";
- foreach(1 .. ($num_of_ttables-1)) {
- print INI (defined($FIRST_TTABLE{$_})?"20":"0")."\n";
- }
- print INI "\n";
+ }
- my $weight_d_count = 1;
+ # lexicalized reordering model
if ($___REORDERING ne "distance") {
- my $file = "# distortion (reordering) files\n\[distortion-file]\n";
- my $factor_i = 0;
+ my $i = 0;
my @SPECIFIED_TABLE = @_REORDERING_TABLE;
foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
+ my ($input_factor,$output_factor) = split(/\-/,$factor);
foreach my $model (@REORDERING_MODELS) {
- $weight_d_count += $model->{"numfeatures"};
my $table_file = "$___MODEL_DIR/reordering-table";
$table_file .= ".$factor" unless $___NOT_FACTORED;
$table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
$table_file .= ".";
$table_file .= $model->{"filename"};
$table_file .= ".gz";
- $file .= "$factor ".$model->{"config"}." ".$model->{"numfeatures"}." $table_file\n";
+ $feature_spec .= "LexicalReordering name=LexicalReordering$i num-features=".$model->{"numfeatures"}." type=".$model->{"config"}." input-factor=$input_factor output-factor=$output_factor path=$table_file\n";
+ $weight_spec .= "LexicalReordering$i=";
+ for(my $j=0;$j<$model->{"numfeatures"};$j++) { $weight_spec .= " 0.3"; }
+ $weight_spec .= "\n";
}
- $factor_i++;
+ $i++;
}
- print INI $file."\n";
- }
- else {
- $weight_d_count = 1;
}
-
+
+ # distance-based reordering
if (!$_HIERARCHICAL) {
- print INI "# distortion (reordering) weight\n[weight-d]\n";
- for(my $i=0;$i<$weight_d_count;$i++) {
- print INI "".(0.6/(scalar @REORDERING_MODELS+1))."\n";
- }
+ $feature_spec .= "Distortion\n";
+ $weight_spec .= "Distortion0= 0.3\n";
}
- print INI "\n# language model weights\n[weight-l]\n";
- my $lmweighttotal = 0.5;
- my $lmoovweighttotal = 0.1;
- foreach(1..scalar @___LM) {
- printf INI "%.4f\n", $lmweighttotal / scalar @___LM;
- if ($_LMODEL_OOV_FEATURE) {
- printf INI "%.4f\n", $lmoovweighttotal / scalar @___LM;
+
+ # language model
+ $i=0;
+ foreach my $lm (@___LM) {
+ my ($f, $o, $fn, $type) = @{$lm};
+ if ($fn !~ /^\//) {
+ my $path = `pwd`; chop($path);
+ $fn = $path."/".$fn;
}
+ $type = 0 unless $type;
+ my $type_name = "UnknownLM";
+ $type_name = "SRILM" if $type == 0;
+ $type_name = "IRSTLM" if $type == 1;
+ $type_name = "KENLM lazyken=0" if $type == 8;
+ $type_name = "KENLM lazyken=1" if $type == 9;
+
+ $feature_spec .= "$type_name name=LM$i factor=$f path=$fn order=$o\n";
+ $weight_spec .= "LM$i= 0.5".($_LMODEL_OOV_FEATURE?" 0.1":"")."\n";
+ $i++;
}
-
- print INI "\n\n# translation model weights\n[weight-t]\n";
- foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
- for(1..$basic_weight_count) {
- printf INI "%.2f\n", 1/$basic_weight_count;
- }
+ if ($_LMODEL_OOV_FEATURE) {
+ print INI "\n# language model OOV feature enabled\n[lmodel-oov-feature]\n1\n\n";
}
- print INI "1.0\n" if $_HIERARCHICAL; # glue grammar
- if (defined $___GENERATION_FACTORS) {
- print INI "\n# generation model weights\n";
- print INI "[weight-generation]\n";
- my @TYPE = @_GENERATION_TYPE;
- foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
- print INI "0.3\n";
- print INI "0\n" unless scalar(@TYPE) && (shift @TYPE) eq 'single';
- }
- } else {
- print INI "\n# no generation models, no weight-generation section\n";
- }
+
+ # feature functions and weights
+ print INI "\n# feature functions\n";
+ print INI "[feature]\n";
+ print INI "UnknownWordPenalty\n";
+ print INI "WordPenalty\n";
+ print INI $feature_spec;
+
+ print INI "\n# dense weights for feature functions\n";
+ print INI "[weight]\n";
+ print INI "UnknownWordPenalty0= 1\n";
+ print INI "WordPenalty0= -1\n";
+ print INI $weight_spec;
if ($_NUM_LATTICE_FEATURES) {
print INI "\n\n#lattice or confusion net weights\n[weight-i]\n";
@@ -2037,8 +2043,8 @@ sub create_ini {
print "\n";
}
- print INI "\n# word penalty\n[weight-w]\n-1\n\n";
-
+ # hierarchical model settings
+ print INI "\n";
if ($_HIERARCHICAL) {
print INI "[unknown-lhs]\n$_UNKNOWN_WORD_LABEL_FILE\n\n" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
print INI "[cube-pruning-pop-limit]\n1000\n\n";
@@ -2058,10 +2064,6 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
- if ($_LMODEL_OOV_FEATURE) {
- print INI "\n# language model OOV feature enabled\n[lmodel-oov-feature]\n1\n\n";
- }
-
# get addititional content for config file from switch or file
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";