Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Barry Haddow <barry.haddow@gmail.com> 2012-07-06 00:02:15 +0400
committer: Barry Haddow <barry.haddow@gmail.com> 2012-07-06 00:02:15 +0400
commit: afd654eb74974ecdc0e1da62a6d21ee1f47bd8b5 (patch)
tree: 75bb37c8c000c22695bfc24a94a98fbd9a793d37 /scripts
parent: 278ec106039089e49392c5e9f99cdbc9b720a4e0 (diff)
parent: a8b7e40cecb3655671b06a5beadc56b6d73acb91 (diff)
Merge branch 'master' of github.com:moses-smt/mosesdecoder
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/ems/experiment.meta 8
-rwxr-xr-x scripts/ems/experiment.perl 23
-rwxr-xr-x scripts/ems/support/build-sparse-lexical-features.perl 95
-rwxr-xr-x scripts/ems/support/reuse-weights.perl 17
-rwxr-xr-x scripts/training/train-model.perl 25
5 files changed, 154 insertions, 14 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index b294fde9d..45f689736 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -395,8 +395,14 @@ build-generation-custom
rerun-on-change: generation-factors generation-type training-options script generation-corpus
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
+build-sparse-lexical
+ in: corpus
+ out: sparse-lexical
+ ignore-unless: sparse-lexical-features
+ default-name: model/most-frequent-words
+ template: $moses-script-dir/ems/support/build-sparse-lexical-features.perl IN $input-extension $output-extension OUT "$sparse-lexical-features"
create-config
- in: reordering-table phrase-translation-table generation-table INTERPOLATED-LM:binlm LM:binlm
+ in: reordering-table phrase-translation-table generation-table sparse-lexical INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 0f6d93d8b..aae4f7754 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -6,6 +6,14 @@ use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
+sub trim($)
+{ # return a copy of the argument with leading and trailing whitespace removed
+ my ($text) = @_; # copy, so the caller's value is left untouched
+ $text =~ s/\s+$//; # trailing whitespace, including a final newline
+ $text =~ s/^\s+//; # leading whitespace
+ return $text;
+}
+
my $host = `hostname`; chop($host);
print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
@@ -1770,7 +1778,7 @@ sub define_training_build_custom_generation {
sub define_training_create_config {
my ($step_id) = @_;
- my ($config,$reordering_table,$phrase_translation_table,$generation_table,@LM)
+ my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,@LM)
= &get_output_and_input($step_id);
my $cmd = &get_training_setting(9);
@@ -1794,9 +1802,10 @@ sub define_training_create_config {
}
# additional settings for factored models
- my $ptCmd = "$phrase_translation_table:$ptImpl";
+ my $ptCmd = $phrase_translation_table;
+ $ptCmd .= ":$ptImpl" if $ptImpl>0;
$ptCmd .= ":$numFF" if defined($numFF);
- $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table",$ptCmd);
+ $cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
$cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table;
$cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table) if $generation_table;
$cmd .= "-config $config ";
@@ -1894,6 +1903,9 @@ sub define_training_create_config {
my $additional_ini = &get("TRAINING:additional-ini");
$cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
+ # sparse lexical features provide additional content for config file
+ $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
+
&create_step($step_id,$cmd);
}
@@ -2223,10 +2235,10 @@ sub define_tuningevaluation_filter {
my $config = $tuning_flag ? "$dir/tuning/moses.table.ini.$VERSION" : "$dir/evaluation/$set.moses.table.ini.$VERSION";
my $cmd = &get_training_setting(9);
- my $ptCmd = "$phrase_translation_table:$ptImpl";
+ my $ptCmd = $phrase_translation_table;
+ $ptCmd .= ":$ptImpl" if $ptImpl>0;
$ptCmd .= ":$numFF" if defined($numFF);
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
-
$cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)
if $reordering_table;
# additional settings for hierarchical models
@@ -2243,7 +2255,6 @@ sub define_tuningevaluation_filter {
$cmd .= "-config $config\n";
# filter command
- my $sa_exec_dir = &get("TRAINING:suffix-array");
if ($sa_exec_dir) {
# suffix array
$cmd .= "$scripts/training/wrappers/adam-suffix-array/suffix-array-extract.sh $sa_exec_dir $phrase_translation_table $input_filter $filter_dir \n";
diff --git a/scripts/ems/support/build-sparse-lexical-features.perl b/scripts/ems/support/build-sparse-lexical-features.perl
new file mode 100755
index 000000000..81156f2f3
--- /dev/null
+++ b/scripts/ems/support/build-sparse-lexical-features.perl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+# Build necessary files for sparse lexical features (one ini snippet plus top-word lists):
+# * target word insertion  (top-N list for the output extension)
+# * source word deletion   (top-N list for the input extension)
+# * word translation       (top-N lists for both extensions)
+
+my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV; # corpus stem, input ext, output ext, output prefix, feature spec
+my $ini = ""; # ini sections collected here, written to "$outfile_prefix.ini" below
+my $report = ""; # short feature names listed under [report-sparse-features]
+my %ALREADY; # "extension,count" keys of top-word lists already built (see create_top_words)
+
+foreach my $feature_spec (split(/,\s*/,$specification)) { # comma-separated specs, e.g. "word-translation top 50 50"
+ my @SPEC = split(/\s+/,$feature_spec);
+ if ($SPEC[0] eq 'target-word-insertion') {
+ if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
+ my $file = &create_top_words($output_extension, $SPEC[2]);
+ $ini .= "[target-word-insertion-feature]\n0 $file\n\n";
+ $report .= "twi\n";
+ }
+ else {
+ die("ERROR: Unknown parameter specification in '$feature_spec'\n");
+ }
+ }
+ elsif ($SPEC[0] eq 'source-word-deletion') {
+ if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
+ my $file = &create_top_words($input_extension, $SPEC[2]);
+ $ini .= "[source-word-deletion-feature]\n0 $file\n\n";
+ $report .= "swd\n";
+ }
+ else {
+ die("ERROR: Unknown parameter specification in '$feature_spec'\n");
+ }
+ }
+ elsif ($SPEC[0] eq 'word-translation') {
+ if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/ && $SPEC[3] =~ /^\d+$/) { # two counts: input side first, then output side
+ my $file_in = &create_top_words($input_extension, $SPEC[2]);
+ my $file_out = &create_top_words($output_extension, $SPEC[3]);
+ $ini .= "[word-translation-feature]\n0 0 $file_in $file_out\n\n";
+ $report .= "wt\n";
+ }
+ else {
+ die("ERROR: Unknown parameter specification in '$feature_spec'\n");
+ }
+ }
+ else {
+ die("ERROR: Unknown feature type '$SPEC[0]' in specification '$feature_spec'\nfull spec: '$specification'\n");
+ }
+}
+
+open(INI,">","$outfile_prefix.ini") || die("ERROR: could not write ini file: $outfile_prefix.ini ($!)"); # fail loudly instead of silently writing nothing
+print INI $ini;
+print INI "\n[report-sparse-features]\n$report\n";
+print INI "\n[use-alignment-info]\ntrue\n\n";
+close(INI) || die("ERROR: could not close ini file: $outfile_prefix.ini ($!)"); # buffered write errors surface at close
+
+sub create_top_words { # write up to $count most frequent words of "$corpus.$extension" to a file; returns that file's name
+ my ($extension, $count) = @_;
+ my $file = "$outfile_prefix.$extension.top$count";
+ return $file if defined($ALREADY{"$extension,$count"}); # this list was already built during this run
+ $ALREADY{"$extension,$count"}++;
+
+ # count occurrences of each surface form in the corpus file
+ my %COUNT;
+ open(CORPUS,"<","$corpus.$extension") || die("ERROR: could not open corpus file: $corpus.$extension ($!)");
+ while(<CORPUS>) {
+ chomp; # chomp, not chop: a final line without a newline must keep its last character
+ foreach (split) {
+ $_ =~ s/\|.+//; # only surface factor at this point
+ $COUNT{$_}++ unless $_ eq "";
+ }
+ }
+ close(CORPUS);
+
+ # sort by descending count; the zero-padded prefix lets a plain string sort order the counts (up to 9 digits)
+ my @COUNT_WORD;
+ foreach (keys %COUNT) {
+ next if $COUNT{$_} <= 3; # avoid large tail
+ next if $_ =~ /:/; # avoid colon bug
+ push @COUNT_WORD,sprintf("%09d %s",$COUNT{$_},$_);
+ }
+ my @SORTED = reverse sort @COUNT_WORD;
+
+ # write top n to file, one word per line
+ open(TOP,">",$file) || die("ERROR: could not write top-word file: $file ($!)");
+ for(my $i=0; $i<$count && $i<scalar(@SORTED); $i++) {
+ $SORTED[$i] =~ /^\d+ (.+)$/; # strip the count prefix added by sprintf above
+ print TOP "$1\n";
+ }
+ close(TOP) || die("ERROR: could not close top-word file: $file ($!)"); # buffered write errors surface at close
+
+ return $file;
+}
diff --git a/scripts/ems/support/reuse-weights.perl b/scripts/ems/support/reuse-weights.perl
index b51f23236..1bff4bbd0 100755
--- a/scripts/ems/support/reuse-weights.perl
+++ b/scripts/ems/support/reuse-weights.perl
@@ -10,17 +10,27 @@ my ($weight_file) = @ARGV;
my %WEIGHT;
my $current_weight = "";
+my $weights_file_spec = "";
+my $weights_file_flag = 0;
open(WEIGHT,$weight_file)
|| die("ERROR: could not open weight file: $weight_file");
while(<WEIGHT>) {
- if (/^\[weight\-(\S+)\]/) {
+ if (/^\[weight-file\]/) {
+ $weights_file_spec = "\n".$_;
+ $weights_file_flag = 1;
+ }
+ elsif (/^\[weight\-(\S+)\]/) {
$current_weight = $1;
}
elsif ($current_weight && /^(([\-\d\.]+)([Ee][+-]?[\d]+)?)$/) {
push @{$WEIGHT{$current_weight}},$1;
}
+ elsif ($weights_file_flag && !/^\[/ && !/^\s*$/) {
+ $weights_file_spec .= $_;
+ }
elsif (/^\[/) {
- $current_weight = "";
+ $current_weight = "";
+ $weights_file_flag = 0;
}
}
close(WEIGHT);
@@ -67,3 +77,6 @@ foreach my $weight (keys %WEIGHT) {
}
}
}
+
+print $weights_file_spec;
+
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index d402599f4..6e18581af 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -36,7 +36,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
- $_ADDITIONAL_INI,
+ $_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
$_DICTIONARY, $_EPPEX, $IGNORE);
my $_CORES = 1;
@@ -121,8 +121,9 @@ $_HELP = 1
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
'eppex:s' => \$_EPPEX,
- 'additional-ini=s' => \$_ADDITIONAL_INI,
- 'cores=i' => \$_CORES
+ 'additional-ini=s' => \$_ADDITIONAL_INI,
+ 'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
+ 'cores=i' => \$_CORES
);
if ($_HELP) {
@@ -1482,6 +1483,10 @@ sub score_phrase_phrase_extract {
my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
+ my $COUNT_BIN = "";
+ if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /CountBinFeature ([\s\d]*\d)/) {
+ $COUNT_BIN = $1;
+ }
my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
@@ -1580,6 +1585,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --OnlyDirect" if $ONLY_DIRECT;
$cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
$cmd .= " --LowCountFeature" if $LOW_COUNT;
+ $cmd .= " --CountBinFeature $COUNT_BIN" if $COUNT_BIN;
$cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY;
@@ -1837,6 +1843,9 @@ sub create_ini {
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
+ if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /(CountBinFeature [\s\d]*\d)/) {
+ $basic_weight_count += scalar split(/\s+/,$1);
+ }
$basic_weight_count++ if $_PCFG;
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
$num_of_ttables++;
@@ -1849,8 +1858,9 @@ sub create_ini {
$file = shift @SPECIFIED_TABLE;
my @toks = split(/:/,$file);
$file = $toks[0];
- $phrase_table_impl = $toks[1];
-
+ if (@toks > 1) {
+ $phrase_table_impl = $toks[1];
+ }
if (@toks == 3) {
$basic_weight_count = $toks[2];
}
@@ -1989,10 +1999,15 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
+ # get additional content for config file from switch or file
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";
foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
}
+ if ($_ADDITIONAL_INI_FILE) {
+ print INI "\n# additional settings\n\n";
+ print INI `cat $_ADDITIONAL_INI_FILE`;
+ }
close(INI);
}