Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-17 11:37:29 +0400
committer phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-17 11:37:29 +0400
commit 4cdffc8a891a3004a0102aa77ae26ceb91a1e881 (patch)
tree 815350e8007c863a098d04689bd35177cc61569b
parent 13991fc88fc6184139db46aa306789d855ef54cd (diff)
fixes for sparse feature handling
-rw-r--r-- mert/Data.cpp 2
-rw-r--r-- mert/FeatureDataIterator.cpp 2
-rw-r--r-- mert/FeatureStats.cpp 4
-rw-r--r-- scripts/ems/experiment.meta 4
-rwxr-xr-x scripts/ems/experiment.perl 49
-rwxr-xr-x scripts/ems/support/substitute-filtered-tables.perl 35
-rwxr-xr-x scripts/training/mert-moses.pl 21
7 files changed, 68 insertions, 49 deletions
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 3f91c1376..1efa080a2 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -212,7 +212,7 @@ void Data::InitFeatureMap(const string& str) {
while (!buf.empty()) {
getNextPound(buf, substr);
- // string ending with ":" are skipped, because they are the names of the features
+ // string ending with "=" are skipped, because they are the names of the features
if (!EndsWith(substr, "=")) {
stringstream ss;
ss << tmp_name << "_" << tmp_index << " ";
diff --git a/mert/FeatureDataIterator.cpp b/mert/FeatureDataIterator.cpp
index a22112bb4..471da07ee 100644
--- a/mert/FeatureDataIterator.cpp
+++ b/mert/FeatureDataIterator.cpp
@@ -89,7 +89,7 @@ void FeatureDataIterator::readNext() {
StringPiece line = m_in->ReadLine();
m_next.push_back(FeatureDataItem());
for (TokenIter<AnyCharacter, true> token(line, AnyCharacter(" \t")); token; ++token) {
- TokenIter<AnyCharacterLast,false> value(*token,AnyCharacterLast(":"));
+ TokenIter<AnyCharacterLast,false> value(*token,AnyCharacterLast("="));
if (!value) throw FileFormatException(m_in->FileName(), line.as_string());
StringPiece first = *value;
++value;
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index 22f62e234..242d3fbd0 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -220,12 +220,12 @@ void FeatureStats::set(string &theString, const SparseVector& sparseWeights )
while (!theString.empty()) {
getNextPound(theString, substring);
// regular feature
- if (substring.find(":") == string::npos) {
+ if (substring.find("=") == string::npos) {
add(ConvertStringToFeatureStatsType(substring));
}
// sparse feature
else {
- size_t separator = substring.find_last_of(":");
+ size_t separator = substring.find_last_of("=");
addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
}
}
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index c82195b1a..d10bf2dbe 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -844,13 +844,13 @@ apply-filter
default-name: tuning/moses.filtered.ini
pass-if: TRAINING:binarize-all
ignore-if: use-hiero
- template: cp IN1/moses.ini OUT
+ template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
apply-filter-devtest
in: TRAINING:config filtered-dir-devtest
out: filtered-config-devtest
default-name: tuning/moses.filtered.devtest.ini
pass-if: TRAINING:binarize-all
- ignore-unless: use-mira
+ ignore-unless: use-mira
template: $moses-script-dir/ems/support/substitute-filtered-tables.perl IN1/moses.ini < IN > OUT
tune
in: filtered-config input reference filtered-config-devtest input-devtest reference-devtest
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index b4ebe161a..a4d94b00e 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -2100,15 +2100,12 @@ sub define_training_sigtest_filter {
&create_step($step_id,$cmd);
}
-sub define_training_create_config {
- my ($step_id) = @_;
-
- my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
- = &get_output_and_input($step_id);
+sub get_config_tables {
+ my ($config,$reordering_table,$phrase_translation_table,$generation_table,$domains) = @_;
my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
- #my $cmd = "$moses_src_dir/bin/create-ini ";
my $cmd = &backoff_and_get("TRAINING:create-ini");
+ $cmd = "$moses_src_dir/bin/create-ini" unless defined($cmd);
my %IN;
my %OUT;
@@ -2188,8 +2185,29 @@ sub define_training_create_config {
my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""),$extract_version);
$cmd .= "-unknown-word-label $unknown_word_label ";
}
+ my $additional_ini = &get("TRAINING:additional-ini");
+ if (&get("TRAINING:score-settings") &&
+ &get("TRAINING:score-settings") =~ /SparseCountBinFeature/) {
+ $additional_ini .= "<br>[report-sparse-features]<br>stm<br><br>";
+ $cmd .= "-sparse-translation-table ";
+ }
+ $cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
+ $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
+
+ return $cmd;
+}
+
+sub define_training_create_config {
+ my ($step_id) = @_;
+
+ my ($config,$reordering_table,$phrase_translation_table,$generation_table,$sparse_lexical_features,$domains,@LM)
+ = &get_output_and_input($step_id);
+
+ my $cmd = &get_config_tables($config,$reordering_table,$phrase_translation_table,$generation_table,$domains);
+
+ # sparse lexical features provide additional content for config file
+ $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
- # find out which language model files have been built
my @LM_SETS = &get_sets("LM");
my %INTERPOLATED_AWAY;
my %OUTPUT_FACTORS;
@@ -2257,18 +2275,6 @@ sub define_training_create_config {
$cmd .= "-lm $factor:$order:$lm_file:$type ";
}
- my $additional_ini = &get("TRAINING:additional-ini");
- if (&get("TRAINING:score-settings") &&
- &get("TRAINING:score-settings") =~ /SparseCountBinFeature/) {
- $additional_ini .= "<br>[report-sparse-features]<br>stm<br><br>";
- $cmd .= "-sparse-translation-table ";
- }
- $cmd .= "-additional-ini '$additional_ini' " if defined($additional_ini);
-
- # sparse lexical features provide additional content for config file
- $cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
- $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
-
&create_step($step_id,$cmd);
}
@@ -2608,9 +2614,12 @@ sub define_tuningevaluation_filter {
# create pseudo-config file
else {
$config = $tuning_flag ? "$dir/tuning/moses.table.ini.$VERSION" : "$dir/evaluation/$set.moses.table.ini.$VERSION";
+ $cmd = "touch $config\n";
$delete_config = 1;
- $cmd = "cp $dir/model/moses.ini.$VERSION $config \n";
+ $cmd .= &get_config_tables($config,$reordering_table,$phrase_translation_table,undef,$domains);
+
+ $cmd .= "-lm 0:3:$config:8\n"; # dummy kenlm 3-gram model on factor 0
}
# filter command
diff --git a/scripts/ems/support/substitute-filtered-tables.perl b/scripts/ems/support/substitute-filtered-tables.perl
index 530130aa8..3efb243d7 100755
--- a/scripts/ems/support/substitute-filtered-tables.perl
+++ b/scripts/ems/support/substitute-filtered-tables.perl
@@ -10,30 +10,39 @@ if (scalar @ARGV < 1 || ! -e $ARGV[0]) {
# read config sections about filtered tables
my @arr;
open(FILTERED, $ARGV[0]) or die "Cannot open: $!";
+my $feature_section = 0;
while(my $line = <FILTERED>) {
chomp($line);
- if ($line =~ /PhraseModel /) {
- print STDERR "pt:$line \n";
- push(@arr, $line);
+ if ($line =~ /^\[(.+)\]/) {
+ $feature_section = ($1 eq "feature");
+ }
+ next unless $feature_section;
+ if ($line =~ /PhraseDictionary/) {
+ print STDERR "pt:$line \n";
+ push(@arr, $line);
+ }
+ elsif ($line =~ /LexicalReordering/) {
+ print STDERR "ro:$line \n";
+ push(@arr, $line);
}
- elsif ($line =~ /LexicalReordering /) {
- print STDERR "ro:$line \n";
- push(@arr, $line);
- }
}
close(FILTERED);
# pass through master config file and replace table sections
my $ind = 0;
+$feature_section = 0;
while(my $line = <STDIN>) {
chomp($line);
- if ($line =~ /PhraseModel /) {
- print $arr[$ind]."\n";
- ++$ind;
+ if ($line =~ /^\[(.+)\]/) {
+ $feature_section = ($1 eq "feature");
+ }
+ if ($feature_section && $line =~ /PhraseDictionary/) {
+ print $arr[$ind]."\n";
+ ++$ind;
}
- elsif ($line =~ /LexicalReordering /) {
- print $arr[$ind]."\n";
- ++$ind;
+ elsif ($feature_section && $line =~ /LexicalReordering/) {
+ print $arr[$ind]."\n";
+ ++$ind;
}
else {
print "$line\n";
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index f4797f868..e9f46de26 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1284,16 +1284,17 @@ sub get_featlist_from_file {
while (<$fh>) {
$nr++;
chomp;
- /^(\S+)= (.+)$/;
- my ($longname, $valuesStr) = ($1, $2);
- next if (!defined($valuesStr));
+ if (/^(\S+)= (.+)$/) { # only for feature functions with dense features
+ my ($longname, $valuesStr) = ($1, $2);
+ next if (!defined($valuesStr));
- my @values = split(/ /, $valuesStr);
- foreach my $value (@values) {
- push @errs, "$featlistfn:$nr:Bad initial value of $longname: $value\n"
- if $value !~ /^[+-]?[0-9.\-e]+$/;
- push @names, $longname;
- push @startvalues, $value;
+ my @values = split(/ /, $valuesStr);
+ foreach my $value (@values) {
+ push @errs, "$featlistfn:$nr:Bad initial value of $longname: $value\n"
+ if $value !~ /^[+-]?[0-9.\-e]+$/;
+ push @names, $longname;
+ push @startvalues, $value;
+ }
}
}
close $fh;
@@ -1323,7 +1324,7 @@ sub get_order_of_scores_from_nbestlist {
my $label = undef;
my $sparse = 0; # we ignore sparse features here
foreach my $tok (split /\s+/, $scores) {
- if ($tok =~ /.+_.+:/) {
+ if ($tok =~ /.+_.+=/) {
$sparse = 1;
} elsif ($tok =~ /^([a-z][0-9a-z]*)=/i) {
$label = $1;