Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk>, 2013-05-01 22:20:05 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk>, 2013-05-01 22:20:05 +0400
commit: cd8915647b74cd60cb259c2ec8ba5230970389f5 (patch)
tree: 6c4357ecb255b3d480a9c729fe20c096512921cb /scripts/training
parent: 8a1e944bb428a0af9f6c82c26e5633361ce4052c (diff)
support for Chris Dyer's fast-align; bug fix with sparse word translations feature; threshold pruning in filter
Diffstat (limited to 'scripts/training')
-rwxr-xr-x scripts/training/filter-model-given-input.pl 25
-rwxr-xr-x scripts/training/train-model.perl 17
2 files changed, 37 insertions, 5 deletions
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index d994fbcef..6323096be 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -36,6 +36,7 @@ my $ZCAT = "gzip -cd";
# get optional parameters
my $opt_hierarchical = 0;
my $binarizer = undef;
+my $min_score = undef;
my $opt_min_non_initial_rule_count = undef;
my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
@@ -43,6 +44,7 @@ GetOptions(
"gzip!" => \$opt_gzip,
"Hierarchical" => \$opt_hierarchical,
"Binarizer=s" => \$binarizer,
+ "MinScore=s" => \$min_score,
"MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
) or exit(1);
@@ -52,11 +54,20 @@ my $config = shift;
my $input = shift;
if (!defined $dir || !defined $config || !defined $input) {
- print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical]\n";
+ print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*]\n";
exit 1;
}
$dir = ensure_full_path($dir);
+# decode min-score definitions
+my %MIN_SCORE;
+if ($min_score) {
+ foreach (split(/ *, */,$min_score)) {
+ my ($id,$score) = split(/ *: */);
+ $MIN_SCORE{$id} = $score;
+ print STDERR "score $id must be at least $score\n";
+ }
+}
# buggy directory in place?
if (-d $dir && ! -e "$dir/info") {
print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n";
@@ -262,6 +273,18 @@ for(my $i=0;$i<=$#TABLE;$i++) {
my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
$foreign =~ s/ $//;
if (defined($PHRASE_USED{$factors}{$foreign})) {
+ # handle min_score thresholds
+ if ($min_score) {
+ my @ITEM = split(/ *\|\|\| */,$rest);
+ if(scalar (@ITEM)>2) { # do not filter reordering table
+ my @SCORE = split(/ /,$ITEM[1]);
+ my $okay = 1;
+ foreach my $id (keys %MIN_SCORE) {
+ $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
+ }
+ next unless $okay;
+ }
+ }
print FILE_OUT $entry;
$used++;
}
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index e4292007e..680495602 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1468,21 +1468,23 @@ sub score_phrase_factored {
&score_phrase($file,$___LEXICAL_FILE,$___EXTRACT_FILE);
}
else {
+ my $table_id = 0;
foreach my $factor (split(/\+/,$___TRANSLATION_FACTORS)) {
print STDERR "(6) [$factor] score phrases @ ".`date`;
my ($factor_f,$factor_e) = split(/\-/,$factor);
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").".$factor";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
- &score_phrase($file,$___LEXICAL_FILE.".".$factor,$___EXTRACT_FILE.".".$factor);
+ &score_phrase($file,$___LEXICAL_FILE.".".$factor,$___EXTRACT_FILE.".".$factor,$table_id);
+ $table_id++;
}
}
}
sub score_phrase {
- my ($ttable_file,$lexical_file,$extract_file) = @_;
+ my ($ttable_file,$lexical_file,$extract_file,$table_id) = @_;
if ($___PHRASE_SCORER eq "phrase-extract") {
- &score_phrase_phrase_extract($ttable_file,$lexical_file,$extract_file);
+ &score_phrase_phrase_extract($ttable_file,$lexical_file,$extract_file,$table_id);
} elsif ($___PHRASE_SCORER eq "memscore") {
&score_phrase_memscore($ttable_file,$lexical_file,$extract_file);
} else {
@@ -1491,7 +1493,7 @@ sub score_phrase {
}
sub score_phrase_phrase_extract {
- my ($ttable_file,$lexical_file,$extract_file) = @_;
+ my ($ttable_file,$lexical_file,$extract_file,$table_id) = @_;
# distinguish between score and consolidation options
my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
@@ -1502,6 +1504,13 @@ sub score_phrase_phrase_extract {
$COUNT_BIN = $1 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /\-CountBinFeature ([\s\d]*\d)/;
$DOMAIN = $1 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /(\-+[a-z]*Domain[a-z]+ .+)/i;
$DOMAIN =~ s/ \-.+//g;
+ if ($DOMAIN =~ /^(.+) table ([\d\,]+) *$/) {
+ my ($main_spec,$specified_tables) = ($1,$2);
+ $DOMAIN = "--IgnoreSentenceId";
+ foreach my $specified_table_id (split(/,/,$specified_tables)) {
+ $DOMAIN = $main_spec if $specified_table_id == $table_id;
+ }
+ }
my $SINGLETON = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /Singleton/);
my $CROSSEDNONTERM = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /CrossedNonTerm/);