github.com/moses-smt/mosesdecoder.git
author     phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>  2011-09-07 20:37:33 +0400
committer  phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>  2011-09-07 20:37:33 +0400
commit     41a184943720ddf85ac83339ecffa6db15ed8efb (patch)
tree       6955e980cfb7f665c45d7cace9710c920c33f055 /scripts
parent     9fee4a97f251ea482479a4882a2bca93e360b61a (diff)
support for sparse feature functions (mert support only when using PRO)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rwxr-xr-x  scripts/training/mert-moses.pl                      136
-rw-r--r--  scripts/training/phrase-extract/consolidate.cpp     150
-rw-r--r--  scripts/training/phrase-extract/extract-rules.cpp     2
-rw-r--r--  scripts/training/phrase-extract/extract.cpp           2
-rw-r--r--  scripts/training/phrase-extract/tables-core.cpp       4
-rwxr-xr-x  scripts/training/train-model.perl                    58
6 files changed, 258 insertions, 94 deletions
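For reference, the weight-parsing logic that the new get_weights_from_mert helper (added to mert-moses.pl below) applies to PRO output can be sketched standalone: dense features appear as "F<index> <value>" lines, sparse features as "<name_with_underscore> <value>" lines. This is a minimal sketch; the file name mert.out and the dense feature count are illustrative, not part of the commit.

use strict;
use warnings;

my $weight_count = 14;              # number of dense features (illustrative)
my @weight = (0) x $weight_count;
my %sparse_weights;
my $sum = 0;

open(my $in, '<', 'mert.out') or die "Can't open mert.out: $!";
while (<$in>) {
    if (/^F(\d+) ([\-\.\de]+)/) {          # dense feature: "F<index> <value>"
        $weight[$1] = $2;
        $sum += abs($2);
    } elsif (/^(.+_.+) ([\-\.\de]+)/) {    # sparse feature, identified by an underscore in its name
        $sparse_weights{$1} = $2;
    }
}
close $in;

if ($sum) {
    $_ /= $sum for @weight;                # dense weights are normalized to unit L1 norm
}
print join(' ', @weight), "\n";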
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 15e1d0d00..9a37cc137 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -11,6 +11,7 @@
# Excerpts from revision history
# Sept 2011 multi-threaded mert (Barry Haddow)
+# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK)
# Jul 2011 simplifications (Ondrej Bojar)
# -- rely on moses' -show-weights instead of parsing moses.ini
# ... so moses is also run once *before* mert starts, checking
@@ -287,8 +288,6 @@ $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper
$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
if !defined $moses_parallel_cmd;
-
-
if (!defined $mertdir) {
$mertdir = "$SCRIPTS_ROOTDIR/../mert";
print STDERR "Assuming --mertdir=$mertdir\n";
@@ -357,13 +356,11 @@ die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_par
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
die "Not executable: $___DECODER" if ! -x $___DECODER;
-
my $input_abs = ensure_full_path($___DEV_F);
die "File not found: $___DEV_F (interpreted as $input_abs)."
if ! -e $input_abs;
$___DEV_F = $input_abs;
-
# Option to pass to qsubwrapper and moses-parallel
my $pass_old_sge = $old_sge ? "-old-sge" : "";
@@ -372,7 +369,6 @@ die "File not executable: $___DECODER (interpreted as $decoder_abs)."
if ! -x $decoder_abs;
$___DECODER = $decoder_abs;
-
my $ref_abs = ensure_full_path($___DEV_E);
# check if English dev set (reference translations) exist and store a list of all references
my @references;
@@ -409,9 +405,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;
-
-
-
#store current directory and create the working directory (if needed)
my $cwd = `pawd 2>/dev/null`;
if(!$cwd){$cwd = `pwd`;}
@@ -431,17 +424,16 @@ my $mert_logfile = "mert.log";
my $weights_in_file = "init.opt";
my $weights_out_file = "weights.txt";
-
# set start run
my $start_run = 1;
my $bestpoint = undef;
my $devbleu = undef;
+my $sparse_weights_file = undef;
my $prev_feature_file = undef;
my $prev_score_file = undef;
my $prev_init_file = undef;
-
if ($___FILTER_PHRASE_TABLE) {
my $outdir = "filtered";
if (-e "$outdir/moses.ini") {
@@ -471,7 +463,6 @@ else{
$___CONFIG_ORIG = $___CONFIG;
}
-
# we run moses to check validity of moses.ini and to obtain all the feature
# names
my $featlist = get_featlist_from_moses($___CONFIG);
@@ -579,28 +570,19 @@ if ($continue) {
print STDERR "All needed data are available\n";
print STDERR "Loading information from last step ($step)\n";
- open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile";
- while (<IN>) {
- if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
- $bestpoint = $1;
- $devbleu = $2;
- last;
- }
- }
- close IN;
+ my %dummy; # sparse features
+ ($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy);
die "Failed to parse mert.log, missed Best point there."
if !defined $bestpoint || !defined $devbleu;
print "($step) BEST at $step $bestpoint => $devbleu at ".`date`;
-
my @newweights = split /\s+/, $bestpoint;
# Sanity check: order of lambdas must match
sanity_check_order_of_lambdas($featlist,
"gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
-
+
# update my cache of lambda values
$featlist->{"values"} = \@newweights;
-
}
else{
print STDERR "No previous data are needed\n";
@@ -630,10 +612,10 @@ while(1) {
print "run $run start at ".`date`;
# In case something dies later, we might wish to have a copy
- create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
+ create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file);
- # skip if the user wanted
+ # skip running the decoder if the user wanted
if (!$skip_decoder) {
print "($run) run decoder to produce n-best lists\n";
$nbest_file = run_decoder($featlist, $run, $need_to_normalize);
@@ -648,8 +630,6 @@ while(1) {
$need_to_normalize = 0;
}
-
-
# extract score statistics and features from the nbest lists
print STDERR "Scoring the nbestlist.\n";
@@ -740,7 +720,7 @@ while(1) {
if ! -s $weights_out_file;
- # backup copies
+ # backup copies
safesystem ("\\cp -f extract.err run$run.extract.err") or die;
safesystem ("\\cp -f extract.out run$run.extract.out") or die;
if ($___PAIRWISE_RANKED_OPTIMIZER) { safesystem ("\\cp -f pro.data run$run.pro.data") or die; }
@@ -751,34 +731,10 @@ while(1) {
print "run $run end at ".`date`;
- $bestpoint = undef;
- $devbleu = undef;
- if ($___PAIRWISE_RANKED_OPTIMIZER) {
- open(IN,"run$run.$mert_outfile") or die "Can't open run$run.$mert_outfile";
- my (@WEIGHT,$sum);
- foreach (@CURR) { push @WEIGHT, 0; }
- while(<IN>) {
- if (/^F(\d+) ([\-\.\de]+)/) {
- $WEIGHT[$1] = $2;
- $sum += abs($2);
- }
- }
- $devbleu = "unknown";
- foreach (@WEIGHT) { $_ /= $sum; }
- $bestpoint = join(" ",@WEIGHT);
- close IN;
- }
- else {
- open(IN,"run$run.$mert_logfile") or die "Can't open run$run.$mert_logfile";
- while (<IN>) {
- if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
- $bestpoint = $1;
- $devbleu = $2;
- last;
- }
- }
- close IN;
- }
+ my %sparse_weights; # sparse features
+ ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights);
+
+
die "Failed to parse mert.log, missed Best point there."
if !defined $bestpoint || !defined $devbleu;
print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`;
@@ -788,6 +744,15 @@ while(1) {
$featlist->{"values"} = \@newweights;
+ if (scalar keys %sparse_weights) {
+ $sparse_weights_file = "run".($run+1).".sparse-weights";
+ open(SPARSE,">".$sparse_weights_file);
+ foreach my $feature (keys %sparse_weights) {
+ print SPARSE "$feature $sparse_weights{$feature}\n";
+ }
+ close(SPARSE);
+ }
+
## additional stopping criterion: weights have not changed
my $shouldstop = 1;
for(my $i=0; $i<@CURR; $i++) {
@@ -864,6 +829,43 @@ chdir($cwd);
} # end of local scope
+sub get_weights_from_mert {
+ my ($outfile,$logfile,$weight_count,$sparse_weights) = @_;
+ my ($bestpoint,$devbleu);
+ if ($___PAIRWISE_RANKED_OPTIMIZER) {
+ open(IN,$outfile) or die "Can't open $outfile";
+ my (@WEIGHT,$sum);
+ for(my $i=0;$i<$weight_count;$i++) { push @WEIGHT, 0; }
+ while(<IN>) {
+ # regular features
+ if (/^F(\d+) ([\-\.\de]+)/) {
+ $WEIGHT[$1] = $2;
+ $sum += abs($2);
+ }
+ # sparse features
+ elsif(/^(.+_.+) ([\-\.\de]+)/) {
+ $$sparse_weights{$1} = $2;
+ }
+ }
+ $devbleu = "unknown";
+ foreach (@WEIGHT) { $_ /= $sum; }
+ $bestpoint = join(" ",@WEIGHT);
+ close IN;
+ }
+ else {
+ open(IN,$logfile) or die "Can't open $logfile";
+ while (<IN>) {
+ if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
+ $bestpoint = $1;
+ $devbleu = $2;
+ last;
+ }
+ }
+ close IN;
+ }
+ return ($bestpoint,$devbleu);
+}
+
sub run_decoder {
my ($featlist, $run, $need_to_normalize) = @_;
my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
@@ -984,6 +986,7 @@ sub get_featlist_from_moses {
$nr++;
chomp;
my ($longname, $feature, $value) = split / /;
+ next if $value eq "sparse";
push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
if $value !~ /^[+-]?[0-9.e]+$/;
push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
@@ -1015,14 +1018,20 @@ sub get_order_of_scores_from_nbestlist {
my @order = ();
my $label = undef;
+ my $sparse = 0; # we ignore sparse features here
foreach my $tok (split /\s+/, $scores) {
- if ($tok =~ /^([a-z][0-9a-z]*):/i) {
+ if ($tok =~ /.+_.+:/) {
+ $sparse = 1;
+ } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
$label = $1;
} elsif ($tok =~ /^-?[-0-9.e]+$/) {
- # a score found, remember it
- die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
- if !defined $label;
- push @order, $label;
+ if (!$sparse) {
+ # a score found, remember it
+ die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
+ if !defined $label;
+ push @order, $label;
+ }
+ $sparse = 0;
} else {
die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
}
@@ -1037,6 +1046,7 @@ sub create_config {
my $featlist = shift; # the lambdas we should write
my $iteration = shift; # just for verbosity
my $bleu_achieved = shift; # just for verbosity
+ my $sparse_weights_file = shift; # only defined when optimizing sparse features
my %P; # the hash of all parameters we wish to override
@@ -1076,6 +1086,10 @@ sub create_config {
push @{$P{$name}}, $val;
}
+ if (defined($sparse_weights_file)) {
+ push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
+ }
+
# create new moses.ini decoder config file by cloning and overriding the original one
open(INI,$infn) or die "Can't read $infn";
delete($P{"config"}); # never output
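The sparse-weights hand-off in the mert-moses.pl changes above works in two steps: after each tuning run the sparse weights are written to run<N+1>.sparse-weights, and create_config then points the decoder at that file via a "weights-file" entry in the cloned moses.ini. A minimal sketch of both steps; the feature names, run number, and working directory are hypothetical.

my %sparse_weights = ('pl_the' => 0.13, 'wt_house' => -0.02);   # hypothetical sparse features
my $run = 3;
my $working_dir = '/path/to/mert-work';

# step 1: dump sparse weights, one "name value" pair per line
my $sparse_weights_file = 'run' . ($run + 1) . '.sparse-weights';
open(my $sparse, '>', $sparse_weights_file) or die "Can't write $sparse_weights_file: $!";
print $sparse "$_ $sparse_weights{$_}\n" for keys %sparse_weights;
close $sparse;

# step 2: the override that create_config adds to the generated moses.ini
#   [weights-file]
#   /path/to/mert-work/run4.sparse-weights
print "[weights-file]\n$working_dir/$sparse_weights_file\n";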
diff --git a/scripts/training/phrase-extract/consolidate.cpp b/scripts/training/phrase-extract/consolidate.cpp
index 53a141221..8d31a1d27 100644
--- a/scripts/training/phrase-extract/consolidate.cpp
+++ b/scripts/training/phrase-extract/consolidate.cpp
@@ -36,10 +36,15 @@ using namespace std;
bool hierarchicalFlag = false;
bool onlyDirectFlag = false;
bool phraseCountFlag = true;
+bool lowCountFlag = false;
+bool goodTuringFlag = false;
+bool kneserNeyFlag = false;
bool logProbFlag = false;
-char line[LINE_MAX_LENGTH];
+inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
-void processFiles( char*, char*, char* );
+char line[LINE_MAX_LENGTH];
+void processFiles( char*, char*, char*, char* );
+void loadCountOfCounts( char* );
bool getLine( istream &fileP, vector< string > &item );
vector< string > splitLine();
@@ -55,6 +60,7 @@ int main(int argc, char* argv[])
char* &fileNameDirect = argv[1];
char* &fileNameIndirect = argv[2];
char* &fileNameConsolidated = argv[3];
+ char* fileNameCountOfCounts;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@@ -66,6 +72,25 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
phraseCountFlag = false;
cerr << "not including the phrase count feature\n";
+ } else if (strcmp(argv[i],"--GoodTuring") == 0) {
+ goodTuringFlag = true;
+ if (i+1==argc) {
+ cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
+ exit(1);
+ }
+ fileNameCountOfCounts = argv[++i];
+ cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+ } else if (strcmp(argv[i],"--KneserNey") == 0) {
+ kneserNeyFlag = true;
+ if (i+1==argc) {
+ cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
+ exit(1);
+ }
+ fileNameCountOfCounts = argv[++i];
+ cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+ } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
+ lowCountFlag = true;
+ cerr << "including the low count feature\n";
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
@@ -75,11 +100,61 @@ int main(int argc, char* argv[])
}
}
- processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
+ processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
+}
+
+vector< float > countOfCounts;
+vector< float > goodTuringDiscount;
+float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
+void loadCountOfCounts( char* fileNameCountOfCounts )
+{
+ Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
+ if (fileCountOfCounts.fail()) {
+ cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
+ exit(1);
+ }
+ istream &fileP = fileCountOfCounts;
+
+ countOfCounts.push_back(0.0);
+ while(1) {
+ if (fileP.eof()) break;
+ SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+ if (fileP.eof()) break;
+ if (totalCount < 0)
+ totalCount = atof(line); // total number of distinct phrase pairs
+ else
+ countOfCounts.push_back( atof(line) );
+ }
+ fileCountOfCounts.Close();
+
+ // compute Good Turing discounts
+ if (goodTuringFlag) {
+ goodTuringDiscount.push_back(0.01); // floor value
+ for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
+ goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
+ if (goodTuringDiscount[i]>1)
+ goodTuringDiscount[i] = 1;
+ if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
+ goodTuringDiscount[i] = goodTuringDiscount[i-1];
+ }
+ }
+
+ // compute Kneser Ney co-efficients [Chen&Goodman, 1998]
+ float Y = countOfCounts[1] / (countOfCounts[1] + 2*countOfCounts[2]);
+ kneserNey_D1 = 1 - 2*Y * countOfCounts[2] / countOfCounts[1];
+ kneserNey_D2 = 2 - 3*Y * countOfCounts[3] / countOfCounts[2];
+ kneserNey_D3 = 3 - 4*Y * countOfCounts[4] / countOfCounts[3];
+ // sanity constraints
+ if (kneserNey_D1 > 0.9) kneserNey_D1 = 0.9;
+ if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9;
+ if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
}
-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
{
+ if (goodTuringFlag || kneserNeyFlag)
+ loadCountOfCounts( fileNameCountOfCounts );
+
// open input files
Moses::InputFileStream fileDirect(fileNameDirect);
Moses::InputFileStream fileIndirect(fileNameIndirect);
@@ -134,29 +209,67 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// output hierarchical phrase pair (with separated labels)
fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
- // probs
- fileConsolidated << " ||| ";
+ // SCORES ...
+ fileConsolidated << " |||";
+ vector<string> directCounts = tokenize(itemDirect[4].c_str());
+ vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+ float countF = atof(directCounts[0].c_str());
+ float countE = atof(indirectCounts[0].c_str());
+ float countEF = atof(indirectCounts[1].c_str());
+ float n1_F, n1_E;
+ if (kneserNeyFlag) {
+ n1_F = atof(directCounts[2].c_str());
+ n1_E = atof(indirectCounts[2].c_str());
+ }
+
+ // Good Turing discounting
+ float adjustedCountEF = countEF;
+ if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
+ adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
+ float adjustedCountEF_indirect = adjustedCountEF;
+
+ // Kneser Ney discounting [Foster et al, 2006]
+ if (kneserNeyFlag) {
+ float D = kneserNey_D3;
+ if (countEF < 2) D = kneserNey_D1;
+ if (countEF < 3) D = kneserNey_D2;
+ if (D > countEF) D = countEF - 0.01; // sanity constraint
+
+ float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
+ float alpha_F = D * n1_F / countF; // available mass
+ adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
+
+ // for indirect
+ float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
+ float alpha_E = D * n1_E / countE; // available mass
+ adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
+ }
+
+ // prob indirect
if (!onlyDirectFlag) {
- fileConsolidated << itemIndirect[2]; // prob indirect
+ fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
+ fileConsolidated << " " << itemIndirect[2];
}
- fileConsolidated << " " << itemDirect[2]; // prob direct
+
+ // prob direct
+ fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
+ fileConsolidated << " " << itemDirect[2];
+
+ // phrase count feature
if (phraseCountFlag) {
- fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature
+ fileConsolidated << " " << maybeLogProb(2.718);
+ }
+
+ // low count feature
+ if (lowCountFlag) {
+ fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
}
// alignment
fileConsolidated << " ||| " << itemDirect[3];
// counts, for debugging
- vector<string> directCounts = tokenize(itemDirect[4].c_str());
- vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
- fileConsolidated << "||| " << indirectCounts[0] << " " << directCounts[0];
- // output rule count if present in either file
- if (directCounts.size() > 1) {
- fileConsolidated << " " << directCounts[1];
- } else if (indirectCounts.size() > 1) {
- fileConsolidated << " " << indirectCounts[1];
- }
+ fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF;
fileConsolidated << endl;
}
@@ -165,6 +278,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated.close();
}
+
bool getLine( istream &fileP, vector< string > &item )
{
if (fileP.eof())
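The discounting added to consolidate.cpp above follows standard Good-Turing smoothing and the Kneser-Ney discounts of Chen & Goodman (1998). The same coefficient computations, sketched in Perl with hypothetical count-of-count values n[c] (countOfCounts[c] in the C++ code, with n[0] a placeholder):

my @n = (0, 4000, 1500, 800, 500);   # n[c] = number of phrase pairs seen c times (hypothetical)

# Good-Turing discount for count c: roughly ((c+1)/c) * n[c+1]/n[c],
# smoothed by +0.1, capped at 1, and kept non-decreasing, as in the C++ code
my @gt = (0.01);                     # floor value for count 0
for my $c (1 .. $#n - 1) {
    my $d = (($c + 1) / $c) * (($n[$c + 1] + 0.1) / ($n[$c] + 0.1));
    $d = 1 if $d > 1;
    $d = $gt[$c - 1] if $d < $gt[$c - 1];
    push @gt, $d;
}

# Kneser-Ney discounts D1, D2, D3+ [Chen & Goodman, 1998]
my $Y  = $n[1] / ($n[1] + 2 * $n[2]);
my $D1 = 1 - 2 * $Y * $n[2] / $n[1];
my $D2 = 2 - 3 * $Y * $n[3] / $n[2];
my $D3 = 3 - 4 * $Y * $n[4] / $n[3];

printf "GT discounts: %s\nKN: D1=%.3f D2=%.3f D3=%.3f\n",
       join(' ', map { sprintf '%.3f', $_ } @gt), $D1, $D2, $D3;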
diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp
index 15367f900..0b56f0c29 100644
--- a/scripts/training/phrase-extract/extract-rules.cpp
+++ b/scripts/training/phrase-extract/extract-rules.cpp
@@ -45,7 +45,7 @@
#include "tables-core.h"
#include "XmlTree.h"
-#define LINE_MAX_LENGTH 60000
+#define LINE_MAX_LENGTH 500000
using namespace std;
diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp
index 5a63b6345..0848723df 100644
--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@@ -24,7 +24,7 @@
using namespace std;
-#define LINE_MAX_LENGTH 60000
+#define LINE_MAX_LENGTH 500000
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;
diff --git a/scripts/training/phrase-extract/tables-core.cpp b/scripts/training/phrase-extract/tables-core.cpp
index c8911f6a9..399026930 100644
--- a/scripts/training/phrase-extract/tables-core.cpp
+++ b/scripts/training/phrase-extract/tables-core.cpp
@@ -29,6 +29,10 @@ vector<string> tokenize( const char* input )
return token;
}
+bool isNonTerminal( const WORD &symbol ) {
+ return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
+}
+
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
map<WORD, WORD_ID>::iterator i = lookup.find( word );
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index d754aa885..ec0ecf79d 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -34,9 +34,10 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
+ $_ADDITIONAL_INI,
$_DICTIONARY, $_EPPEX);
-my $debug = 0; # debug this script, do not delete any files in debug mode
+my $debug = 1; # debug this script, do not delete any files in debug mode
# the following line is set installation time by 'make release'. BEWARE!
my $BINDIR="/home/pkoehn/statmt/bin";
@@ -109,7 +110,7 @@ $_HELP = 1
'memscore:s' => \$_MEMSCORE,
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
- 'eppex:s' => \$_EPPEX,
+ 'additional-ini=s' => \$_ADDITIONAL_INI
);
if ($_HELP) {
@@ -1372,11 +1373,28 @@ sub score_phrase {
sub score_phrase_phrase_extract {
my ($ttable_file,$lexical_file,$extract_file) = @_;
+ # remove consolidation options
my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
- my $CORE_SCORE_OPTIONS = defined($_SCORE_OPTIONS) ? $_SCORE_OPTIONS : "";
- $CORE_SCORE_OPTIONS =~ s/\-+OnlyDirect//i;
- $CORE_SCORE_OPTIONS =~ s/\-+NoPhraseCount//i;
+ my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
+ my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
+ my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
+ if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
+ $UNALIGNED_FW_COUNT = 1;
+ $UNALIGNED_FW_F = $1;
+ $UNALIGNED_FW_E = $2;
+ }
+ my $GOOD_TURING = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /GoodTuring/);
+ my $KNESER_NEY = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /KneserNey/);
+ my $LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LogProb/);
+ my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
+ my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
+ my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
+ my $CORE_SCORE_OPTIONS = "";
+ $CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
+ $CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB;
+ $CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
+
my $substep = 1;
for my $direction ("f2e","e2f") {
next if $___CONTINUE && -e "$ttable_file.half.$direction";
@@ -1405,6 +1423,11 @@ sub score_phrase_phrase_extract {
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
+ $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
+ $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq "";
+ $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
+ $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
+ $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
print $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
@@ -1423,8 +1446,13 @@ sub score_phrase_phrase_extract {
return if $___CONTINUE && -e "$ttable_file.gz";
my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
+ $cmd .= " --LogProb" if $LOG_PROB;
+ $cmd .= " --NegLogProb" if $NEG_LOG_PROB;
$cmd .= " --OnlyDirect" if $ONLY_DIRECT;
$cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
+ $cmd .= " --LowCountFeature" if $LOW_COUNT;
+ $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING;
+ $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
if (! $___DONT_ZIP) {
@@ -1681,6 +1709,13 @@ sub create_ini {
[ttable-file]\n";
my $num_of_ttables = 0;
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
+ my $basic_weight_count = 4; # both directions, lex and phrase
+ $basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
+ $basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
+ $basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/;
+ $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
+ $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
+ $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
$num_of_ttables++;
my $ff = $f;
@@ -1688,10 +1723,6 @@ sub create_ini {
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
my $phrase_table_impl = ($_HIERARCHICAL ? 6 : 0);
- my $basic_weight_count = 4; # both directions, lex and phrase
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
- $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
print INI "$phrase_table_impl $ff $basic_weight_count $file\n";
}
if ($_GLUE_GRAMMAR) {
@@ -1783,10 +1814,6 @@ sub create_ini {
print INI "\n\n# translation model weights\n[weight-t]\n";
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
- my $basic_weight_count = 4; # both directions, lex and phrase
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
- $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
for(1..$basic_weight_count) {
printf INI "%.2f\n", 1/$basic_weight_count;
}
@@ -1826,6 +1853,11 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
+ if ($_ADDITIONAL_INI) {
+ print INI "\n# additional settings\n\n";
+ foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
+ }
+
close(INI);
}
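The train-model.perl change above consolidates the translation-model weight count, previously computed in two places, into a single $basic_weight_count that also accounts for the new features. A quick illustration of the arithmetic; the option string is hypothetical.

my $_SCORE_OPTIONS = "--GoodTuring --UnalignedPenalty --LowCountFeature";

my $basic_weight_count = 4;                                          # both directions, lex and phrase
$basic_weight_count -= 2 if $_SCORE_OPTIONS =~ /NoLex/;
$basic_weight_count += 2 if $_SCORE_OPTIONS =~ /UnalignedPenalty/;   # word ins/del
$basic_weight_count += 2 if $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/;
$basic_weight_count /= 2 if $_SCORE_OPTIONS =~ /OnlyDirect/;
$basic_weight_count++ unless $_SCORE_OPTIONS =~ /NoPhraseCount/;     # phrase count feature
$basic_weight_count++ if $_SCORE_OPTIONS =~ /LowCountFeature/;       # low count feature

print "$basic_weight_count\n";   # 4 + 2 + 1 + 1 = 8 weights for this option string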