From 41a184943720ddf85ac83339ecffa6db15ed8efb Mon Sep 17 00:00:00 2001 From: phkoehn Date: Wed, 7 Sep 2011 16:37:33 +0000 Subject: support for sparse feature functions (mert support only when using PRO) git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230 --- scripts/training/mert-moses.pl | 136 +++++++++++--------- scripts/training/phrase-extract/consolidate.cpp | 150 +++++++++++++++++++--- scripts/training/phrase-extract/extract-rules.cpp | 2 +- scripts/training/phrase-extract/extract.cpp | 2 +- scripts/training/phrase-extract/tables-core.cpp | 4 + scripts/training/train-model.perl | 58 +++++++-- 6 files changed, 258 insertions(+), 94 deletions(-) (limited to 'scripts') diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 15e1d0d00..9a37cc137 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -11,6 +11,7 @@ # Excerpts from revision history # Sept 2011 multi-threaded mert (Barry Haddow) +# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK) # Jul 2011 simplifications (Ondrej Bojar) # -- rely on moses' -show-weights instead of parsing moses.ini # ... so moses is also run once *before* mert starts, checking @@ -287,8 +288,6 @@ $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper $moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl" if !defined $moses_parallel_cmd; - - if (!defined $mertdir) { $mertdir = "$SCRIPTS_ROOTDIR/../mert"; print STDERR "Assuming --mertdir=$mertdir\n"; @@ -357,13 +356,11 @@ die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_par die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper; die "Not executable: $___DECODER" if ! -x $___DECODER; - my $input_abs = ensure_full_path($___DEV_F); die "File not found: $___DEV_F (interpreted as $input_abs)." if ! 
-e $input_abs; $___DEV_F = $input_abs; - # Option to pass to qsubwrapper and moses-parallel my $pass_old_sge = $old_sge ? "-old-sge" : ""; @@ -372,7 +369,6 @@ die "File not executable: $___DECODER (interpreted as $decoder_abs)." if ! -x $decoder_abs; $___DECODER = $decoder_abs; - my $ref_abs = ensure_full_path($___DEV_E); # check if English dev set (reference translations) exist and store a list of all references my @references; @@ -409,9 +405,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) / # normalize initial LAMBDAs, too my $need_to_normalize = 1; - - - #store current directory and create the working directory (if needed) my $cwd = `pawd 2>/dev/null`; if(!$cwd){$cwd = `pwd`;} @@ -431,17 +424,16 @@ my $mert_logfile = "mert.log"; my $weights_in_file = "init.opt"; my $weights_out_file = "weights.txt"; - # set start run my $start_run = 1; my $bestpoint = undef; my $devbleu = undef; +my $sparse_weights_file = undef; my $prev_feature_file = undef; my $prev_score_file = undef; my $prev_init_file = undef; - if ($___FILTER_PHRASE_TABLE) { my $outdir = "filtered"; if (-e "$outdir/moses.ini") { @@ -471,7 +463,6 @@ else{ $___CONFIG_ORIG = $___CONFIG; } - # we run moses to check validity of moses.ini and to obtain all the feature # names my $featlist = get_featlist_from_moses($___CONFIG); @@ -579,28 +570,19 @@ if ($continue) { print STDERR "All needed data are available\n"; print STDERR "Loading information from last step ($step)\n"; - open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile"; - while () { - if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) { - $bestpoint = $1; - $devbleu = $2; - last; - } - } - close IN; + my %dummy; # sparse features + ($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy); die "Failed to parse mert.log, missed Best point there." 
if !defined $bestpoint || !defined $devbleu; print "($step) BEST at $step $bestpoint => $devbleu at ".`date`; - my @newweights = split /\s+/, $bestpoint; # Sanity check: order of lambdas must match sanity_check_order_of_lambdas($featlist, "gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |"); - + # update my cache of lambda values $featlist->{"values"} = \@newweights; - } else{ print STDERR "No previous data are needed\n"; @@ -630,10 +612,10 @@ while(1) { print "run $run start at ".`date`; # In case something dies later, we might wish to have a copy - create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--")); + create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file); - # skip if the user wanted + # skip running the decoder if the user wanted if (!$skip_decoder) { print "($run) run decoder to produce n-best lists\n"; $nbest_file = run_decoder($featlist, $run, $need_to_normalize); @@ -648,8 +630,6 @@ while(1) { $need_to_normalize = 0; } - - # extract score statistics and features from the nbest lists print STDERR "Scoring the nbestlist.\n"; @@ -740,7 +720,7 @@ while(1) { if ! 
-s $weights_out_file; - # backup copies + # backup copies safesystem ("\\cp -f extract.err run$run.extract.err") or die; safesystem ("\\cp -f extract.out run$run.extract.out") or die; if ($___PAIRWISE_RANKED_OPTIMIZER) { safesystem ("\\cp -f pro.data run$run.pro.data") or die; } @@ -751,34 +731,10 @@ while(1) { print "run $run end at ".`date`; - $bestpoint = undef; - $devbleu = undef; - if ($___PAIRWISE_RANKED_OPTIMIZER) { - open(IN,"run$run.$mert_outfile") or die "Can't open run$run.$mert_outfile"; - my (@WEIGHT,$sum); - foreach (@CURR) { push @WEIGHT, 0; } - while() { - if (/^F(\d+) ([\-\.\de]+)/) { - $WEIGHT[$1] = $2; - $sum += abs($2); - } - } - $devbleu = "unknown"; - foreach (@WEIGHT) { $_ /= $sum; } - $bestpoint = join(" ",@WEIGHT); - close IN; - } - else { - open(IN,"run$run.$mert_logfile") or die "Can't open run$run.$mert_logfile"; - while () { - if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) { - $bestpoint = $1; - $devbleu = $2; - last; - } - } - close IN; - } + my %sparse_weights; # sparse features + ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights); + + die "Failed to parse mert.log, missed Best point there." 
if !defined $bestpoint || !defined $devbleu; print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`; @@ -788,6 +744,15 @@ while(1) { $featlist->{"values"} = \@newweights; + if (scalar keys %sparse_weights) { + $sparse_weights_file = "run".($run+1).".sparse-weights"; + open(SPARSE,">".$sparse_weights_file); + foreach my $feature (keys %sparse_weights) { + print SPARSE "$feature $sparse_weights{$feature}\n"; + } + close(SPARSE); + } + ## additional stopping criterion: weights have not changed my $shouldstop = 1; for(my $i=0; $i<@CURR; $i++) { @@ -864,6 +829,43 @@ chdir($cwd); } # end of local scope +sub get_weights_from_mert { + my ($outfile,$logfile,$weight_count,$sparse_weights) = @_; + my ($bestpoint,$devbleu); + if ($___PAIRWISE_RANKED_OPTIMIZER) { + open(IN,$outfile) or die "Can't open $outfile"; + my (@WEIGHT,$sum); + for(my $i=0;$i<$weight_count;$i++) { push @WEIGHT, 0; } + while() { + # regular features + if (/^F(\d+) ([\-\.\de]+)/) { + $WEIGHT[$1] = $2; + $sum += abs($2); + } + # sparse features + elsif(/^(.+_.+) ([\-\.\de]+)/) { + $$sparse_weights{$1} = $2; + } + } + $devbleu = "unknown"; + foreach (@WEIGHT) { $_ /= $sum; } + $bestpoint = join(" ",@WEIGHT); + close IN; + } + else { + open(IN,$logfile) or die "Can't open $logfile"; + while () { + if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) { + $bestpoint = $1; + $devbleu = $2; + last; + } + } + close IN; + } + return ($bestpoint,$devbleu); +} + sub run_decoder { my ($featlist, $run, $need_to_normalize) = @_; my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out"; @@ -984,6 +986,7 @@ sub get_featlist_from_moses { $nr++; chomp; my ($longname, $feature, $value) = split / /; + next if $value eq "sparse"; push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n" if $value !~ /^[+-]?[0-9.e]+$/; push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n" @@ -1015,14 +1018,20 @@ sub get_order_of_scores_from_nbestlist { my @order = (); my $label = 
undef; + my $sparse = 0; # we ignore sparse features here foreach my $tok (split /\s+/, $scores) { - if ($tok =~ /^([a-z][0-9a-z]*):/i) { + if ($tok =~ /.+_.+:/) { + $sparse = 1; + } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) { $label = $1; } elsif ($tok =~ /^-?[-0-9.e]+$/) { - # a score found, remember it - die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!" - if !defined $label; - push @order, $label; + if (!$sparse) { + # a score found, remember it + die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!" + if !defined $label; + push @order, $label; + } + $sparse = 0; } else { die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'"; } @@ -1037,6 +1046,7 @@ sub create_config { my $featlist = shift; # the lambdas we should write my $iteration = shift; # just for verbosity my $bleu_achieved = shift; # just for verbosity + my $sparse_weights_file = shift; # only defined when optimizing sparse features my %P; # the hash of all parameters we wish to override @@ -1076,6 +1086,10 @@ sub create_config { push @{$P{$name}}, $val; } + if (defined($sparse_weights_file)) { + push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file; + } + # create new moses.ini decoder config file by cloning and overriding the original one open(INI,$infn) or die "Can't read $infn"; delete($P{"config"}); # never output diff --git a/scripts/training/phrase-extract/consolidate.cpp b/scripts/training/phrase-extract/consolidate.cpp index 53a141221..8d31a1d27 100644 --- a/scripts/training/phrase-extract/consolidate.cpp +++ b/scripts/training/phrase-extract/consolidate.cpp @@ -36,10 +36,15 @@ using namespace std; bool hierarchicalFlag = false; bool onlyDirectFlag = false; bool phraseCountFlag = true; +bool lowCountFlag = false; +bool goodTuringFlag = false; +bool kneserNeyFlag = false; bool logProbFlag = false; -char line[LINE_MAX_LENGTH]; +inline float maybeLogProb( float a ) { return 
logProbFlag ? log(a) : a; } -void processFiles( char*, char*, char* ); +char line[LINE_MAX_LENGTH]; +void processFiles( char*, char*, char*, char* ); +void loadCountOfCounts( char* ); bool getLine( istream &fileP, vector< string > &item ); vector< string > splitLine(); @@ -55,6 +60,7 @@ int main(int argc, char* argv[]) char* &fileNameDirect = argv[1]; char* &fileNameIndirect = argv[2]; char* &fileNameConsolidated = argv[3]; + char* fileNameCountOfCounts; for(int i=4; i countOfCounts; +vector< float > goodTuringDiscount; +float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1; +void loadCountOfCounts( char* fileNameCountOfCounts ) +{ + Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts); + if (fileCountOfCounts.fail()) { + cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl; + exit(1); + } + istream &fileP = fileCountOfCounts; + + countOfCounts.push_back(0.0); + while(1) { + if (fileP.eof()) break; + SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__); + if (fileP.eof()) break; + if (totalCount < 0) + totalCount = atof(line); // total number of distinct phrase pairs + else + countOfCounts.push_back( atof(line) ); + } + fileCountOfCounts.Close(); + + // compute Good Turing discounts + if (goodTuringFlag) { + goodTuringDiscount.push_back(0.01); // floor value + for( size_t i=1; i1) + goodTuringDiscount[i] = 1; + if (goodTuringDiscount[i] 0.9) kneserNey_D1 = 0.9; + if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9; + if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9; } -void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated ) +void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts ) { + if (goodTuringFlag || kneserNeyFlag) + loadCountOfCounts( fileNameCountOfCounts ); + // open input files Moses::InputFileStream fileDirect(fileNameDirect); Moses::InputFileStream fileIndirect(fileNameIndirect); @@ -134,29 +209,67 @@ 
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC // output hierarchical phrase pair (with separated labels) fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1]; - // probs - fileConsolidated << " ||| "; + // SCORES ... + fileConsolidated << " |||"; + vector directCounts = tokenize(itemDirect[4].c_str()); + vector indirectCounts = tokenize(itemIndirect[4].c_str()); + float countF = atof(directCounts[0].c_str()); + float countE = atof(indirectCounts[0].c_str()); + float countEF = atof(indirectCounts[1].c_str()); + float n1_F, n1_E; + if (kneserNeyFlag) { + n1_F = atof(directCounts[2].c_str()); + n1_E = atof(indirectCounts[2].c_str()); + } + + // Good Turing discounting + float adjustedCountEF = countEF; + if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1) + adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)]; + float adjustedCountEF_indirect = adjustedCountEF; + + // Kneser Ney discounting [Foster et al, 2006] + if (kneserNeyFlag) { + float D = kneserNey_D3; + if (countEF < 2) D = kneserNey_D1; + if (countEF < 3) D = kneserNey_D2; + if (D > countEF) D = countEF - 0.01; // sanity constraint + + float p_b_E = n1_E / totalCount; // target phrase prob based on distinct + float alpha_F = D * n1_F / countF; // available mass + adjustedCountEF = countEF - D + countF * alpha_F * p_b_E; + + // for indirect + float p_b_F = n1_F / totalCount; // target phrase prob based on distinct + float alpha_E = D * n1_E / countE; // available mass + adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F; + } + + // prob indirect if (!onlyDirectFlag) { - fileConsolidated << itemIndirect[2]; // prob indirect + fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE); + fileConsolidated << " " << itemIndirect[2]; } - fileConsolidated << " " << itemDirect[2]; // prob direct + + // prob direct + fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF); + fileConsolidated << " " << 
itemDirect[2]; + + // phrase count feature if (phraseCountFlag) { - fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature + fileConsolidated << " " << maybeLogProb(2.718); + } + + // low count feature + if (lowCountFlag) { + fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF)); } // alignment fileConsolidated << " ||| " << itemDirect[3]; // counts, for debugging - vector directCounts = tokenize(itemDirect[4].c_str()); - vector indirectCounts = tokenize(itemIndirect[4].c_str()); - fileConsolidated << "||| " << indirectCounts[0] << " " << directCounts[0]; - // output rule count if present in either file - if (directCounts.size() > 1) { - fileConsolidated << " " << directCounts[1]; - } else if (indirectCounts.size() > 1) { - fileConsolidated << " " << indirectCounts[1]; - } + fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF; fileConsolidated << endl; } @@ -165,6 +278,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC fileConsolidated.close(); } + bool getLine( istream &fileP, vector< string > &item ) { if (fileP.eof()) diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp index 15367f900..0b56f0c29 100644 --- a/scripts/training/phrase-extract/extract-rules.cpp +++ b/scripts/training/phrase-extract/extract-rules.cpp @@ -45,7 +45,7 @@ #include "tables-core.h" #include "XmlTree.h" -#define LINE_MAX_LENGTH 60000 +#define LINE_MAX_LENGTH 500000 using namespace std; diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp index 5a63b6345..0848723df 100644 --- a/scripts/training/phrase-extract/extract.cpp +++ b/scripts/training/phrase-extract/extract.cpp @@ -24,7 +24,7 @@ using namespace std; -#define LINE_MAX_LENGTH 60000 +#define LINE_MAX_LENGTH 500000 // HPhraseVertex represents a point in the alignment matrix typedef pair HPhraseVertex; diff --git 
a/scripts/training/phrase-extract/tables-core.cpp b/scripts/training/phrase-extract/tables-core.cpp index c8911f6a9..399026930 100644 --- a/scripts/training/phrase-extract/tables-core.cpp +++ b/scripts/training/phrase-extract/tables-core.cpp @@ -29,6 +29,10 @@ vector<string> tokenize( const char* input ) return token; } +bool isNonTerminal( const WORD &symbol ) { + return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]"; +} + WORD_ID Vocabulary::storeIfNew( const WORD& word ) { map<WORD, WORD_ID>::iterator i = lookup.find( word ); diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index d754aa885..ec0ecf79d 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -34,9 +34,10 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_ $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS, + $_ADDITIONAL_INI, $_DICTIONARY, $_EPPEX); -my $debug = 0; # debug this script, do not delete any files in debug mode +my $debug = 1; # debug this script, do not delete any files in debug mode # the following line is set installation time by 'make release'. BEWARE! my $BINDIR="/home/pkoehn/statmt/bin"; @@ -109,7 +110,7 @@ $_HELP = 1 'memscore:s' => \$_MEMSCORE, 'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES, 'dictionary=s' => \$_DICTIONARY, - 'eppex:s' => \$_EPPEX, + 'additional-ini=s' => \$_ADDITIONAL_INI ); if ($_HELP) { @@ -1372,11 +1373,28 @@ sub score_phrase { sub score_phrase_phrase_extract { my ($ttable_file,$lexical_file,$extract_file) = @_; + # remove consolidation options my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/); my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/); - my $CORE_SCORE_OPTIONS = defined($_SCORE_OPTIONS) ? 
$_SCORE_OPTIONS : ""; - $CORE_SCORE_OPTIONS =~ s/\-+OnlyDirect//i; - $CORE_SCORE_OPTIONS =~ s/\-+NoPhraseCount//i; + my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/); + my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/); + my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E); + if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) { + $UNALIGNED_FW_COUNT = 1; + $UNALIGNED_FW_F = $1; + $UNALIGNED_FW_E = $2; + } + my $GOOD_TURING = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /GoodTuring/); + my $KNESER_NEY = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /KneserNey/); + my $LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LogProb/); + my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/); + my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/); + my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef; + my $CORE_SCORE_OPTIONS = ""; + $CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB; + $CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB; + $CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX; + my $substep = 1; for my $direction ("f2e","e2f") { next if $___CONTINUE && -e "$ttable_file.half.$direction"; @@ -1405,6 +1423,11 @@ sub score_phrase_phrase_extract { my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT; + $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY; + $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq ""; + $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT; + $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? 
$UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT; + $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); print $cmd."\n"; safesystem($cmd) or die "ERROR: Scoring of phrases failed"; @@ -1423,8 +1446,13 @@ sub score_phrase_phrase_extract { return if $___CONTINUE && -e "$ttable_file.gz"; my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; + $cmd .= " --LogProb" if $LOG_PROB; + $cmd .= " --NegLogProb" if $NEG_LOG_PROB; $cmd .= " --OnlyDirect" if $ONLY_DIRECT; $cmd .= " --NoPhraseCount" unless $PHRASE_COUNT; + $cmd .= " --LowCountFeature" if $LOW_COUNT; + $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING; + $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY; safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed"; if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } if (! 
$___DONT_ZIP) { @@ -1681,6 +1709,13 @@ sub create_ini { [ttable-file]\n"; my $num_of_ttables = 0; my @SPECIFIED_TABLE = @_PHRASE_TABLE; + my $basic_weight_count = 4; # both directions, lex and phrase + $basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/; + $basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del + $basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/; + $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/; + $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature + $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) { $num_of_ttables++; my $ff = $f; @@ -1688,10 +1723,6 @@ sub create_ini { my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz"; $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE); my $phrase_table_impl = ($_HIERARCHICAL ? 
6 : 0); - my $basic_weight_count = 4; # both directions, lex and phrase - $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/; - $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/; - $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature print INI "$phrase_table_impl $ff $basic_weight_count $file\n"; } if ($_GLUE_GRAMMAR) { @@ -1783,10 +1814,6 @@ sub create_ini { print INI "\n\n# translation model weights\n[weight-t]\n"; foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) { - my $basic_weight_count = 4; # both directions, lex and phrase - $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/; - $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/; - $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature for(1..$basic_weight_count) { printf INI "%.2f\n", 1/$basic_weight_count; } @@ -1826,6 +1853,11 @@ sub create_ini { print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n" } + if ($_ADDITIONAL_INI) { + print INI "\n# additional settings\n\n"; + foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; } + } + close(INI); } -- cgit v1.2.3