github.com/moses-smt/mosesdecoder.git
author     Eva Hasler <ehasler@saxnot.inf.ed.ac.uk>   2012-05-10 21:54:24 +0400
committer  Eva Hasler <ehasler@saxnot.inf.ed.ac.uk>   2012-05-10 21:54:24 +0400
commit     30deedde9f1ce5ef22a5f3c4a6b660bac48989cd (patch)
tree       239051c1b15325a9f53a358c781293801721e3aa /mira
parent     93bb01b2e408a1fbdbbdfec8f4b89ade561759d7 (diff)
changed permission, everything changed..
Diffstat (limited to 'mira')
-rwxr-xr-x [-rw-r--r--]  mira/Decoder.cpp                       3
-rwxr-xr-x [-rw-r--r--]  mira/Decoder.h                         0
-rwxr-xr-x [-rw-r--r--]  mira/Hildreth.cpp                      0
-rwxr-xr-x [-rw-r--r--]  mira/Hildreth.h                        0
-rwxr-xr-x [-rw-r--r--]  mira/HildrethTest.cpp                  0
-rwxr-xr-x [-rw-r--r--]  mira/Jamfile                           0
-rwxr-xr-x [-rw-r--r--]  mira/Main.cpp                        730
-rwxr-xr-x [-rw-r--r--]  mira/Main.h                            0
-rwxr-xr-x [-rw-r--r--]  mira/Makefile.am                       0
-rwxr-xr-x [-rw-r--r--]  mira/MiraOptimiser.cpp                 0
-rwxr-xr-x [-rw-r--r--]  mira/MiraTest.cpp                      0
-rwxr-xr-x [-rw-r--r--]  mira/Optimiser.h                       0
-rwxr-xr-x [-rw-r--r--]  mira/Perceptron.cpp                    0
-rwxr-xr-x [-rw-r--r--]  mira/expt.cfg                          0
-rwxr-xr-x [-rw-r--r--]  mira/mira.xcodeproj/project.pbxproj    0
15 files changed, 428 insertions, 305 deletions
diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp
index 3fe670a1c..f6912e4b9 100644..100755
--- a/mira/Decoder.cpp
+++ b/mira/Decoder.cpp
@@ -76,7 +76,7 @@ namespace Mira {
delete[] mosesargv;
const StaticData &staticData = StaticData::Instance();
- m_bleuScoreFeature = staticData.GetBleuScoreFeature();
+ m_bleuScoreFeature = staticData.GetBleuScoreFeature();
}
void MosesDecoder::cleanup(bool chartDecoding) {
@@ -335,6 +335,7 @@ namespace Mira {
m_sentence->Read(in,inputFactorOrder);
// set weight of BleuScoreFeature
+ //cerr << "Reload Bleu feature weight: " << bleuObjectiveWeight*bleuScoreWeight << " (" << bleuObjectiveWeight << "*" << bleuScoreWeight << ")" << endl;
staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight);
m_bleuScoreFeature->SetCurrSourceLength((*m_sentence).GetSize());
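For context, the parameter reloaded here is simply the product spelled out in the (commented-out) trace line: with, say, a Bleu objective weight of 1.0 and a per-pass score weight of 2.0, the decoder's Bleu feature weight becomes 1.0 * 2.0 = 2.0, recomputed before every input sentence.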
diff --git a/mira/Decoder.h b/mira/Decoder.h
index be9ed2f9a..be9ed2f9a 100644..100755
--- a/mira/Decoder.h
+++ b/mira/Decoder.h
diff --git a/mira/Hildreth.cpp b/mira/Hildreth.cpp
index 53d1e0881..53d1e0881 100644..100755
--- a/mira/Hildreth.cpp
+++ b/mira/Hildreth.cpp
diff --git a/mira/Hildreth.h b/mira/Hildreth.h
index 699115242..699115242 100644..100755
--- a/mira/Hildreth.h
+++ b/mira/Hildreth.h
diff --git a/mira/HildrethTest.cpp b/mira/HildrethTest.cpp
index a32dcd1d3..a32dcd1d3 100644..100755
--- a/mira/HildrethTest.cpp
+++ b/mira/HildrethTest.cpp
diff --git a/mira/Jamfile b/mira/Jamfile
index cede96233..cede96233 100644..100755
--- a/mira/Jamfile
+++ b/mira/Jamfile
diff --git a/mira/Main.cpp b/mira/Main.cpp
index 498534b63..dabbcd988 100644..100755
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -51,196 +51,206 @@ using namespace Moses;
namespace po = boost::program_options;
int main(int argc, char** argv) {
- size_t rank = 0;
- size_t size = 1;
+ size_t rank = 0;
+ size_t size = 1;
#ifdef MPI_ENABLE
- mpi::environment env(argc,argv);
- mpi::communicator world;
- rank = world.rank();
- size = world.size();
+ mpi::environment env(argc,argv);
+ mpi::communicator world;
+ rank = world.rank();
+ size = world.size();
#endif
-
- bool help;
- int verbosity;
- string mosesConfigFile;
- string inputFile;
- vector<string> referenceFiles;
- vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
- string coreWeightFile, startWeightFile;
- size_t epochs;
- string learner;
- bool shuffle;
- size_t mixingFrequency;
- size_t weightDumpFrequency;
- string weightDumpStem;
- bool scale_margin, scale_margin_precision;
- bool scale_update, scale_update_precision;
- size_t n;
- size_t batchSize;
- bool distinctNbest;
- bool accumulateWeights;
- float historySmoothing;
- bool scaleByInputLength, scaleByAvgInputLength;
- bool scaleByInverseLength, scaleByAvgInverseLength;
- float scaleByX;
- float slack;
- bool averageWeights;
- bool weightConvergence;
- float learning_rate;
- float mira_learning_rate;
- float perceptron_learning_rate;
- string decoder_settings;
- float min_weight_change;
- bool normaliseWeights, normaliseMargin;
- bool print_feature_values;
- bool historyBleu ;
- bool sentenceBleu;
- float bleuWeight, bleuWeight_hope, bleuWeight_fear;
- bool perceptron_update;
- bool hope_fear, hope_model;
- bool model_hope_fear, rank_only;
- int hope_n, fear_n, rank_n;
- size_t bleu_smoothing_scheme;
- float min_oracle_bleu;
- float minBleuRatio, maxBleuRatio;
- bool boost;
- bool decode_hope, decode_fear, decode_model;
- string decode_filename;
- bool batchEqualsShard;
- bool sparseAverage, dumpMixedWeights, sparseNoAverage;
- int featureCutoff;
- bool pruneZeroWeights;
- bool megam;
- bool printFeatureCounts, printNbestWithFeatures;
- bool avgRefLength;
- bool print_weights, print_core_weights, clear_static, debug_model, scale_lm, bleu_weight_lm, bleu_weight_lm_adjust, scale_wp;
- float scale_lm_factor, bleu_weight_lm_factor, scale_wp_factor;
- bool sample;
- string moses_src;
- bool external_score = false, scale_all, dummy;
- float sigmoidParam, scale_all_factor;
- po::options_description desc("Allowed options");
- desc.add_options()
- ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
- ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
- ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
- ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
- ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
- ("bw", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the Bleu objective weight)")
- ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1), "Bleu score weight used in the decoder objective function for hope translations")
- ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1), "Bleu score weight used in the decoder objective function for fear translations")
- ("blm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
- ("blm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
- ("blm-adjust", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
- ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
- ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
- ("clear-static", po::value<bool>(&clear_static)->default_value(false), "Clear static data before every translation")
- ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
- ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
- ("core-weights", po::value<string>(&coreWeightFile)->default_value(""), "Weight file containing the core weights (already tuned, have to be non-zero)")
- ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
- ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
- ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
- ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
- ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
- ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
- ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
- ("dummy", po::value<bool>(&dummy)->default_value(false), "****")
- ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
- ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
- ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
- ("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
- ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
- ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
- ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
- ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
- ("hope-model", po::value<bool>(&hope_model)->default_value(false), "Use only hope and model translations for optimisation (use --fear-n to set number of model translations)")
- ("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
- ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
- ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
- ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
- ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
- ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
- ("megam", po::value<bool>(&megam)->default_value(false), "Use megam for optimization step")
- ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
- ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
- ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
- ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
- ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
- ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
- ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
- ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
- ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
- ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
- ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
- ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
- ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
- ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
- ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(false), "Print out current core weights")
- ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
- ("rank-n", po::value<int>(&rank_n)->default_value(-1), "Number of translations used for ranking")
- ("rank-only", po::value<bool>(&rank_only)->default_value(false), "Use only model translations for optimisation")
- ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
- ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
- ("sample", po::value<bool>(&sample)->default_value(false), "Sample a translation pair from hope/(model)/fear translations")
- ("scale-all", po::value<bool>(&scale_all)->default_value(false), "Scale all core features")
- ("scale-all-factor", po::value<float>(&scale_all_factor)->default_value(2), "Scaling factor for all core features")
- ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale the BLEU score by (a history of) the inverse input length")
- ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by (a history of) the input length")
- ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by an average of the input length")
- ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by an average of the inverse input length")
- ("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
- ("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
- ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
- ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
- ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
- ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
- ("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale the margin by the precision of the oracle translation")
- ("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale the update by the Bleu score of the oracle translation")
- ("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale the update by the precision of the oracle translation")
- ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
- ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
- ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
- ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
- ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
- ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
- ("start-weights", po::value<string>(&startWeightFile)->default_value(""), "Weight file containing the start weights (already tuned, have to be non-zero)")
- ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
- ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
- ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
- ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
-
- po::options_description cmdline_options;
- cmdline_options.add(desc);
- po::variables_map vm;
- po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
- po::notify(vm);
-
- if (help) {
- std::cout << "Usage: " + string(argv[0])
- + " -f mosesini-file -i input-file -r reference-file(s) [options]"
- << std::endl;
- std::cout << desc << std::endl;
- return 0;
- }
-
- const StaticData &staticData = StaticData::Instance();
-
- // create threadpool, if using multi-threaded decoding
- // note: multi-threading is done on sentence-level,
- // each thread translates one sentence
-/*#ifdef WITH_THREADS
- if (threadcount < 1) {
- cerr << "Error: Need to specify a positive number of threads" << endl;
- exit(1);
- }
- ThreadPool pool(threadcount);
-#else
- if (threadcount > 1) {
- cerr << "Error: Thread count of " << threadcount << " but moses not built with thread support" << endl;
- exit(1);
+
+ bool help;
+ int verbosity;
+ string mosesConfigFile;
+ string inputFile;
+ vector<string> referenceFiles;
+ vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
+ string coreWeightFile, startWeightFile;
+ size_t epochs;
+ string learner;
+ bool shuffle;
+ size_t mixingFrequency;
+ size_t weightDumpFrequency;
+ string weightDumpStem;
+ bool scale_margin, scale_margin_precision;
+ bool scale_update, scale_update_precision;
+ size_t n;
+ size_t batchSize;
+ bool distinctNbest;
+ bool accumulateWeights;
+ float historySmoothing;
+ bool scaleByInputLength, scaleByAvgInputLength;
+ bool scaleByInverseLength, scaleByAvgInverseLength;
+ float scaleByX;
+ float slack;
+ bool averageWeights;
+ bool weightConvergence;
+ float learning_rate;
+ float mira_learning_rate;
+ float perceptron_learning_rate;
+ string decoder_settings;
+ float min_weight_change;
+ bool normaliseWeights, normaliseMargin;
+ bool print_feature_values;
+ bool historyBleu ;
+ bool sentenceBleu;
+ bool perceptron_update;
+ bool hope_fear, hope_model;
+ bool model_hope_fear, rank_only;
+ int hope_n, fear_n, rank_n;
+ size_t bleu_smoothing_scheme;
+ float min_oracle_bleu;
+ float minBleuRatio, maxBleuRatio;
+ bool boost;
+ bool decode_hope, decode_fear, decode_model;
+ string decode_filename;
+ bool batchEqualsShard;
+ bool sparseAverage, dumpMixedWeights, sparseNoAverage;
+ int featureCutoff;
+ bool pruneZeroWeights;
+ bool megam;
+ bool printFeatureCounts, printNbestWithFeatures;
+ bool avgRefLength;
+ bool print_weights, print_core_weights, clear_static, debug_model, scale_lm, scale_wp;
+ float scale_lm_factor, scale_wp_factor;
+ bool sample;
+ string moses_src;
+ bool external_score = false;
+ bool most_violated;
+ float sigmoidParam;
+ float bleuWeight, bleuWeight_hope, bleuWeight_fear;
+ bool bleu_weight_lm, bleu_weight_lm_adjust;
+ float bleu_weight_lm_factor;
+ bool scale_all;
+ float scale_all_factor;
+ bool l1_regularize, l2_regularize;
+ float l1_lambda, l2_lambda;
+ po::options_description desc("Allowed options");
+ desc.add_options()
+ ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
+ ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
+ ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
+
+ ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
+ ("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
+ ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
+
+ ("scale-all", po::value<bool>(&scale_all)->default_value(false), "Scale all core features")
+ ("scaling-factor", po::value<float>(&scale_all_factor)->default_value(2), "Scaling factor for all core features")
+
+ ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
+ ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
+ ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
+ ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
+ ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
+ ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
+ ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
+ ("clear-static", po::value<bool>(&clear_static)->default_value(false), "Clear static data before every translation")
+ ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
+ ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
+ ("core-weights", po::value<string>(&coreWeightFile)->default_value(""), "Weight file containing the core weights (already tuned, have to be non-zero)")
+ ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
+ ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
+ ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
+ ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
+ ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
+ ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
+ ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
+ ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
+ ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
+ ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
+ ("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
+ ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
+ ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
+ ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
+ ("hope-model", po::value<bool>(&hope_model)->default_value(false), "Use only hope and model translations for optimisation (use --fear-n to set number of model translations)")
+ ("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
+ ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
+ ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
+ ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
+ ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.001), "Lambda for l1-regularization (w_i +/- lambda)")
+ ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.1), "Lambda for l2-regularization (w_i * (1 - lambda))")
+ ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
+ ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
+ ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
+ ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
+ ("megam", po::value<bool>(&megam)->default_value(false), "Use megam for optimization step")
+ ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
+ ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
+ ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
+ ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
+ ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
+ ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
+ ("most-violated", po::value<bool>(&most_violated)->default_value(false), "Pick hypotheses according to constraint violation")
+ ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
+ ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
+ ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
+ ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
+ ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
+ ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
+ ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
+ ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
+ ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
+ ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
+ ("rank-n", po::value<int>(&rank_n)->default_value(-1), "Number of translations used for ranking")
+ ("rank-only", po::value<bool>(&rank_only)->default_value(false), "Use only model translations for optimisation")
+ ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
+ ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
+ ("sample", po::value<bool>(&sample)->default_value(false), "Sample a translation pair from hope/(model)/fear translations")
+ ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
+ ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale BLEU by (history of) input length")
+ ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
+ ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
+ ("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
+ ("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
+ ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
+ ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
+ ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
+ ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
+ ("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle")
+ ("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale update by Bleu score of oracle")
+ ("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale update by precision of oracle")
+ ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
+ ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
+ ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
+ ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
+ ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
+ ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
+ ("start-weights", po::value<string>(&startWeightFile)->default_value(""), "Weight file containing start weights")
+ ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
+ ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
+ ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
+ ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
+
+ po::options_description cmdline_options;
+ cmdline_options.add(desc);
+ po::variables_map vm;
+ po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
+ po::notify(vm);
+
+ if (help) {
+ std::cout << "Usage: " + string(argv[0])
+ + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
+ std::cout << desc << std::endl;
+ return 0;
}
-#endif*/
+
+ const StaticData &staticData = StaticData::Instance();
+
+ cerr << "scale-all: " << scale_all << endl;
+ cerr << "scale-all-factor: " << scale_all_factor << endl;
+ cerr << "bleu weight: " << bleuWeight << endl;
+ cerr << "bleu weight hope: " << bleuWeight_hope << endl;
+ cerr << "bleu weight fear: " << bleuWeight_fear << endl;;
+ cerr << "bleu weight depends on lm: " << bleu_weight_lm << endl;
+ cerr << "by this factor: " << bleu_weight_lm_factor << endl;
+ cerr << "adjust dynamically: " << bleu_weight_lm_adjust << endl;
+ cerr << "l1-reg: " << l1_regularize << endl;
+ cerr << "l1-lambda: " << l1_lambda << endl;
+ cerr << "l2-reg: " << l2_regularize << endl;
+ cerr << "l2-lambda: " << l2_lambda << endl;
bool trainWithMultipleFolds = false;
if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) {
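Since each of these flags is declared as po::value<bool> rather than bool_switch, it takes an explicit token on the command line ("--boost 1", "--l1-reg true", etc.). A typical invocation of the tuner with the new options might look like this (binary name and file paths are hypothetical):

    ./mira --config moses.ini --input-file dev.src --reference-files dev.ref \
           --learner mira --epochs 10 --sample true --most-violated true \
           --l1-reg true --l1-lambda 0.001 --l2-reg true --l2-lambda 0.1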
@@ -403,28 +413,28 @@ int main(int argc, char** argv) {
// Optionally shuffle the sentences
vector<size_t> order;
if (trainWithMultipleFolds) {
- for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
- order.push_back(i);
- }
-
- if (shuffle) {
- cerr << "Shuffling input sentences.." << endl;
- RandomIndex rindex;
- random_shuffle(order.begin(), order.end(), rindex);
- }
+ for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
+ order.push_back(i);
+ }
+
+ /*if (shuffle) {
+ cerr << "Shuffling input sentences.." << endl;
+ RandomIndex rindex;
+ random_shuffle(order.begin(), order.end(), rindex);
+ }*/
}
else {
- if (rank == 0) {
- for (size_t i = 0; i < inputSentences.size(); ++i) {
- order.push_back(i);
- }
-
- if (shuffle) {
- cerr << "Shuffling input sentences.." << endl;
- RandomIndex rindex;
- random_shuffle(order.begin(), order.end(), rindex);
- }
- }
+ if (rank == 0) {
+ for (size_t i = 0; i < inputSentences.size(); ++i) {
+ order.push_back(i);
+ }
+
+ /*if (shuffle) {
+ cerr << "Shuffling input sentences.." << endl;
+ RandomIndex rindex;
+ random_shuffle(order.begin(), order.end(), rindex);
+ }*/
+ }
}
// initialise optimizer
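The shuffling here is commented out because it moves into the epoch loop further down (see the hunk below). For reference, std::random_shuffle's third argument must be callable with an upper bound n and return a value in [0, n); RandomIndex is presumably such a functor. A minimal sketch of one, assuming nothing beyond the C standard library:

    #include <cstdlib>
    #include <cstddef>

    // Sketch: RNG functor usable as the third argument of std::random_shuffle.
    struct RandomIndex {
      std::ptrdiff_t operator()(std::ptrdiff_t max) const {
        return static_cast<std::ptrdiff_t>(std::rand() % max);  // value in [0, max)
      }
    };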
@@ -567,11 +577,6 @@ int main(int argc, char** argv) {
}
decoder->setWeights(initialWeights);
- if (dummy == true) {
- scale_all = true;
- scale_all_factor = 2;
- }
-
if (scale_all) {
cerr << "Scale all core features by factor " << scale_all_factor << endl;
scale_lm = true;
@@ -587,6 +592,7 @@ int main(int argc, char** argv) {
for (LMList::const_iterator i = lmList.begin(); i != lmList.end(); ++i)
lmSum += abs(initialWeights.GetScoreForProducer(*i));
bleuWeight = lmSum * bleu_weight_lm_factor;
+ cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
}
if (bleuWeight_hope == -1) {
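As a worked example of the weight tying above: with two language models whose (absolute) core weights are 0.5 and 0.3, lmSum = 0.8, and with the default bleu_weight_lm_factor of 2.0 the Bleu weight becomes 0.8 * 2.0 = 1.6.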
@@ -633,6 +639,52 @@ int main(int argc, char** argv) {
// int sumStillViolatedConstraints;
float epsilon = 0.0001;
for (size_t epoch = 0; epoch < epochs && !stop; ++epoch) {
+ if (shuffle) {
+ if (trainWithMultipleFolds || rank == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
+ RandomIndex rindex;
+ random_shuffle(order.begin(), order.end(), rindex);
+ }
+
+#ifdef MPI_ENABLE
+ if (!trainWithMultipleFolds)
+ mpi::broadcast(world, order, 0);
+#endif
+
+ // redo shards
+ vector<size_t> shard;
+ if (trainWithMultipleFolds) {
+ float shardSize = (float) (order.size())/coresPerFold;
+ size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
+ size_t shardEnd = shardStart + shardSize;
+ if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
+ shardEnd = order.size();
+ shardSize = shardEnd - shardStart;
+ }
+ VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
+ VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
+ shard.resize(shardSize);
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
+ batchSize = 1;
+ }
+ else {
+ float shardSize = (float) (order.size()) / size;
+ size_t shardStart = (size_t) (shardSize * rank);
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
+ if (rank == size - 1) {
+ shardEnd = order.size();
+ shardSize = shardEnd - shardStart;
+ }
+ VERBOSE(1, "Shard size: " << shardSize << endl);
+ VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
+ shard.resize(shardSize);
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
+ if (batchEqualsShard)
+ batchSize = shardSize;
+ }
+ }
+
+
// sum of violated constraints in an epoch
// sumStillViolatedConstraints = 0;
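The re-sharding above splits the freshly shuffled sentence order evenly across processes, with the last rank absorbing any remainder. A self-contained sketch of the same arithmetic (function name is ours, not the tree's):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sketch: compute one rank's shard of the (shuffled) sentence order.
    std::vector<size_t> computeShard(const std::vector<size_t>& order,
                                     size_t rank, size_t size) {
      float shardSize = static_cast<float>(order.size()) / size;
      size_t shardStart = static_cast<size_t>(shardSize * rank);
      size_t shardEnd = static_cast<size_t>(shardSize * (rank + 1));
      if (rank == size - 1)
        shardEnd = order.size();  // last rank takes the remainder
      std::vector<size_t> shard(shardEnd - shardStart);
      std::copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
      return shard;
    }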
@@ -827,6 +879,8 @@ int main(int argc, char** argv) {
int oracleSize = (int)oracle.size();
cerr << endl;
+ //exit(0);
+
// count sparse features occurring in hope translation
featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();
@@ -1116,10 +1170,10 @@ int main(int argc, char** argv) {
examples_in_batch++;
if (sample) {
- float bleuBest = -1000;
- float bleuWorst = 1000;
- size_t indexBest = -1;
- size_t indexWorst = -1;
+ float bleuHope = -1000;
+ float bleuFear = 1000;
+ size_t indexHope = -1;
+ size_t indexFear = -1;
cerr << "Rank " << rank << ", epoch " << epoch << ", external score? " << external_score << endl;
if (external_score) {
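This hunk is a pure rename: "best/worst" become the MIRA terms "hope/fear". Hope is the highest-Bleu hypothesis and fear the lowest-Bleu one, with near-equal Bleu ties broken towards the higher model score in both cases. A condensed sketch of the selection that the loops below implement, assuming parallel Bleu/model-score vectors:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Sketch: pick hope (highest Bleu) and fear (lowest Bleu) from an n-best
    // list, breaking near-equal Bleu ties towards the higher model score.
    // Assumes a non-empty list.
    void pickHopeFear(const std::vector<float>& bleu,
                      const std::vector<float>& model,
                      size_t& indexHope, size_t& indexFear,
                      float epsilon = 0.0001f) {
      indexHope = indexFear = 0;
      for (size_t i = 1; i < bleu.size(); ++i) {
        if (bleu[i] > bleu[indexHope] + epsilon ||
            (std::fabs(bleu[i] - bleu[indexHope]) < epsilon && model[i] > model[indexHope]))
          indexHope = i;
        if (bleu[i] < bleu[indexFear] - epsilon ||
            (std::fabs(bleu[i] - bleu[indexFear]) < epsilon && model[i] > model[indexFear]))
          indexFear = i;
      }
    }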
@@ -1145,104 +1199,155 @@ int main(int argc, char** argv) {
vector<float> bleuScoresNbest = BleuScorer::ScoreNbestList(scoreDataFile.str(), featureDataFile.str());
for (size_t i=0; i < bleuScoresNbest.size(); ++i) {
//cerr << "bleu: " << bleuScoresNbest[i]*current_input_length << endl;
- if (abs(bleuScoresNbest[i] - bleuBest) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexBest]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexBest]) > epsilon) {
- bleuBest = bleuScoresNbest[i];
- indexBest = i;
+ if (abs(bleuScoresNbest[i] - bleuHope) < epsilon) { // equal bleu scores
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
+ bleuHope = bleuScoresNbest[i];
+ indexHope = i;
}
}
}
- else if (bleuScoresNbest[i] > bleuBest) { // greater than current best
- bleuBest = bleuScoresNbest[i];
- indexBest = i;
+ else if (bleuScoresNbest[i] > bleuHope) { // greater than current best
+ bleuHope = bleuScoresNbest[i];
+ indexHope = i;
}
- if (abs(bleuScoresNbest[i] - bleuWorst) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexWorst]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexWorst]) > epsilon) {
- bleuWorst = bleuScoresNbest[i];
- indexWorst = i;
+ if (abs(bleuScoresNbest[i] - bleuFear) < epsilon) { // equal bleu scores
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexFear]) {
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexFear]) > epsilon) {
+ bleuFear = bleuScoresNbest[i];
+ indexFear = i;
}
}
}
- else if (bleuScoresNbest[i] < bleuWorst) { // worse than current worst
- bleuWorst = bleuScoresNbest[i];
- indexWorst = i;
+ else if (bleuScoresNbest[i] < bleuFear) { // worse than current worst
+ bleuFear = bleuScoresNbest[i];
+ indexFear = i;
}
}
}
else {
cerr << "Rank " << rank << ", epoch " << epoch << ", use dynamic score." << endl;
- // use dynamically calculated scores to find best and worst
- for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
- //cerr << "bleu: " << bleuScores[batchPosition][i] << endl;
- if (abs(bleuScores[batchPosition][i] - bleuBest) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexBest]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexBest]) > epsilon) {
- bleuBest = bleuScores[batchPosition][i];
- indexBest = i;
+ if (most_violated) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
+ // find hypothesis pair with most strongly violated constraint
+ float currentViolation = 0;
+ float currentBleuDiff, currentModelDiff;
+ //float minBleuDiff = 0.5;
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ for (size_t j=i+1; j<bleuScores[batchPosition].size(); ++j) {
+ //if (abs(bleuScores[batchPosition][i] - bleuScores[batchPosition][j]) < minBleuDiff)
+ // continue;
+
+ size_t iHope, iFear;
+ if (bleuScores[batchPosition][i] >= bleuScores[batchPosition][j]) {
+ if (abs(bleuScores[batchPosition][i] - bleuScores[batchPosition][j]) > epsilon) {
+ // hope/fear
+ iHope = i;
+ iFear = j;
+ }
+ else
+ continue; // equal Bleu scores
+ }
+ else {
+ // fear/hope
+ iFear = i;
+ iHope = j;
+ }
+
+ float bleuDiff = bleuScores[batchPosition][iHope] - bleuScores[batchPosition][iFear];
+ float modelDiff = modelScores[batchPosition][iHope] - modelScores[batchPosition][iFear];
+ if (bleuDiff > modelDiff) {
+ float diff = bleuDiff - modelDiff;
+ if (diff > epsilon && diff > currentViolation) {
+ currentViolation = diff;
+ currentBleuDiff = bleuDiff;
+ currentModelDiff = modelDiff;
+ indexHope = iHope;
+ indexFear = iFear;
+ bleuHope = bleuScores[batchPosition][iHope];
+ bleuFear = bleuScores[batchPosition][iFear];
+ }
}
}
}
- else if (bleuScores[batchPosition][i] > bleuBest) { // greater than current best
- bleuBest = bleuScores[batchPosition][i];
- indexBest = i;
- }
-
- if (abs(bleuScores[batchPosition][i] - bleuWorst) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexWorst]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexWorst]) > epsilon) {
- bleuWorst = bleuScores[batchPosition][i];
- indexWorst = i;
+ }
+ else {
+ // use dynamically calculated scores to find best and worst
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ //cerr << "bleu: " << bleuScores[batchPosition][i] << endl;
+ if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
+ bleuHope = bleuScores[batchPosition][i];
+ indexHope = i;
+ }
+ }
+ }
+ else if (bleuScores[batchPosition][i] > bleuHope) { // greater than current best
+ bleuHope = bleuScores[batchPosition][i];
+ indexHope = i;
+ }
+
+ if (abs(bleuScores[batchPosition][i] - bleuFear) < epsilon) { // equal bleu scores
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexFear]) {
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexFear]) > epsilon) {
+ bleuFear = bleuScores[batchPosition][i];
+ indexFear = i;
+ }
}
}
+ else if (bleuScores[batchPosition][i] < bleuFear) { // worse than current worst
+ bleuFear = bleuScores[batchPosition][i];
+ indexFear = i;
+ }
}
- else if (bleuScores[batchPosition][i] < bleuWorst) { // worse than current worst
- bleuWorst = bleuScores[batchPosition][i];
- indexWorst = i;
- }
}
}
- if ((external_score && (bleuBest*current_input_length <= bleuWorst*current_input_length)) || (bleuBest <= bleuWorst)) {
- if (external_score) {
- if (abs(bleuBest*current_input_length - bleuWorst*current_input_length) < epsilon) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: HOPE and FEAR have equal Bleu." << endl;
+ if (bleuHope == -1000 || bleuFear == 1000) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no appropriate hypotheses found.." << endl;
+ }
+ else {
+ if ((external_score && (bleuHope*current_input_length <= bleuFear*current_input_length)) || (bleuHope <= bleuFear)) {
+ if (external_score) {
+ if (abs(bleuHope*current_input_length - bleuFear*current_input_length) < epsilon) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: HOPE and FEAR have equal Bleu." << endl;
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR has better Bleu than HOPE." << endl;
+ }
}
else {
- cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR has better Bleu than HOPE." << endl;
+ if (abs(bleuHope - bleuFear) < epsilon) {
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", WARNING: HOPE and FEAR have equal Bleu." << endl;
+ }
+ else {
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", ERROR: FEAR has better Bleu than HOPE." << endl;
+ }
}
}
else {
- if (abs(bleuBest - bleuWorst) < epsilon) {
- cerr << "\nRank " << rank << ", epoch " << epoch << ", WARNING: HOPE and FEAR have equal Bleu." << endl;
+ if (external_score) {
+ // use actual sentence bleu (not dynamically computed)
+ bleuScoresHopeSample[batchPosition].push_back(bleuHope*current_input_length);
+ bleuScoresFearSample[batchPosition].push_back(bleuFear*current_input_length);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", Best: " << bleuHope*current_input_length << " (" << indexHope << ")" << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", Worst: " << bleuFear*current_input_length << " (" << indexFear << ")" << endl;
}
else {
- cerr << "\nRank " << rank << ", epoch " << epoch << ", ERROR: FEAR has better Bleu than HOPE." << endl;
+ bleuScoresHopeSample[batchPosition].push_back(bleuHope);
+ bleuScoresFearSample[batchPosition].push_back(bleuFear);
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", Best: " << bleuHope << " (" << indexHope << ")" << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", Worst: " << bleuFear << " (" << indexFear << ")" << endl;
}
- }
+
+ featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][indexHope]);
+ featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][indexFear]);
+ modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][indexHope]);
+ modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][indexFear]);
+ }
}
- else {
- if (external_score) {
- // use actual sentence bleu (not dynamically computed)
- bleuScoresHopeSample[batchPosition].push_back(bleuBest*current_input_length);
- bleuScoresFearSample[batchPosition].push_back(bleuWorst*current_input_length);
- cerr << "Rank " << rank << ", epoch " << epoch << ", Best: " << bleuBest*current_input_length << " (" << indexBest << ")" << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", Worst: " << bleuWorst*current_input_length << " (" << indexWorst << ")" << endl;
- }
- else {
- bleuScoresHopeSample[batchPosition].push_back(bleuBest);
- bleuScoresFearSample[batchPosition].push_back(bleuWorst);
- cerr << "\nRank " << rank << ", epoch " << epoch << ", Best: " << bleuBest << " (" << indexBest << ")" << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", Worst: " << bleuWorst << " (" << indexWorst << ")" << endl;
- }
-
- featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][indexBest]);
- featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][indexWorst]);
- modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][indexBest]);
- modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][indexWorst]);
- }
}
}
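The --most-violated branch added above scans all hypothesis pairs and keeps the one whose margin constraint (model-score difference should be at least the Bleu difference) is violated the most. A condensed, self-contained sketch of that selection (function name is ours):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Sketch: pick the hope/fear pair maximising the constraint violation
    //   violation = (bleu[hope] - bleu[fear]) - (model[hope] - model[fear]).
    // Returns false if no pair violates its constraint.
    bool pickMostViolated(const std::vector<float>& bleu,
                          const std::vector<float>& model,
                          size_t& indexHope, size_t& indexFear,
                          float epsilon = 0.0001f) {
      float currentViolation = 0;
      bool found = false;
      for (size_t i = 0; i < bleu.size(); ++i) {
        for (size_t j = i + 1; j < bleu.size(); ++j) {
          if (std::fabs(bleu[i] - bleu[j]) <= epsilon)
            continue;                              // equal Bleu: no hope/fear pair
          size_t h = (bleu[i] > bleu[j]) ? i : j;  // hope has the higher Bleu
          size_t f = (h == i) ? j : i;
          float diff = (bleu[h] - bleu[f]) - (model[h] - model[f]);
          if (diff > epsilon && diff > currentViolation) {
            currentViolation = diff;
            indexHope = h;
            indexFear = f;
            found = true;
          }
        }
      }
      return found;
    }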
@@ -1595,8 +1700,14 @@ int main(int argc, char** argv) {
cerr << "Rank " << rank << ", epoch " << epoch << ", applying update.." << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;
mosesWeights.PlusEquals(weightUpdate);
+ if (l2_regularize) {
+ mixedWeights.L2Regularize(l2_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
+ }
+
if (normaliseWeights)
- mosesWeights.L1Normalise();
+ mosesWeights.L1Normalise();
cumulativeWeights.PlusEquals(mosesWeights);
if (sparseAverage) {
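Per the --l2-lambda help string, L2Regularize shrinks every weight multiplicatively towards zero. A sketch of that semantics on a plain weight vector (the real method lives on ScoreComponentCollection):

    #include <cstddef>
    #include <vector>

    // Sketch of the L2 shrinkage described by --l2-lambda: w_i *= (1 - lambda).
    void l2Regularize(std::vector<float>& weights, float lambda) {
      for (size_t i = 0; i < weights.size(); ++i)
        weights[i] *= (1.0f - lambda);
    }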
@@ -1680,24 +1791,35 @@ int main(int argc, char** argv) {
++weightMixingThisEpoch;
if (pruneZeroWeights) {
- size_t pruned = mixedWeights.PruneZeroWeightFeatures();
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " zero-weighted features pruned from mixedWeights." << endl;
-
- pruned = cumulativeWeights.PruneZeroWeightFeatures();
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
+ size_t pruned = mixedWeights.PruneZeroWeightFeatures();
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " zero-weighted features pruned from mixedWeights." << endl;
+
+ pruned = cumulativeWeights.PruneZeroWeightFeatures();
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
}
if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
- size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " features pruned from mixedWeights." << endl;
+ size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " features pruned from mixedWeights." << endl;
- pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " features pruned from cumulativeWeights." << endl;
+ pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " features pruned from cumulativeWeights." << endl;
}
+
+ if (l1_regularize && weightMixingThisEpoch == mixingFrequency) {
+ mixedWeights.L1Regularize(l1_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l1-reg. on mixedWeights with lambda=" << l1_lambda << endl;
+
+ // subtract lambda from every weight in the sum --> multiply by number of updates
+ cumulativeWeights.L1Regularize(l1_lambda*numberOfUpdatesThisEpoch);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l1-reg. on cumulativeWeights with lambda=" << l1_lambda*numberOfUpdatesThisEpoch << endl;
+ }
}
// broadcast average weights from process 0
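The --l1-lambda help string describes L1Regularize as clipped subtraction towards zero (w_i +/- lambda); because cumulativeWeights is a running sum over updates, the same lambda is scaled by the number of updates, as the comment in the hunk above notes. A sketch of the clipping:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sketch of the L1 clipping described by --l1-lambda: move each weight
    // towards zero by lambda, never crossing zero.
    void l1Regularize(std::vector<float>& weights, float lambda) {
      for (size_t i = 0; i < weights.size(); ++i) {
        if (weights[i] > 0)
          weights[i] = std::max(0.0f, weights[i] - lambda);
        else
          weights[i] = std::min(0.0f, weights[i] + lambda);
      }
    }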
@@ -1753,13 +1875,13 @@ int main(int argc, char** argv) {
if (proceed) {
#ifdef MPI_ENABLE
// average across processes
- mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
- ScoreComponentCollection totalBinary;
- if (sparseAverage) {
- ScoreComponentCollection binary;
- binary.SetToBinaryOf(mosesWeights);
- mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
- }
+ mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
+ ScoreComponentCollection totalBinary;
+ if (sparseAverage) {
+ ScoreComponentCollection binary;
+ binary.SetToBinaryOf(mosesWeights);
+ mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
+ }
#endif
#ifndef MPI_ENABLE
mixedAverageWeights = tmpAverageWeights;
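For reference, boost::mpi::reduce combines one value per rank with a binary operator and delivers the result at the root, which is how the weight mixing above sums ScoreComponentCollections via SCCPlus. A minimal, runnable sketch of the same pattern on plain floats:

    #include <boost/mpi.hpp>
    #include <functional>
    #include <iostream>
    namespace mpi = boost::mpi;

    int main(int argc, char** argv) {
      mpi::environment env(argc, argv);
      mpi::communicator world;
      float local = 1.0f;  // stand-in for this rank's averaged weights
      float sum = 0.0f;
      mpi::reduce(world, local, sum, std::plus<float>(), 0);  // sum at rank 0
      if (world.rank() == 0)
        std::cout << "mixed average = " << sum / world.size() << std::endl;
      return 0;
    }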
diff --git a/mira/Main.h b/mira/Main.h
index 95a5374a5..95a5374a5 100644..100755
--- a/mira/Main.h
+++ b/mira/Main.h
diff --git a/mira/Makefile.am b/mira/Makefile.am
index cd490c853..cd490c853 100644..100755
--- a/mira/Makefile.am
+++ b/mira/Makefile.am
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index 6f30c8fd7..6f30c8fd7 100644..100755
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
diff --git a/mira/MiraTest.cpp b/mira/MiraTest.cpp
index 774b324f8..774b324f8 100644..100755
--- a/mira/MiraTest.cpp
+++ b/mira/MiraTest.cpp
diff --git a/mira/Optimiser.h b/mira/Optimiser.h
index 8ad38bd5a..8ad38bd5a 100644..100755
--- a/mira/Optimiser.h
+++ b/mira/Optimiser.h
diff --git a/mira/Perceptron.cpp b/mira/Perceptron.cpp
index 409d2ba34..409d2ba34 100644..100755
--- a/mira/Perceptron.cpp
+++ b/mira/Perceptron.cpp
diff --git a/mira/expt.cfg b/mira/expt.cfg
index 416eb1d3f..416eb1d3f 100644..100755
--- a/mira/expt.cfg
+++ b/mira/expt.cfg
diff --git a/mira/mira.xcodeproj/project.pbxproj b/mira/mira.xcodeproj/project.pbxproj
index 67662f4e0..67662f4e0 100644..100755
--- a/mira/mira.xcodeproj/project.pbxproj
+++ b/mira/mira.xcodeproj/project.pbxproj