Diffstat (limited to 'mira/Main.cpp')
mira/Main.cpp (mode -rwxr-xr-x -> -rw-r--r--) | 900
1 file changed, 400 insertions(+), 500 deletions(-)
diff --git a/mira/Main.cpp b/mira/Main.cpp
old mode 100755
new mode 100644
index 402ffff3f..b5586fe29
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -44,6 +44,7 @@ namespace mpi = boost::mpi;
 #include "DummyScoreProducers.h"
 #include "LexicalReordering.h"
 #include "BleuScorer.h"
+#include "HypothesisQueue.h"

 using namespace Mira;
 using namespace std;
@@ -66,7 +67,7 @@ int main(int argc, char** argv) {
   string inputFile;
   vector<string> referenceFiles;
   vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
-  string coreWeightFile, startWeightFile;
+  // string coreWeightFile, startWeightFile;
   size_t epochs;
   string learner;
   bool shuffle;
@@ -116,7 +117,6 @@ int main(int argc, char** argv) {
   float scale_lm_factor, scale_wp_factor;
   bool sample;
   string moses_src;
-  bool external_score = false;
   float sigmoidParam;
   float bleuWeight, bleuWeight_hope, bleuWeight_fear;
   bool bleu_weight_lm, bleu_weight_lm_adjust;
@@ -126,25 +126,44 @@ int main(int argc, char** argv) {
   bool l1_regularize, l2_regularize;
   float l1_lambda, l2_lambda;
   bool most_violated, all_violated, max_bleu_diff, one_against_all;
-  bool feature_confidence, signed_counts, averageConfidenceCounts;
-  float decay, core_r0, sparse_r0;
+  bool feature_confidence, signed_counts;
+  float decay_core, decay_sparse, core_r0, sparse_r0;
+  bool selective, summed, add2hope, skip_hope, skip_model, skip_fear;
+  float bleu_weight_fear_factor, scaling_constant;
+  bool hildreth;
+  float add2lm;
   bool realBleu, disableBleuFeature;
+  bool rescaleSlack, rewardHope;
+  bool makePairs;
   po::options_description desc("Allowed options");
   desc.add_options()
+    ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
+    ("reward-hope", po::value<bool>(&rewardHope)->default_value(false), "Reward hope features over fear features")
+    ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
     ("disable-bleu-feature", po::value<bool>(&disableBleuFeature)->default_value(false), "Disable the Bleu feature")
-    ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
+    ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
+    ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
+    ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
+    ("skip-hope", po::value<bool>(&skip_hope)->default_value(false), "Sample without hope translations")
+    ("skip-model", po::value<bool>(&skip_model)->default_value(false), "Sample without model translations")
+    ("skip-fear", po::value<bool>(&skip_fear)->default_value(false), "Sample without fear translations")
+    ("add2hope", po::value<bool>(&add2hope)->default_value(false), "Add 2 hope translations instead of 1")
+    ("scaling-constant", po::value<float>(&scaling_constant)->default_value(1.0), "Scale all core values by a constant at beginning of training")
+    ("selective", po::value<bool>(&selective)->default_value(false), "Build constraints for every feature")
+    ("summed", po::value<bool>(&summed)->default_value(false), "Sum up all constraints")
     ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
     ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
     ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
     ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
     ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
-    ("avg-conf-counts", po::value<bool>(&averageConfidenceCounts)->default_value(true), "Divide confidence counts by number of processors")
     ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
     ("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
     ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
-
+    ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
     ("scale-all", po::value<bool>(&scale_all)->default_value(false), "Scale all core features")
     ("scaling-factor", po::value<float>(&scale_all_factor)->default_value(2), "Scaling factor for all core features")
@@ -158,8 +177,9 @@ int main(int argc, char** argv) {
     ("clear-static", po::value<bool>(&clear_static)->default_value(false), "Clear static data before every translation")
     ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
     ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
-    ("core-weights", po::value<string>(&coreWeightFile)->default_value(""), "Weight file containing the core weights (already tuned, have to be non-zero)")
-    ("decay", po::value<float>(&decay)->default_value(0.01), "Decay factor for updating feature learning rates")
+    //("core-weights", po::value<string>(&coreWeightFile)->default_value(""), "Weight file containing the core weights (already tuned, have to be non-zero)")
+    ("decay-core", po::value<float>(&decay_core)->default_value(0.001), "Decay factor for updating core feature learning rates")
+    ("decay-sparse", po::value<float>(&decay_sparse)->default_value(0.001), "Decay factor for updating sparse feature learning rates")
     ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
     ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
     ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
@@ -171,18 +191,18 @@ int main(int argc, char** argv) {
     ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
     ("feature-confidence", po::value<bool>(&feature_confidence)->default_value(false), "Use feature weight confidence in weight updates")
     ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
-    ("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
+    ("fear-n", po::value<int>(&fear_n)->default_value(1), "Number of fear translations used")
     ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
     ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
     ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
     ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
     ("hope-model", po::value<bool>(&hope_model)->default_value(false), "Use only hope and model translations for optimisation (use --fear-n to set number of model translations)")
-    ("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
+    ("hope-n", po::value<int>(&hope_n)->default_value(2), "Number of hope translations used")
     ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
     ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
     ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
-    ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.001), "Lambda for l1-regularization (w_i +/- lambda)")
-    ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.1), "Lambda for l2-regularization (w_i * (1 - lambda))")
+    ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
+    ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
     ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
     ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
     ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
@@ -233,7 +253,7 @@ int main(int argc, char** argv) {
     ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
     ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
     ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
-    ("start-weights", po::value<string>(&startWeightFile)->default_value(""), "Weight file containing start weights")
+    //("start-weights", po::value<string>(&startWeightFile)->default_value(""), "Weight file containing start weights")
     ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
     ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
     ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
@@ -251,6 +271,10 @@ int main(int argc, char** argv) {
     std::cout << desc << std::endl;
     return 0;
   }
+
+  cerr << "l1-reg: " << l1_regularize << endl;
+  cerr << "featureCutoff: " << featureCutoff << endl;
+  cerr << "featureConfidence: " << feature_confidence << endl;

   const StaticData &staticData = StaticData::Instance();
@@ -261,6 +285,7 @@ int main(int argc, char** argv) {
     trainWithMultipleFolds = true;
   }

+  cerr << "test 1" << endl;
   if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
     cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
     exit(1);
@@ -271,6 +296,7 @@ int main(int argc, char** argv) {
     exit(1);
   }

+  cerr << "test 2" << endl;
   if (trainWithMultipleFolds) {
     if (!mosesConfigFilesFolds.size()) {
       cerr << "Error: No moses ini files specified for training with folds" << endl;
       exit(1);
     }
@@ -305,6 +331,7 @@ int main(int argc, char** argv) {
   }

   // load input and references
+  cerr << "test 3" << endl;
   vector<string> inputSentences;
   size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0;
   size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size();
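Every new switch above is wired through Boost.Program_options: the flag is bound to a plain variable at declaration time, and parsing fills the variable in. A minimal, self-contained sketch of that declare/parse/notify pattern (standalone example for illustration, not part of Main.cpp):

    #include <boost/program_options.hpp>
    #include <iostream>
    namespace po = boost::program_options;

    int main(int argc, char** argv) {
      bool makePairs;          // bound targets, filled in by store()/notify()
      float scaling_constant;
      po::options_description desc("Allowed options");
      desc.add_options()
        ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
        ("scaling-constant", po::value<float>(&scaling_constant)->default_value(1.0f), "Scale all core values by a constant");
      po::variables_map vm;
      po::store(po::parse_command_line(argc, argv, desc), vm);
      po::notify(vm);          // writes the parsed values into the bound variables
      std::cout << "make-pairs=" << makePairs << " scaling-constant=" << scaling_constant << "\n";
      return 0;
    }

Invoked as e.g. `./mira --make-pairs 0 --scaling-constant 2`; bool options accept 1/0 and true/false.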
@@ -382,6 +409,7 @@ int main(int argc, char** argv) {
   // add initial Bleu weight and references to initialize Bleu feature
   boost::trim(decoder_settings);
   decoder_settings += " -mira -distinct-nbest -weight-bl 1 -references";
+  cerr << "test 4" << endl;
   if (trainWithMultipleFolds) {
     decoder_settings += " ";
     decoder_settings += referenceFilesFolds[myFold];
@@ -393,17 +421,22 @@ int main(int argc, char** argv) {
     }
   }

+  cerr << "test 5" << endl;
   vector<string> decoder_params;
   boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));

   string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile;
   VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
+  cerr << "test 6" << endl;
   MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
+  cerr << "test 7" << endl;
   decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
       scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
+  cerr << "test 8" << endl;
   SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
   bool chartDecoding = (searchAlgorithm == ChartDecoding);
+  cerr << "test 9" << endl;

   // Optionally shuffle the sentences
   vector<size_t> order;
@@ -426,6 +459,7 @@ int main(int argc, char** argv) {
   if (rank == 0) {
     cerr << "Optimising using Mira" << endl;
     cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
+    cerr << "selective: " << selective << endl;
     if (normaliseMargin)
       cerr << "sigmoid parameter: " << sigmoidParam << endl;
   }
@@ -459,19 +493,19 @@ int main(int argc, char** argv) {
   }

   if (hope_n == -1)
-    hope_n = n;
+    hope_n = n;
   if (fear_n == -1)
-    fear_n = n;
+    fear_n = n;
   if (rank_n == -1)
-    rank_n = n;
+    rank_n = n;
   if (sample)
-    model_hope_fear = true;
+    model_hope_fear = true;
   if (model_hope_fear || hope_model || rank_only || megam)
-    hope_fear = false; // is true by default
+    hope_fear = false; // is true by default
   if (learner == "mira" && !(hope_fear || hope_model || model_hope_fear || rank_only || megam)) {
-    cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear for mira update." << endl;
-    return 1;
+    cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear for mira update."
+         << endl;
+    return 1;
   }

 #ifdef MPI_ENABLE
@@ -516,10 +550,10 @@ int main(int argc, char** argv) {
     staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetFeatureFunctions();
   //const vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();

-  ProducerWeightMap coreWeightMap, startWeightMap;
+  //ProducerWeightMap coreWeightMap, startWeightMap;
   ScoreComponentCollection initialWeights = decoder->getWeights();

   // read start weight file
-  if (!startWeightFile.empty()) {
+  /*if (!startWeightFile.empty()) {
     if (!loadCoreWeights(startWeightFile, startWeightMap, featureFunctions)) {
       cerr << "Error: Failed to load start weights from " << startWeightFile << endl;
       return 1;
@@ -550,13 +584,27 @@ int main(int argc, char** argv) {
     for(p = coreWeightMap.begin(); p!=coreWeightMap.end(); ++p)
       initialWeights.Assign(p->first, p->second);
   }
-  }
+  }*/
   cerr << "Rank " << rank << ", initial weights: " << initialWeights << endl;
+  if (scaling_constant != 1.0) {
+    initialWeights.MultiplyEquals(scaling_constant);
+    cerr << "Rank " << rank << ", scaled initial weights: " << initialWeights << endl;
+  }
+
+  if (add2lm != 0) {
+    const LMList& lmList_new = staticData.GetLMList();
+    for (LMList::const_iterator i = lmList_new.begin(); i != lmList_new.end(); ++i) {
+      float lmWeight = initialWeights.GetScoreForProducer(*i) + add2lm;
+      initialWeights.Assign(*i, lmWeight);
+      cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
+    }
+  }

   if (normaliseWeights) {
     initialWeights.L1Normalise();
     cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
   }
+
   decoder->setWeights(initialWeights);

   if (scale_all) {
@@ -583,7 +631,9 @@ int main(int argc, char** argv) {
   if (bleuWeight_fear == -1) {
     bleuWeight_fear = bleuWeight;
   }
+  bleuWeight_fear *= bleu_weight_fear_factor;
   cerr << "Bleu weight: " << bleuWeight << endl;
+  cerr << "Bleu weight fear: " << bleuWeight_fear << endl;

   if (decode_hope || decode_fear || decode_model) {
     size_t decode = 1;
@@ -614,15 +664,15 @@ int main(int argc, char** argv) {
   ScoreComponentCollection mixedAverageWeightsBeforePrevious;

   // log feature counts and/or hope/fear translations with features
-  string f1 = "decode_hope_epoch0";
+  /*string f1 = "decode_hope_epoch0";
   string f2 = "decode_fear_epoch0";
-  ofstream hopePlusFeatures(f1.c_str());
-  ofstream fearPlusFeatures(f2.c_str());
-  if (!hopePlusFeatures || !fearPlusFeatures) {
-    ostringstream msg;
-    msg << "Unable to open file";
-    throw runtime_error(msg.str());
-  }
+  ofstream hopePlusFeatures(f1.c_str());
+  ofstream fearPlusFeatures(f2.c_str());
+  if (!hopePlusFeatures || !fearPlusFeatures) {
+    ostringstream msg;
+    msg << "Unable to open file";
+    throw runtime_error(msg.str());
+  }*/

   bool stop = false;
   // int sumStillViolatedConstraints;
@@ -630,7 +680,7 @@ int main(int argc, char** argv) {

   // variables for feature confidence
   ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
-  featureLearningRates.UpdateLearningRates(decay, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
+  featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
   cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;

   for (size_t epoch = 0; epoch < epochs && !stop; ++epoch) {
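UpdateLearningRates now takes separate decay factors for core and sparse features; its body is not part of this diff. A plausible reading, given the confidence counts it consumes and the r0 start rates, is a per-feature rate that shrinks as a feature accumulates update mass. A hypothetical sketch under that assumption (names invented, not Moses code):

    #include <cmath>

    // Hypothetical per-feature rate: r_f = r0 / (1 + decay * |confidence_f|).
    // Core and sparse features use their own (r0, decay) pairs, matching the
    // new --decay-core/--decay-sparse and --core-r0/--sparse-r0 options.
    float LearningRate(float r0, float decay, float confidenceCount) {
      return r0 / (1.0f + decay * std::fabs(confidenceCount));
    }

With decay 0.001 and r0 = 1, a feature touched by 100 units of update mass would still learn at roughly 1/1.1 of its start rate, so the smaller new defaults decay far more gently than the old 0.01.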
" << realBleu << endl; if (hope_fear || perceptron_update) { if (clear_static) { delete decoder; @@ -876,11 +927,11 @@ int main(int argc, char** argv) { // count sparse features occurring in hope translation featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures(); - if (epoch == 0 && printNbestWithFeatures) { + /*if (epoch == 0 && printNbestWithFeatures) { decoder->outputNBestList(input, *sid, hope_n, 1, bleuWeight_hope, distinctNbest, avgRefLength, "", hopePlusFeatures); decoder->cleanup(chartDecoding); - } + }*/ float precision = bleuScoresHope[batchPosition][0]; @@ -950,11 +1001,11 @@ int main(int argc, char** argv) { // count sparse features occurring in fear translation featureValuesFear[batchPosition][0].IncrementSparseFearFeatures(); - if (epoch == 0 && printNbestWithFeatures) { - decoder->outputNBestList(input, *sid, fear_n, -1, bleuWeight_fear, distinctNbest, - avgRefLength, "", fearPlusFeatures); - decoder->cleanup(chartDecoding); - } + /*if (epoch == 0 && printNbestWithFeatures) { + decoder->outputNBestList(input, *sid, fear_n, -1, bleuWeight_fear, distinctNbest, + avgRefLength, "", fearPlusFeatures); + decoder->cleanup(chartDecoding); + }*/ // Bleu-related example selection bool skip = false; @@ -1068,330 +1119,191 @@ int main(int argc, char** argv) { examples_in_batch++; } - if (model_hope_fear) { - ostringstream hope_nbest_filename, fear_nbest_filename, model_nbest_filename, ref_filename; - if (sample && external_score) { - hope_nbest_filename << "decode_hope_rank" << rank << "." << hope_n << "best"; - fear_nbest_filename << "decode_fear_rank" << rank << "." << fear_n << "best"; - model_nbest_filename << "decode_model_rank" << rank << "." << n << "best"; - - // save reference - ref_filename << "decode_ref_rank" << rank; - referenceFileMegam = ref_filename.str(); - ofstream ref_out(referenceFileMegam.c_str()); - if (!ref_out) { - ostringstream msg; - msg << "Unable to open " << referenceFileMegam; - throw runtime_error(msg.str()); - } - ref_out << referenceSentences[decoder->getShortestReferenceIndex(*sid)][*sid] << "\n"; - ref_out.close(); - } - - // HOPE - if (clear_static) { - delete decoder; - StaticData::ClearDataStatic(); - decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params); - decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme); - decoder->setWeights(mosesWeights); - } + if (model_hope_fear) { + // HOPE + if (!skip_hope) { + cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl; + size_t oraclePos = featureValues[batchPosition].size(); + decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + //vector<const Word*> oracle = outputHope[0]; + // needed for history + inputLengths.push_back(current_input_length); + ref_ids.push_back(*sid); + decoder->cleanup(chartDecoding); + //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size()); + //float hope_length_ratio = (float)oracle.size()/ref_length; + cerr << endl; + + oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]); + oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]); + oracleModelScores.push_back(modelScores[batchPosition][oraclePos]); + } - cerr << "Rank " << rank << ", epoch " << epoch << ", 
" << n << "best hope translations" << endl; - size_t oraclePos = featureValues[batchPosition].size(); - decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 0, realBleu, distinctNbest, avgRefLength, rank, epoch, hope_nbest_filename.str()); - // needed for history - inputLengths.push_back(current_input_length); - ref_ids.push_back(*sid); - decoder->cleanup(chartDecoding); - //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size()); - //float hope_length_ratio = (float)oracle.size()/ref_length; - cerr << endl; - - oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]); - oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]); - oracleModelScores.push_back(modelScores[batchPosition][oraclePos]); + // MODEL + if (!skip_model) { + cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl; + if (historyBleu) { + vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 1, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + vector<const Word*> bestModel = outputModel[0]; + oneBests.push_back(bestModel); + } + else { + decoder->getNBest(input, *sid, n, 0.0, bleuWeight, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + } + decoder->cleanup(chartDecoding); + //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); + //float model_length_ratio = (float)bestModel.size()/ref_length; + cerr << endl; + } - // MODEL - if (clear_static) { - delete decoder; - StaticData::ClearDataStatic(); - decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params); - decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme); - decoder->setWeights(mosesWeights); - } + // FEAR + if (!skip_fear) { + cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl; + decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear, + featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], + 0, realBleu, distinctNbest, avgRefLength, rank, epoch, ""); + decoder->cleanup(chartDecoding); + //ref_length = decoder->getClosestReferenceLength(*sid, fear.size()); + //float fear_length_ratio = (float)fear.size()/ref_length; + } - cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl; - if (historyBleu) { - vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 1, realBleu, distinctNbest, avgRefLength, rank, epoch, model_nbest_filename.str()); - vector<const Word*> bestModel = outputModel[0]; - oneBests.push_back(bestModel); - } - else { - decoder->getNBest(input, *sid, n, 0.0, bleuWeight, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 0, realBleu, distinctNbest, avgRefLength, rank, epoch, model_nbest_filename.str()); + examples_in_batch++; + + if (sample) { + float bleuHope = -1000; + float bleuFear = 1000; + size_t indexHope = -1; + size_t indexFear = -1; + vector<float> bleuHopeList; + 
vector<float> bleuFearList; + vector<float> indexHopeList; + vector<float> indexFearList; + + HypothesisQueue queueHope(hope_n); + HypothesisQueue queueFear(fear_n); + + cerr << endl; + if (most_violated || all_violated || one_against_all) { + bleuHope = -1000; + bleuFear = 1000; + indexHope = -1; + indexFear = -1; + if (most_violated) + cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl; + else if (all_violated) + cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints"; + else + cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope"; + + // find best hope, then find fear that violates our constraint most + for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { + if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores + if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) { + if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) { + // better model score + bleuHope = bleuScores[batchPosition][i]; + indexHope = i; + } + } } - decoder->cleanup(chartDecoding); - //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size()); - //float model_length_ratio = (float)bestModel.size()/ref_length; - cerr << endl; - - // FEAR - if (clear_static) { - delete decoder; - StaticData::ClearDataStatic(); - decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params); - decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme); - decoder->setWeights(mosesWeights); + else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best + bleuHope = bleuScores[batchPosition][i]; + indexHope = i; } - - cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl; - decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear, - featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition], - 0, realBleu, distinctNbest, avgRefLength, rank, epoch, fear_nbest_filename.str()); - decoder->cleanup(chartDecoding); - //ref_length = decoder->getClosestReferenceLength(*sid, fear.size()); - //float fear_length_ratio = (float)fear.size()/ref_length; - - examples_in_batch++; - - if (sample) { - float bleuHope = -1000; - float bleuFear = 1000; - size_t indexHope = -1; - size_t indexFear = -1; - vector<float> bleuHopeList; - vector<float> bleuFearList; - vector<float> indexHopeList; - vector<float> indexFearList; - - if (external_score) { - // concatenate nbest files (use hope, model, fear lists to extract samples from) - stringstream nbestStreamMegam, catCmd, sortCmd, scoreDataFile, featureDataFile; - nbestStreamMegam << "decode_hypos_rank" << rank << "." 
<< (hope_n+n+fear_n) << "best"; - nbestFileMegam = nbestStreamMegam.str(); - catCmd << "cat " << hope_nbest_filename.str() << " " << model_nbest_filename.str() - << " " << fear_nbest_filename.str() << " > " << nbestFileMegam; - system(catCmd.str().c_str()); - - // extract features and scores - scoreDataFile << "decode_hypos_rank" << rank << ".scores.dat"; - featureDataFile << "decode_hypos_rank" << rank << ".features.dat"; - stringstream extractorCmd; - extractorCmd << moses_src << "/dist/bin/extractor" - " --scconfig case:true --scfile " << scoreDataFile.str() << " --ffile " << featureDataFile.str() << " -r " << referenceFileMegam << " -n " << nbestFileMegam; - system(extractorCmd.str().c_str()); + } - // NOTE: here we are just scoring the nbest lists created above. - // We will use the (real, not dynamically computed) sentence bleu scores to select a pair of two - // translations with maximal Bleu difference - vector<float> bleuScoresNbest = BleuScorer::ScoreNbestList(scoreDataFile.str(), featureDataFile.str()); - for (size_t i=0; i < bleuScoresNbest.size(); ++i) { - //cerr << "bleu: " << bleuScoresNbest[i]*current_input_length << endl; - if (abs(bleuScoresNbest[i] - bleuHope) < epsilon) { // equal bleu scores - if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) { - if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) { - bleuHope = bleuScoresNbest[i]; - indexHope = i; - } - } - } - else if (bleuScoresNbest[i] > bleuHope) { // better than current best - bleuHope = bleuScoresNbest[i]; - indexHope = i; - } - - if (abs(bleuScoresNbest[i] - bleuFear) < epsilon) { // equal bleu scores - if (modelScores[batchPosition][i] > modelScores[batchPosition][indexFear]) { - if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexFear]) > epsilon) { - bleuFear = bleuScoresNbest[i]; - indexFear = i; - } - } - } - else if (bleuScoresNbest[i] < bleuFear) { // worse than current worst - bleuFear = bleuScoresNbest[i]; - indexFear = i; - } - } + float currentViolation = 0; + float minimum_bleu_diff = 0.01; + for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { + float bleuDiff = bleuHope - bleuScores[batchPosition][i]; + float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i]; + if (bleuDiff > epsilon) { + if (one_against_all && bleuDiff > minimum_bleu_diff) { + cerr << ".. 
adding pair"; + bleuHopeList.push_back(bleuHope); + bleuFearList.push_back(bleuScores[batchPosition][i]); + indexHopeList.push_back(indexHope); + indexFearList.push_back(i); } - else { - cerr << endl; - if (most_violated || all_violated || one_against_all) { - bleuHope = -1000; - bleuFear = 1000; - indexHope = -1; - indexFear = -1; - if (most_violated) - cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint"; - else if (all_violated) - cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints"; - else - cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope"; - - // find best hope, then find fear that violates our constraint most - for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { - if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores - if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) { - if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) { - // better model score - bleuHope = bleuScores[batchPosition][i]; - indexHope = i; - } - } - } - else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best - bleuHope = bleuScores[batchPosition][i]; - indexHope = i; - } - } - - float currentViolation = 0; - float minimum_bleu_diff = 0.01; - for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { - float bleuDiff = bleuHope - bleuScores[batchPosition][i]; - float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i]; - if (bleuDiff > epsilon) { - if (one_against_all && bleuDiff > minimum_bleu_diff) { - cerr << ".. adding pair"; - bleuHopeList.push_back(bleuHope); - bleuFearList.push_back(bleuScores[batchPosition][i]); - indexHopeList.push_back(indexHope); - indexFearList.push_back(i); - } - else if (modelDiff < bleuDiff) { - float diff = bleuDiff - modelDiff; - if (diff > epsilon) { - if (all_violated) { - cerr << ".. adding pair"; - bleuHopeList.push_back(bleuHope); - bleuFearList.push_back(bleuScores[batchPosition][i]); - indexHopeList.push_back(indexHope); - indexFearList.push_back(i); - } - else if (most_violated && diff > currentViolation) { - currentViolation = diff; - bleuFear = bleuScores[batchPosition][i]; - indexFear = i; - cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl; - } - } - } - } - } - - if (most_violated) { - if (currentViolation > 0) { - cerr << ".. adding pair with violation " << currentViolation << endl; - bleuHopeList.push_back(bleuHope); - bleuFearList.push_back(bleuFear); - indexHopeList.push_back(indexHope); - indexFearList.push_back(indexFear); - } - else cerr << ".. 
none" << endl; - } - else cerr << endl; - } - if (max_bleu_diff) { - bleuHope = -1000; - bleuFear = 1000; - indexHope = -1; - indexFear = -1; - cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff"; - // use dynamically calculated scores to find best and worst - for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) { - //cerr << "bleu: " << bleuScores[batchPosition][i] << endl; - if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores - if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) { - if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) { - bleuHope = bleuScores[batchPosition][i]; - indexHope = i; - } - } - } - else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best - bleuHope = bleuScores[batchPosition][i]; - indexHope = i; - } - - if (abs(bleuScores[batchPosition][i] - bleuFear) < epsilon) { // equal bleu scores - if (modelScores[batchPosition][i] > modelScores[batchPosition][indexFear]) { - if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexFear]) > epsilon) { - bleuFear = bleuScores[batchPosition][i]; - indexFear = i; - } - } - } - else if (bleuScores[batchPosition][i] < bleuFear) { // worse than current worst - bleuFear = bleuScores[batchPosition][i]; - indexFear = i; - } - } - - if (bleuHope != -1000 && bleuFear != 1000 && (bleuHope - bleuFear) > epsilon) { - cerr << ".. adding 1 pair" << endl; + else if (modelDiff < bleuDiff) { + float diff = bleuDiff - modelDiff; + if (diff > epsilon) { + if (all_violated) { + cerr << ".. adding pair"; bleuHopeList.push_back(bleuHope); - bleuFearList.push_back(bleuFear); + bleuFearList.push_back(bleuScores[batchPosition][i]); indexHopeList.push_back(indexHope); - indexFearList.push_back(indexFear); + indexFearList.push_back(i); } - else cerr << "none" << endl; - } - } - - if (bleuHopeList.size() == 0 || bleuFearList.size() == 0) { - cerr << "Rank " << rank << ", epoch " << epoch << ", no appropriate hypotheses found.." << endl; - skip_sample = true; - } - else { - if (bleuHope != -1000 && bleuFear != 1000 && bleuHope <= bleuFear) { - if (abs(bleuHope - bleuFear) < epsilon) - cerr << "\nRank " << rank << ", epoch " << epoch << ", WARNING: HOPE and FEAR have equal Bleu." << endl; - else - cerr << "\nRank " << rank << ", epoch " << epoch << ", ERROR: FEAR has better Bleu than HOPE." 
<< endl; + else if (most_violated && diff > currentViolation) { + currentViolation = diff; + bleuFear = bleuScores[batchPosition][i]; + indexFear = i; + cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl; + } } - else { - if (external_score) { - // use actual sentence bleu (not dynamically computed) - bleuScoresHopeSample[batchPosition].push_back(bleuHope*current_input_length); - bleuScoresFearSample[batchPosition].push_back(bleuFear*current_input_length); - featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][indexHope]); - featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][indexFear]); - modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][indexHope]); - modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][indexFear]); - cerr << "Rank " << rank << ", epoch " << epoch << ", Best: " << bleuHope*current_input_length << " (" << indexHope << ")" << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", Worst: " << bleuFear*current_input_length << " (" << indexFear << ")" << endl; - } - else { - cerr << endl; - for (size_t i=0; i<bleuHopeList.size(); ++i) { - float bHope = bleuHopeList[i]; - float bFear = bleuFearList[i]; - size_t iHope = indexHopeList[i]; - size_t iFear = indexFearList[i]; - cerr << "Rank " << rank << ", epoch " << epoch << ", Hope[" << i << "]: " << bHope << " (" << iHope << ")" << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", Fear[" << i << "]: " << bFear << " (" << iFear << ")" << endl; - bleuScoresHopeSample[batchPosition].push_back(bHope); - bleuScoresFearSample[batchPosition].push_back(bFear); - featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][iHope]); - featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][iFear]); - modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][iHope]); - modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][iFear]); - - featureValues[batchPosition][iHope].IncrementSparseHopeFeatures(); - featureValues[batchPosition][iFear].IncrementSparseFearFeatures(); - } - } - } } + } + } + + if (most_violated) { + if (currentViolation > 0) { + cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl; + bleuHopeList.push_back(bleuHope); + bleuFearList.push_back(bleuFear); + indexHopeList.push_back(indexHope); + indexFearList.push_back(indexFear); } + else + cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." 
+                    cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
+                }
+                else cerr << endl;
+              }
+              if (max_bleu_diff) {
+                cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
+                for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+                  BleuIndexPair hope(bleuScores[batchPosition][i], i);
+                  queueHope.Push(hope);
+                  BleuIndexPair fear(-1*(bleuScores[batchPosition][i]), i);
+                  queueFear.Push(fear);
+                }
+              }
+
+              cerr << endl;
+
+              vector<BleuIndexPair> hopeList, fearList;
+              for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
+              for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
+
+              for (size_t i=0; i<hopeList.size(); ++i) {
+                float hopeBleu = hopeList[i].first;
+                size_t hopeIndex = hopeList[i].second;
+                for (size_t j=0; j<fearList.size(); ++j) {
+                  float fearBleu = -1*(fearList[j].first);
+                  size_t fearIndex = fearList[j].second;
+                  cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << hopeBleu << " (" << hopeIndex << "), fear: " << fearBleu << " (" << fearIndex << ")" << endl;
+                  bleuScoresHopeSample[batchPosition].push_back(hopeBleu);
+                  bleuScoresFearSample[batchPosition].push_back(fearBleu);
+                  featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][hopeIndex]);
+                  featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][fearIndex]);
+                  modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][hopeIndex]);
+                  modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][fearIndex]);
+
+                  featureValues[batchPosition][hopeIndex].IncrementSparseHopeFeatures();
+                  featureValues[batchPosition][fearIndex].IncrementSparseFearFeatures();
+                }
+              }
+              if (!makePairs)
+                cerr << "Rank " << rank << ", epoch " << epoch << ", summing up hope and fear vectors, no pairs" << endl;
+            }
           }
-
+
           // next input sentence
           ++sid;
           ++actualBatchSize;
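The max-bleu-diff path now keeps the hope_n best and fear_n worst hypotheses (negating the fear scores turns "worst" into "best") in two bounded queues, then pairs every hope with every fear, giving hope_n x fear_n constraints per sentence; the most-violated path instead keeps the single pair maximizing bleuDiff - modelDiff. HypothesisQueue's implementation is not shown in this diff; a minimal stand-in with the same Push/Pop/Empty surface, assuming BleuIndexPair is a (score, index) pair as the .first/.second uses suggest (hypothetical sketch):

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <utility>
    #include <vector>

    typedef std::pair<float, std::size_t> BleuIndexPair;  // (score, n-best index)

    // Hypothetical stand-in for Mira's HypothesisQueue: retains only the
    // 'capacity' highest-scoring items; Pop() returns the best remaining one.
    // Pushing negated Bleu scores, as the fear queue above does, makes the
    // same structure keep the worst hypotheses instead.
    class TopK {
    public:
      explicit TopK(std::size_t capacity) : m_capacity(capacity) {}
      void Push(const BleuIndexPair& p) {
        m_items.push_back(p);
        std::sort(m_items.begin(), m_items.end(), std::greater<BleuIndexPair>());
        if (m_items.size() > m_capacity)
          m_items.pop_back();                 // drop the lowest-scoring item
      }
      bool Empty() const { return m_items.empty(); }
      BleuIndexPair Pop() {                   // best remaining item first
        BleuIndexPair p = m_items.front();
        m_items.erase(m_items.begin());
        return p;
      }
    private:
      std::size_t m_capacity;
      std::vector<BleuIndexPair> m_items;     // kept sorted, best first
    };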
@@ -1474,11 +1386,11 @@ int main(int argc, char** argv) {
       }

       else if (examples_in_batch == 0 || (sample && skip_sample)) {
-        cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
+        cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty."
+             << endl;
       }
       else {
         vector<vector<float> > losses(actualBatchSize);
-        if (model_hope_fear) {
+        if (model_hope_fear && !skip_hope) {
          // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
          for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
            for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
@@ -1486,7 +1398,7 @@ int main(int argc, char** argv) {
            }
          }
        }
-
+
        // set weight for bleu feature to 0 before optimizing
        vector<const ScoreProducer*>::const_iterator iter;
        const vector<const ScoreProducer*> featureFunctions2 =
            staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetFeatureFunctions();
@@ -1499,13 +1411,9 @@ int main(int argc, char** argv) {

        // scale LM feature (to avoid rapid changes)
        if (scale_lm) {
+         cerr << "scale lm" << endl;
          const LMList& lmList_new = staticData.GetLMList();
          for (LMList::const_iterator iter = lmList_new.begin(); iter != lmList_new.end(); ++iter) {
-           // scale up weight
-           float lmWeight = mosesWeights.GetScoreForProducer(*iter);
-           mosesWeights.Assign(*iter, lmWeight*scale_lm_factor);
-           cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight scaled from " << lmWeight << " to " << lmWeight*scale_lm_factor << endl;
-
            // scale down score
            if (sample) {
              scaleFeatureScore(*iter, scale_lm_factor, featureValuesHopeSample, rank, epoch);
@@ -1523,9 +1431,6 @@ int main(int argc, char** argv) {
        if (scale_wp) {
          // scale up weight
          WordPenaltyProducer *wp = staticData.GetFirstWordPenaltyProducer();
-         float wpWeight = mosesWeights.GetScoreForProducer(wp);
-         mosesWeights.Assign(wp, wpWeight*scale_wp_factor);
-         cerr << "Rank " << rank << ", epoch " << epoch << ", wp weight scaled from " << wpWeight << " to " << wpWeight*scale_wp_factor << endl;

          // scale down score
          if (sample) {
@@ -1542,10 +1447,7 @@ int main(int argc, char** argv) {
        if (scale_all) {
          // scale distortion
          DistortionScoreProducer *dp = staticData.GetDistortionScoreProducer();
-         float dWeight = mosesWeights.GetScoreForProducer(dp);
-         mosesWeights.Assign(dp, dWeight*scale_all_factor);
-         cerr << "Rank " << rank << ", epoch " << epoch << ", distortion weight scaled from " << dWeight << " to " << dWeight*scale_all_factor << endl;
-
+
          // scale down score
          if (sample) {
            scaleFeatureScore(dp, scale_all_factor, featureValuesHopeSample, rank, epoch);
@@ -1561,15 +1463,7 @@ int main(int argc, char** argv) {
          vector<LexicalReordering*> lrVec = staticData.GetLexicalReorderModels();
          for (size_t i=0; i<lrVec.size(); ++i) {
            LexicalReordering* lr = lrVec[i];
-           // scale up weight
-           vector<float> dWeights = mosesWeights.GetScoresForProducer(lr);
-           for (size_t j=0; j<dWeights.size(); ++j) {
-             cerr << "Rank " << rank << ", epoch " << epoch << ", d weight scaled from " << dWeights[j];
-             dWeights[j] *= scale_all_factor;
-             cerr << " to " << dWeights[j] << endl;
-           }
-           mosesWeights.Assign(lr, dWeights);
-
+
            // scale down score
            if (sample) {
              scaleFeatureScores(lr, scale_all_factor, featureValuesHopeSample, rank, epoch);
@@ -1586,14 +1480,6 @@ int main(int argc, char** argv) {
          vector<PhraseDictionaryFeature*> pdVec = staticData.GetPhraseDictionaryModels();
          for (size_t i=0; i<pdVec.size(); ++i) {
            PhraseDictionaryFeature* pd = pdVec[i];
-           // scale up weight
-           vector<float> tWeights = mosesWeights.GetScoresForProducer(pd);
-           for (size_t j=0; j<tWeights.size(); ++j) {
-             cerr << "Rank " << rank << ", epoch " << epoch << ", t weight scaled from " << tWeights[j];
-             tWeights[j] *= scale_all_factor;
-             cerr << " to " << tWeights[j] << endl;
-           }
-           mosesWeights.Assign(pd, tWeights);

            // scale down score
            if (sample) {
@@ -1607,20 +1493,7 @@ int main(int argc, char** argv) {
            }
          }
        }
-
-       // set core features to 0 to avoid updating the feature weights
-       if (coreWeightMap.size() > 0) {
-         if (sample) {
-           ignoreCoreFeatures(featureValuesHopeSample, coreWeightMap);
-           ignoreCoreFeatures(featureValuesFearSample, coreWeightMap);
-         }
-         else {
-           ignoreCoreFeatures(featureValues, coreWeightMap);
-           ignoreCoreFeatures(featureValuesHope, coreWeightMap);
-           ignoreCoreFeatures(featureValuesFear, coreWeightMap);
-         }
-       }
-
+
        // print out the feature values
        if (print_feature_values) {
          cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
@@ -1641,9 +1514,7 @@ int main(int argc, char** argv) {

        // apply learning rates to feature vectors before optimization
        if (feature_confidence) {
-         cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decay " << decay << ": " << featureLearningRates << endl;
-         //weightUpdate.MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
-         //cerr << "Rank " << rank << ", epoch " << epoch << ", scaled update: " << weightUpdate << endl;
+         cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
          if (sample) {
            cerr << "Rank " << rank << ", epoch " << epoch << ", feature values before: " << featureValuesHopeSample[0][0] << endl;
            applyPerFeatureLearningRates(featureValuesHopeSample, featureLearningRates, sparse_r0);
" << endl; + for (size_t i = 0; i < modelScoresHopeSample.size(); ++i) + for (size_t j = 0; j < modelScoresHopeSample[i].size(); ++j) { + modelScoresHopeSample[i][j] /= scaling_constant; + modelScoresFearSample[i][j] /= scaling_constant; + } + } + else { + for (size_t i = 0; i < modelScores.size(); ++i) + for (size_t j = 0; j < modelScores[i].size(); ++j) + modelScores[i][j] /= scaling_constant; + } + } + } + // Run optimiser on batch: VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl); size_t update_status = 1; ScoreComponentCollection weightUpdate; if (perceptron_update) { vector<vector<float> > dummy1; - update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate, + update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope, featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch); } else if (hope_fear || hope_model) { if (bleuScoresHope[0][0] >= min_oracle_bleu) { - if (hope_n == 1 && fear_n ==1 && batchSize == 1) { - update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights, weightUpdate, + if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) { + update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch); } - else { - update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate, + else + update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, - modelScoresHope, modelScoresFear, learning_rate, rank, epoch); - } + modelScoresHope, modelScoresFear, learning_rate, rank, epoch); } else update_status = 1; } else if (rank_only) { - // learning ranking of model translations - update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModel(mosesWeights, weightUpdate, + // learning ranking of model translations + if (summed) + update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModelSummed(weightUpdate, + featureValues, bleuScores, modelScores, learning_rate, rank, epoch); + else + update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModel(weightUpdate, featureValues, bleuScores, modelScores, learning_rate, rank, epoch); } else { // model_hope_fear if (sample) { - update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate, - featureValuesHopeSample, featureValuesFearSample, bleuScoresHopeSample, bleuScoresFearSample, + if (selective) + update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective(weightUpdate, + featureValuesHopeSample, featureValuesFearSample, + bleuScoresHopeSample, bleuScoresFearSample, modelScoresHopeSample, + modelScoresFearSample, learning_rate, rank, epoch); + else if (summed) + update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed(weightUpdate, + featureValuesHopeSample, featureValuesFearSample, + bleuScoresHopeSample, bleuScoresFearSample, modelScoresHopeSample, + modelScoresFearSample, learning_rate, rank, epoch, rescaleSlack, rewardHope, makePairs); + else { + if (batchSize == 1 && featureValuesHopeSample[0].size() == 1 && !hildreth) { + cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHopeSample[0][0] << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFearSample[0][0] << endl; + update_status = 
((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate, + featureValuesHopeSample[0][0], featureValuesFearSample[0][0], + bleuScoresHopeSample[0][0], bleuScoresFearSample[0][0], + modelScoresHopeSample[0][0], modelScoresFearSample[0][0], + learning_rate, rank, epoch); + } + else { + cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHopeSample[0][0] << endl; + cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFearSample[0][0] << endl; + update_status = optimiser->updateWeightsHopeFear(weightUpdate, + featureValuesHopeSample, featureValuesFearSample, + bleuScoresHopeSample, bleuScoresFearSample, modelScoresHopeSample, modelScoresFearSample, learning_rate, rank, epoch); + } + } } else { - update_status = ((MiraOptimiser*) optimiser)->updateWeights(mosesWeights, weightUpdate, - featureValues, losses, bleuScores, modelScores, oracleFeatureValues, oracleBleuScores, oracleModelScores, learning_rate, rank, epoch); + if (summed) { + // don't differentiate between hope and model/fear, treat all the same and sum constraints + update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModelSummed(weightUpdate, + featureValues, bleuScores, modelScores, learning_rate, rank, epoch); + } + else + update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate, + featureValues, losses, bleuScores, modelScores, oracleFeatureValues, oracleBleuScores, oracleModelScores, learning_rate, rank, epoch); } } // sumStillViolatedConstraints += update_status; - // rescale LM feature - if (scale_lm) { - const LMList& lmList_new = staticData.GetLMList(); - for (LMList::const_iterator iter = lmList_new.begin(); iter != lmList_new.end(); ++iter) { - // scale weight back down - float lmWeight = mosesWeights.GetScoreForProducer(*iter); - mosesWeights.Assign(*iter, lmWeight/scale_lm_factor); - cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight rescaled from " << lmWeight << " to " << lmWeight/scale_lm_factor << endl; - } - } - - // rescale WP feature - if (scale_wp) { - // scale weight back down - WordPenaltyProducer *wp = staticData.GetFirstWordPenaltyProducer(); - float wpWeight = mosesWeights.GetScoreForProducer(wp); - mosesWeights.Assign(wp, wpWeight/scale_wp_factor); - cerr << "Rank " << rank << ", epoch " << epoch << ", wp weight rescaled from " << wpWeight << " to " << wpWeight/scale_wp_factor << endl; - } - - if (scale_all) { - // rescale distortion - DistortionScoreProducer *dp = staticData.GetDistortionScoreProducer(); - float dWeight = mosesWeights.GetScoreForProducer(dp); - mosesWeights.Assign(dp, dWeight/scale_all_factor); - cerr << "Rank " << rank << ", epoch " << epoch << ", distortion weight rescaled from " << dWeight << " to " << dWeight/scale_all_factor << endl; - - // rescale lexical reordering - vector<LexicalReordering*> lr = staticData.GetLexicalReorderModels(); - for (size_t i=0; i<lr.size(); ++i) { - vector<float> dWeights = mosesWeights.GetScoresForProducer(lr[i]); - for (size_t j=0; j<dWeights.size(); ++j) { - cerr << "Rank " << rank << ", epoch " << epoch << ", d weight rescaled from " << dWeights[j]; - dWeights[j] /=scale_all_factor; - cerr << " to " << dWeights[j] << endl; - } - mosesWeights.Assign(lr[i], dWeights); - } - - // rescale phrase models - vector<PhraseDictionaryFeature*> pd = staticData.GetPhraseDictionaryModels(); - for (size_t i=0; i<pd.size(); ++i) { - vector<float> tWeights = mosesWeights.GetScoresForProducer(pd[i]); - for (size_t j=0; j<tWeights.size(); ++j) { - cerr 
<< "Rank " << rank << ", epoch " << epoch << ", t weight rescaled from " << tWeights[j]; - tWeights[j] /=scale_all_factor; - cerr << " to " << tWeights[j] << endl; - } - mosesWeights.Assign(pd[i], tWeights); - } - } - if (update_status == 0) { // if weights were updated // apply weight update cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl; - if (l2_regularize) { - weightUpdate.L2Regularize(l2_lambda); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << "l2-reg. on mosesWeights with lambda=" << l2_lambda << endl; - cerr << "Rank " << rank << ", epoch " << epoch << ", regularized update: " << weightUpdate << endl; - } - if (feature_confidence) { // update confidence counts based on weight update confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts); // update feature learning rates - featureLearningRates.UpdateLearningRates(decay, confidenceCounts, core_r0, sparse_r0); + featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); } mosesWeights.PlusEquals(weightUpdate); @@ -1817,6 +1690,7 @@ int main(int argc, char** argv) { // set new Moses weights decoder->setWeights(mosesWeights); + cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl; // adjust bleu weight if (bleu_weight_lm_adjust) { @@ -1852,11 +1726,12 @@ int main(int argc, char** argv) { // mix weights? if (mix) { #ifdef MPI_ENABLE + cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl; // collect all weights in mixedWeights and divide by number of processes mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0); // mix confidence counts - mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0); + //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0); ScoreComponentCollection totalBinary; if (sparseAverage) { ScoreComponentCollection binary; @@ -1873,11 +1748,8 @@ int main(int argc, char** argv) { mixedWeights.DivideEquals(size); // divide confidence counts - if (averageConfidenceCounts) { - mixedConfidenceCounts.DivideEquals(size); - cerr << "Rank " << rank << ", epoch " << epoch << ", average confidence counts." << endl; - } - + //mixedConfidenceCounts.DivideEquals(size); + // normalise weights after averaging if (normaliseWeights) { mixedWeights.L1Normalise(); @@ -1905,15 +1777,17 @@ int main(int argc, char** argv) { << pruned << " features pruned from cumulativeWeights." << endl; } - if (l1_regularize && weightMixingThisEpoch == mixingFrequency) { - mixedWeights.L1Regularize(l1_lambda); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << "l1-reg. on mixedWeights with lambda=" << l1_lambda << endl; - - // subtract lambda from every weight in the sum --> multiply by number of updates - cumulativeWeights.L1Regularize(l1_lambda*numberOfUpdatesThisEpoch); - cerr << "Rank " << rank << ", epoch " << epoch << ", " - << "l1-reg. on cumulativeWeights with lambda=" << l1_lambda*numberOfUpdatesThisEpoch << endl; + if (weightMixingThisEpoch == mixingFrequency) { + if (l1_regularize) { + size_t pruned = mixedWeights.SparseL1Regularize(l1_lambda); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl; + } + if (l2_regularize) { + mixedWeights.SparseL2Regularize(l2_lambda); + cerr << "Rank " << rank << ", epoch " << epoch << ", " + << "l2-reg. 
@@ -1817,6 +1690,7 @@ int main(int argc, char** argv) {

        // set new Moses weights
        decoder->setWeights(mosesWeights);
+       cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;

        // adjust bleu weight
        if (bleu_weight_lm_adjust) {
@@ -1852,11 +1726,12 @@ int main(int argc, char** argv) {
      // mix weights?
      if (mix) {
 #ifdef MPI_ENABLE
+       cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
        // collect all weights in mixedWeights and divide by number of processes
        mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);

        // mix confidence counts
-       mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
+       //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
        ScoreComponentCollection totalBinary;
        if (sparseAverage) {
          ScoreComponentCollection binary;
@@ -1873,11 +1748,8 @@ int main(int argc, char** argv) {
          mixedWeights.DivideEquals(size);

          // divide confidence counts
-         if (averageConfidenceCounts) {
-           mixedConfidenceCounts.DivideEquals(size);
-           cerr << "Rank " << rank << ", epoch " << epoch << ", average confidence counts." << endl;
-         }
-
+         //mixedConfidenceCounts.DivideEquals(size);
+
          // normalise weights after averaging
          if (normaliseWeights) {
            mixedWeights.L1Normalise();
@@ -1905,15 +1777,17 @@ int main(int argc, char** argv) {
                << pruned << " features pruned from cumulativeWeights." << endl;
          }

-         if (l1_regularize && weightMixingThisEpoch == mixingFrequency) {
-           mixedWeights.L1Regularize(l1_lambda);
-           cerr << "Rank " << rank << ", epoch " << epoch << ", "
-               << "l1-reg. on mixedWeights with lambda=" << l1_lambda << endl;
-
-           // subtract lambda from every weight in the sum --> multiply by number of updates
-           cumulativeWeights.L1Regularize(l1_lambda*numberOfUpdatesThisEpoch);
-           cerr << "Rank " << rank << ", epoch " << epoch << ", "
-               << "l1-reg. on cumulativeWeights with lambda=" << l1_lambda*numberOfUpdatesThisEpoch << endl;
+         if (weightMixingThisEpoch == mixingFrequency) {
+           if (l1_regularize) {
+             size_t pruned = mixedWeights.SparseL1Regularize(l1_lambda);
+             cerr << "Rank " << rank << ", epoch " << epoch << ", "
+                 << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
+           }
+           if (l2_regularize) {
+             mixedWeights.SparseL2Regularize(l2_lambda);
+             cerr << "Rank " << rank << ", epoch " << epoch << ", "
+                 << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
+           }
          }
        }

@@ -1923,8 +1797,8 @@ int main(int argc, char** argv) {
        mosesWeights = mixedWeights;

        // broadcast summed confidence counts
-       mpi::broadcast(world, mixedConfidenceCounts, 0);
-       confidenceCounts = mixedConfidenceCounts;
+       //mpi::broadcast(world, mixedConfidenceCounts, 0);
+       //confidenceCounts = mixedConfidenceCounts;
 #endif
 #ifndef MPI_ENABLE
@@ -2021,6 +1895,25 @@ int main(int argc, char** argv) {
            cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
            mixedAverageWeights.Save(filename.str());
            ++weightEpochDump;
+
+           if (weightEpochDump == weightDumpFrequency) {
+             if (l1_regularize) {
+               size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
+               cerr << "Rank " << rank << ", epoch " << epoch << ", "
+                   << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
+             }
+             if (l2_regularize) {
+               mixedAverageWeights.SparseL2Regularize(l2_lambda);
+               cerr << "Rank " << rank << ", epoch " << epoch << ", "
+                   << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
+             }
+
+             if (l1_regularize || l2_regularize) {
+               filename << "_reg";
+               cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
+               mixedAverageWeights.Save(filename.str());
+             }
+           }

            if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
              // print out all features with counts
@@ -2042,11 +1935,11 @@ int main(int argc, char** argv) {
    } // end of shard loop, end of this epoch

-   if (printNbestWithFeatures && rank == 0 && epoch == 0) {
-     cerr << "Writing out hope/fear nbest list with features: " << f1 << ", " << f2 << endl;
-     hopePlusFeatures.close();
-     fearPlusFeatures.close();
-   }
+   /*if (printNbestWithFeatures && rank == 0 && epoch == 0) {
+     cerr << "Writing out hope/fear nbest list with features: " << f1 << ", " << f2 << endl;
+     hopePlusFeatures.close();
+     fearPlusFeatures.close();
+   }*/

    if (historyBleu) {
      cerr << "Bleu feature history after epoch " << epoch << endl;
@@ -2133,6 +2026,13 @@ int main(int argc, char** argv) {
  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

+ if (rank == 0) {
+   ScoreComponentCollection dummy;
+   ostringstream endfilename;
+   endfilename << "finished";
+   dummy.Save(endfilename.str());
+ }
+
  delete decoder;
  exit(0);
 }
@@ -2147,7 +2047,7 @@ bool loadSentences(const string& filename, vector<string>& sentences) {
  return true;
 }

-bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, const vector<const ScoreProducer*> &featureFunctions) {
+/*bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, const vector<const ScoreProducer*> &featureFunctions) {
  ifstream in(filename.c_str());
  if (!in)
    return false;
@@ -2180,10 +2080,10 @@ bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, c
        store_weights.push_back(weight);
        if (store_weights.size() == featureFunctions[i]->GetNumScoreComponents()) {
          coreWeightMap.insert(ProducerWeightPair(featureFunctions[i], store_weights));
-         /*cerr << "insert " << store_weights.size() << " weights for " << featureFunctions[i]->GetScoreProducerDescription() << " (";
+         cerr << "insert " << store_weights.size() << " weights for " << featureFunctions[i]->GetScoreProducerDescription() << " (";
          for (size_t j=0; j < store_weights.size(); ++j)
            cerr << store_weights[j] << " ";
-         cerr << ")" << endl;*/
+         cerr << ")" << endl;
          store_weights.clear();
        }
      }
@@ -2191,7 +2091,7 @@ bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, c
    }
  }
  return true;
-}
+}*/

 bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size) {
  if (mix_or_dump_base == 0) return 0;
@@ -2369,7 +2269,7 @@ void scaleFeatureScore(ScoreProducer *sp, float scaling_factor, vector<vector<Sc
  for (size_t i=0; i<featureValues.size(); ++i) { // each item in batch
    for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
      featureScore = featureValues[i][j].GetScoreForProducer(sp);
-     featureValues[i][j].Assign(sp, featureScore/scaling_factor);
+     featureValues[i][j].Assign(sp, featureScore*scaling_factor);
      //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
    }
  }
@@ -2383,7 +2283,7 @@ void scaleFeatureScores(ScoreProducer *sp, float scaling_factor, vector<vector<S
    for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
      vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
      for (size_t k=0; k<featureScores.size(); ++k)
-       featureScores[k] /= scaling_factor;
+       featureScores[k] *= scaling_factor;
      featureValues[i][j].Assign(sp, featureScores);
      //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
    }
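The sign flip in these last two hunks matters: the old scheme scaled a weight up by the factor and divided the feature score to compensate, exploiting the identity w*f = (k*w)*(f/k); with the weight-scaling code removed from the update loop, the score is now multiplied instead, which rescales that feature's effective contribution by k without touching the weight vector. A two-line illustration of why the two scalings must move in opposite directions (plain arithmetic, not Moses code; values chosen to be exact in binary floating point):

    #include <cassert>

    int main() {
      float w = 0.5f, f = -2.0f, k = 2.0f;
      float original     = w * f;               // unscaled contribution
      float weightUpScoreDown = (w * k) * (f / k); // old scheme: cancels out
      float scoreUpOnly  = w * (f * k);         // new scheme: no cancellation
      assert(weightUpScoreDown == original);    // dot-product contribution unchanged
      assert(scoreUpOnly == original * k);      // contribution rescaled by k
      return 0;
    }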