Diffstat (limited to 'mira/MiraOptimiser.cpp')
-rw-r--r-- [-rwxr-xr-x]  mira/MiraOptimiser.cpp  421
1 file changed, 416 insertions(+), 5 deletions(-)
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index c3ccfb011..a0fb07238 100755..100644
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -8,7 +8,6 @@ using namespace std;
 namespace Mira {
 
 size_t MiraOptimiser::updateWeights(
-  ScoreComponentCollection& currWeights,
   ScoreComponentCollection& weightUpdate,
   const vector<vector<ScoreComponentCollection> >& featureValues,
   const vector<vector<float> >& losses,
@@ -152,7 +151,6 @@ size_t MiraOptimiser::updateWeights(
 }
 
 size_t MiraOptimiser::updateWeightsHopeFear(
-  Moses::ScoreComponentCollection& currWeights,
   Moses::ScoreComponentCollection& weightUpdate,
   const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
   const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
@@ -315,7 +313,6 @@ size_t MiraOptimiser::updateWeightsHopeFear(
 }
 
 size_t MiraOptimiser::updateWeightsAnalytically(
-  ScoreComponentCollection& currWeights,
   ScoreComponentCollection& weightUpdate,
   ScoreComponentCollection& featureValuesHope,
   ScoreComponentCollection& featureValuesFear,
@@ -457,7 +454,6 @@ size_t MiraOptimiser::updateWeightsAnalytically(
 }
 
 size_t MiraOptimiser::updateWeightsRankModel(
-  Moses::ScoreComponentCollection& currWeights,
   Moses::ScoreComponentCollection& weightUpdate,
   const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
   const std::vector<std::vector<float> >& bleuScores,
@@ -478,11 +474,12 @@ size_t MiraOptimiser::updateWeightsRankModel(
   float oldDistanceFromOptimum = 0;
 
   // iterate over input sentences (1 (online) or more (batch))
-  float minBleuDifference = 1.0;
+  float minBleuDifference = 0.005;
   for (size_t i = 0; i < featureValues.size(); ++i) {
     // Build all pairs where the first has higher Bleu than the second
     for (size_t j = 0; j < featureValues[i].size(); ++j) {
       for (size_t k = 0; k < featureValues[i].size(); ++k) {
+        if (j == k) continue;
         if (bleuScores[i][j] - minBleuDifference < bleuScores[i][k])
           // need at least a positive Bleu difference of specified amount
           continue;
@@ -598,5 +595,419 @@ size_t MiraOptimiser::updateWeightsRankModel(
   return 0;
 }
 
+size_t MiraOptimiser::updateWeightsHopeFearSelective(
+  Moses::ScoreComponentCollection& weightUpdate,
+  const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+  const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+  const std::vector<std::vector<float> >& bleuScoresHope,
+  const std::vector<std::vector<float> >& bleuScoresFear,
+  const std::vector<std::vector<float> >& modelScoresHope,
+  const std::vector<std::vector<float> >& modelScoresFear,
+  float learning_rate,
+  size_t rank,
+  size_t epoch,
+  int updatePosition) {
+
+  // vector of feature values differences for all created constraints
+  vector<ScoreComponentCollection> nonZeroFeatures;
+  vector<float> lossMinusModelScoreDiffs;
+
+  // Make constraints for new hypothesis translations
+  float epsilon = 0.0001;
+  int violatedConstraintsBefore = 0;
+
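+  // This "selective" variant splits each hope/fear pair into one constraint
+  // per non-zero feature of the difference vector, so the Hildreth optimiser
+  // below can assign an individual alpha to every active feature.
+  // updatePosition == -1 appears to mean "build constraints for the whole
+  // batch"; otherwise only the sentence at that batch position is used.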
+  // iterate over input sentences (1 (online) or more (batch))
+  for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+    if (updatePosition != -1) {
+      if (i < updatePosition)
+        continue;
+      else if (i > updatePosition)
+        break;
+    }
+
+    // Pick all pairs[j,j] of hope and fear translations for one input sentence
+    for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+      ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+      featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+      if (featureValueDiff.GetL1Norm() == 0) {
+        cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+        continue;
+      }
+
+      // check if constraint is violated
+      float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j];
+      float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
+      float diff = 0;
+      if (loss > modelScoreDiff)
+        diff = loss - modelScoreDiff;
+      if (diff > epsilon)
+        ++violatedConstraintsBefore;
+      cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
+      // iterate over difference vector and add a constraint for every non-zero feature
+      FVector features = featureValueDiff.GetScoresVector();
+      size_t n_core = 0, n_sparse = 0, n_sparse_hope = 0, n_sparse_fear = 0;
+      for (size_t i=0; i<features.coreSize(); ++i) {
+        if (features[i] != 0.0) {
+          ++n_core;
+          ScoreComponentCollection f;
+          f.Assign(i, features[i]);
+          nonZeroFeatures.push_back(f);
+        }
+      }
+
+      // 1 + 2
+      /*for (FVector::iterator i = features.begin(); i != features.end(); ++i) {
+        if (i->second != 0.0) {
+          ++n_sparse;
+          ScoreComponentCollection f;
+          f.Assign((i->first).name(), i->second);
+          nonZeroFeatures.push_back(f);
+          cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl;
+        }
+      }
+      cerr << "Rank " << rank << ", epoch " << epoch << ", non-zero features: " << nonZeroFeatures.size() << endl;*/
+
+      // 3
+      vector<ScoreComponentCollection> nonZeroFeaturesHope;
+      vector<ScoreComponentCollection> nonZeroFeaturesFear;
+      for (FVector::iterator i = features.begin(); i != features.end(); ++i) {
+        if (i->second != 0.0) {
+          ScoreComponentCollection f;
+          f.Assign((i->first).name(), i->second);
+          cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl;
+
+          if (i->second > 0.0) {
+            ++n_sparse_hope;
+            nonZeroFeaturesHope.push_back(f);
+          }
+          else {
+            ++n_sparse_fear;
+            nonZeroFeaturesFear.push_back(f);
+          }
+        }
+      }
+
+      //1
+      /*float n = n_core + n_sparse;
+      for (size_t i=0; i<n; ++i)
+        lossMinusModelScoreDiffs.push_back(diff/n);
+
+      //2
+      float diff_10 = diff * 0.1;
+      float diff_90 = diff * 0.9;
+      cerr << "Rank " << rank << ", epoch " << epoch << ", core diff: " << diff_10/n_core << endl;
+      cerr << "Rank " << rank << ", epoch " << epoch << ", sparse diff: " << diff_90/n_sparse << endl;
+      for (size_t i=0; i<n_core; ++i)
+        lossMinusModelScoreDiffs.push_back(diff_10/n_core);
+      for (size_t i=0; i<n_sparse; ++i)
+        lossMinusModelScoreDiffs.push_back(diff_90/n_sparse);*/
+
+      // 3
+      float n = n_core + n_sparse_hope + n_sparse_fear;
+      for (size_t i=0; i<n_core; ++i)
+        lossMinusModelScoreDiffs.push_back(diff/n);
+      for (size_t i=0; i<n_sparse_hope; ++i) {
+        nonZeroFeatures.push_back(nonZeroFeaturesHope[i]);
+        lossMinusModelScoreDiffs.push_back((diff/n)*1.1);
+      }
+      for (size_t i=0; i<n_sparse_fear; ++i) {
+        nonZeroFeatures.push_back(nonZeroFeaturesFear[i]);
+        lossMinusModelScoreDiffs.push_back(diff/n);
+      }
+      cerr << "Rank " << rank << ", epoch " << epoch << ", core diff: " << diff/n << endl;
+      cerr << "Rank " << rank << ", epoch " << epoch << ", hope diff: " << ((diff/n)*1.1) << endl;
+      cerr << "Rank " << rank << ", epoch " << epoch << ", fear diff: " << diff/n << endl;
+    }
+  }
+
+  assert(nonZeroFeatures.size() == lossMinusModelScoreDiffs.size());
+
+  // run optimisation: compute alphas for all given constraints
+  vector<float> alphas;
+  ScoreComponentCollection summedUpdate;
+  if (violatedConstraintsBefore > 0) {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << nonZeroFeatures.size() << endl;
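+    // Hildreth's algorithm solves the dual of the QP over this constraint
+    // set, returning one alpha per constraint; m_slack (if > 0) appears to
+    // act as the upper bound C on each alpha, matching the Crammer & Singer
+    // formula cited in the analytic variants below.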
+    alphas = Hildreth::optimise(nonZeroFeatures, lossMinusModelScoreDiffs, m_slack);
+
+    // Update the weight vector according to the alphas and the feature value differences
+    // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
+    for (size_t k = 0; k < nonZeroFeatures.size(); ++k) {
+      float alpha = alphas[k];
+      cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
+      if (alpha != 0) {
+        ScoreComponentCollection update(nonZeroFeatures[k]);
+        update.MultiplyEquals(alpha);
+
+        // sum updates
+        summedUpdate.PlusEquals(update);
+      }
+    }
+  }
+  else {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+    // return 0;
+    return 1;
+  }
+
+  // apply learning rate
+  if (learning_rate != 1) {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl;
+    summedUpdate.MultiplyEquals(learning_rate);
+  }
+
+  // scale update by BLEU of oracle (for batch size 1 only)
+  if (featureValuesHope.size() == 1) {
+    if (m_scale_update) {
+      cerr << "Rank " << rank << ", epoch " << epoch << ", scaling summed update with oracle bleu score " << bleuScoresHope[0][0] << endl;
+      summedUpdate.MultiplyEquals(bleuScoresHope[0][0]);
+    }
+  }
+
+  //cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
+  weightUpdate.PlusEquals(summedUpdate);
+  return 0;
+}
+
+size_t MiraOptimiser::updateWeightsHopeFearSummed(
+  Moses::ScoreComponentCollection& weightUpdate,
+  const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+  const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+  const std::vector<std::vector<float> >& bleuScoresHope,
+  const std::vector<std::vector<float> >& bleuScoresFear,
+  const std::vector<std::vector<float> >& modelScoresHope,
+  const std::vector<std::vector<float> >& modelScoresFear,
+  float learning_rate,
+  size_t rank,
+  size_t epoch,
+  bool rescaleSlack,
+  bool rewardHope,
+  bool makePairs) {
+
+  // vector of feature values differences for all created constraints
+  ScoreComponentCollection averagedFeatureDiffs;
+  float averagedViolations = 0;
+
+  // Make constraints for new hypothesis translations
+  float epsilon = 0.0001;
+  int violatedConstraintsBefore = 0;
+
+  if (!makePairs) {
+    ScoreComponentCollection featureValueDiff;
+    float lossHope = 0, lossFear = 0, modelScoreHope = 0, modelScoreFear = 0, hopeCount = 0, fearCount = 0;
+    // add all hope vectors
+    for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+      for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+        featureValueDiff.PlusEquals(featureValuesHope[i][j]);
+        lossHope += bleuScoresHope[i][j];
+        modelScoreHope += modelScoresHope[i][j];
+        ++hopeCount;
+      }
+    }
+    lossHope /= hopeCount;
+    modelScoreHope /= hopeCount;
+
+    // subtract all fear vectors
+    for (size_t i = 0; i < featureValuesFear.size(); ++i) {
+      for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
+        featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+        lossFear += bleuScoresFear[i][j];
+        modelScoreFear += modelScoresFear[i][j];
+        ++fearCount;
+      }
+    }
+    lossFear /= fearCount;
+    modelScoreFear /= fearCount;
+
+    if (featureValueDiff.GetL1Norm() == 0) {
+      cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+      cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+      return 1;
+    }
+
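+    // In this !makePairs mode a single aggregate constraint is built: the
+    // feature difference is the sum of all hope vectors minus the sum of all
+    // fear vectors, while the Bleu losses and model scores are averaged.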
+    // check if constraint is violated
+    float lossDiff = lossHope - lossFear;
+    float modelScoreDiff = modelScoreHope - modelScoreFear;
+    float diff = 0;
+    if (lossDiff > modelScoreDiff)
+      diff = lossDiff - modelScoreDiff;
+    if (diff > epsilon)
+      ++violatedConstraintsBefore;
+    cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl;
+
+    // add constraint
+    averagedFeatureDiffs = featureValueDiff;
+    averagedViolations = diff;
+  }
+  else {
+    // iterate over input sentences (1 (online) or more (batch))
+    for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+      // Pick all pairs[j,j] of hope and fear translations for one input sentence and add them up
+      for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+        ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+        featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+        if (featureValueDiff.GetL1Norm() == 0) {
+          cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+          continue;
+        }
+
+        // check if constraint is violated
+        float lossDiff = bleuScoresHope[i][j] - bleuScoresFear[i][j];
+        float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
+        if (rescaleSlack) {
+          cerr << "Rank " << rank << ", epoch " << epoch << ", modelScoreDiff scaled by lossDiff: " << modelScoreDiff << " --> " << modelScoreDiff*lossDiff << endl;
+          modelScoreDiff *= lossDiff;
+        }
+        float diff = 0;
+        if (lossDiff > modelScoreDiff)
+          diff = lossDiff - modelScoreDiff;
+        if (diff > epsilon)
+          ++violatedConstraintsBefore;
+        cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl;
+
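+        // With rescaleSlack, both the model score difference above and the
+        // feature difference below are multiplied by the loss, i.e. slack
+        // rescaling in the sense of Joachims 2009 (OP5), as cited further down.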
+        // add constraint
+        if (rescaleSlack) {
+          featureValueDiff.MultiplyEquals(lossDiff);
+          cerr << "Rank " << rank << ", epoch " << epoch << ", featureValueDiff scaled by lossDiff." << endl;
+        }
+        averagedFeatureDiffs.PlusEquals(featureValueDiff);
+        averagedViolations += diff;
+      }
+    }
+  }
+
+  // divide by number of constraints (1/n)
+  if (!makePairs) {
+    averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
+  }
+  else {
+    averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
+    averagedViolations /= featureValuesHope[0].size();
+  }
+  cerr << "Rank " << rank << ", epoch " << epoch << ", averaged feature diffs: " << averagedFeatureDiffs << endl;
+  cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl;
+
+  if (violatedConstraintsBefore > 0) {
+    if (rewardHope) {
+      averagedFeatureDiffs.RedistributeMass(0.1);
+      cerr << "Rank " << rank << ", epoch " << epoch << ", redistributed feature diffs: " << averagedFeatureDiffs << endl;
+    }
+
+    // compute alpha for given constraint: (loss diff - model score diff) / || feature value diff ||^2
+    // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
+    // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
+    // adjusted for 1 slack according to Joachims 2009, OP4 (margin rescaling), OP5 (slack rescaling)
+    float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm();
+    float alpha = averagedViolations / squaredNorm;
+    cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
+    if (m_slack > 0) {
+      if (alpha > m_slack) {
+        alpha = m_slack;
+      }
+      else if (alpha < m_slack*(-1)) {
+        alpha = m_slack*(-1);
+      }
+    }
+    cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl;
+
+    // compute update
+    averagedFeatureDiffs.MultiplyEquals(alpha);
+    weightUpdate.PlusEquals(averagedFeatureDiffs);
+    return 0;
+  }
+  else {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+    return 1;
+  }
+}
+
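+// Summed variant of updateWeightsRankModel: all ordered pairs (j, k) whose
+// Bleu advantage is at least minBleuDifference are averaged into a single
+// 1-slack constraint, then applied analytically with
+//   alpha = min(C, averagedViolation / ||averaged feature diff||^2)
+// following the Crammer & Singer 2006 formula cited in the body below.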
+size_t MiraOptimiser::updateWeightsRankModelSummed(
+  Moses::ScoreComponentCollection& weightUpdate,
+  const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
+  const std::vector<std::vector<float> >& bleuScores,
+  const std::vector<std::vector<float> >& modelScores,
+  float learning_rate,
+  size_t rank,
+  size_t epoch) {
+
+  // vector of feature values differences for all created constraints
+  ScoreComponentCollection averagedFeatureDiffs;
+  float averagedViolations = 0;
+
+  // Make constraints for new hypothesis translations
+  float epsilon = 0.0001;
+  int violatedConstraintsBefore = 0;
+  float oldDistanceFromOptimum = 0;
+
+  // iterate over input sentences (1 (online) or more (batch))
+  float minBleuDifference = 0.005;
+  size_t numConstraints = 0;
+  for (size_t i = 0; i < featureValues.size(); ++i) {
+    // Build all pairs where the first has higher Bleu than the second
+    for (size_t j = 0; j < featureValues[i].size(); ++j) {
+      for (size_t k = 0; k < featureValues[i].size(); ++k) {
+        if (j == k) continue;
+        if (bleuScores[i][j] - minBleuDifference < bleuScores[i][k])
+          // need at least a positive Bleu difference of specified amount
+          continue;
+
+        ScoreComponentCollection featureValueDiff = featureValues[i][j];
+        featureValueDiff.MinusEquals(featureValues[i][k]);
+        if (featureValueDiff.GetL1Norm() == 0) {
+          cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+          continue;
+        }
+
+        // check if constraint is violated
+        float loss = bleuScores[i][j] - bleuScores[i][k];
+        float modelScoreDiff = modelScores[i][j] - modelScores[i][k];
+        float diff = 0;
+        if (loss > modelScoreDiff)
+          diff = loss - modelScoreDiff;
+        if (diff > epsilon)
+          ++violatedConstraintsBefore;
+        cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
+        // add constraint
+        averagedFeatureDiffs.PlusEquals(featureValueDiff);
+        averagedViolations += diff;
+        ++numConstraints;
+      }
+    }
+  }
+
+  // divide by number of constraints
+  averagedFeatureDiffs.DivideEquals(numConstraints);
+  averagedViolations /= numConstraints;
+
+  cerr << "Rank " << rank << ", epoch " << epoch << ", averaged feature diffs: " << averagedFeatureDiffs << endl;
+  cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl;
+
+  if (violatedConstraintsBefore > 0) {
+    // compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
+    // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
+    // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
+    float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm();
+    float alpha = averagedViolations / squaredNorm;
+    cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
+    if (m_slack > 0) {
+      if (alpha > m_slack) {
+        alpha = m_slack;
+      }
+      else if (alpha < m_slack*(-1)) {
+        alpha = m_slack*(-1);
+      }
+    }
+    cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl;
+
+    // compute update
+    averagedFeatureDiffs.MultiplyEquals(alpha);
+    weightUpdate.PlusEquals(averagedFeatureDiffs);
+    return 0;
+  }
+  else {
+    cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+    return 1;
+  }
+}
+
 }