Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'mira/MiraOptimiser.cpp')
-rw-r--r-- [-rwxr-xr-x]  mira/MiraOptimiser.cpp | 421
1 files changed, 416 insertions, 5 deletions
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index c3ccfb011..a0fb07238 100755..100644
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -8,7 +8,6 @@ using namespace std;
namespace Mira {
size_t MiraOptimiser::updateWeights(
- ScoreComponentCollection& currWeights,
ScoreComponentCollection& weightUpdate,
const vector<vector<ScoreComponentCollection> >& featureValues,
const vector<vector<float> >& losses,
@@ -152,7 +151,6 @@ size_t MiraOptimiser::updateWeights(
}
size_t MiraOptimiser::updateWeightsHopeFear(
- Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
@@ -315,7 +313,6 @@ size_t MiraOptimiser::updateWeightsHopeFear(
}
size_t MiraOptimiser::updateWeightsAnalytically(
- ScoreComponentCollection& currWeights,
ScoreComponentCollection& weightUpdate,
ScoreComponentCollection& featureValuesHope,
ScoreComponentCollection& featureValuesFear,
@@ -457,7 +454,6 @@ size_t MiraOptimiser::updateWeightsAnalytically(
}
size_t MiraOptimiser::updateWeightsRankModel(
- Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector<std::vector<float> >& bleuScores,
@@ -478,11 +474,12 @@ size_t MiraOptimiser::updateWeightsRankModel(
float oldDistanceFromOptimum = 0;
// iterate over input sentences (1 (online) or more (batch))
- float minBleuDifference = 1.0;
+ float minBleuDifference = 0.005;
for (size_t i = 0; i < featureValues.size(); ++i) {
// Build all pairs where the first has higher Bleu than the second
for (size_t j = 0; j < featureValues[i].size(); ++j) {
for (size_t k = 0; k < featureValues[i].size(); ++k) {
+ if (j == k) continue;
if (bleuScores[i][j] - minBleuDifference < bleuScores[i][k]) // need at least a positive Bleu difference of specified amount
continue;
@@ -598,5 +595,419 @@ size_t MiraOptimiser::updateWeightsRankModel(
return 0;
}
+// Selective hope/fear MIRA update: for every hope/fear hypothesis pair,
+// the feature-value difference vector is decomposed into one constraint
+// per non-zero feature (core features and sparse hope-/fear-side features
+// handled separately), and Hildreth's QP solver computes one alpha per
+// constraint. The alpha-weighted features are summed into weightUpdate.
+//
+//   weightUpdate    in/out: the computed update is added into this vector
+//   featureValues*/bleuScores*/modelScores*: parallel [sentence][hyp] data
+//   learning_rate   scales the summed update when != 1
+//   rank, epoch     used only for cerr logging
+//   updatePosition  if != -1, only this sentence index is processed
+// Returns 0 if an update was applied, 1 if no constraint was violated.
+size_t MiraOptimiser::updateWeightsHopeFearSelective(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition) {
+
+ // vector of feature values differences for all created constraints
+ vector<ScoreComponentCollection> nonZeroFeatures;
+ vector<float> lossMinusModelScoreDiffs;
+
+ // Make constraints for new hypothesis translations
+ float epsilon = 0.0001;
+ int violatedConstraintsBefore = 0;
+
+ // iterate over input sentences (1 (online) or more (batch))
+ for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+ // optionally restrict the batch update to a single sentence position
+ // NOTE(review): i (size_t) is compared with updatePosition (int);
+ // safe only because of the != -1 guard — consider a cast for clarity
+ if (updatePosition != -1) {
+ if (i < updatePosition)
+ continue;
+ else if (i > updatePosition)
+ break;
+ }
+
+ // Pick all pairs[j,j] of hope and fear translations for one input sentence
+ for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+ ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+ featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ // check if constraint is violated
+ float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j];
+ float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
+ float diff = 0;
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
+ // iterate over difference vector and add a constraint for every non-zero feature
+ FVector features = featureValueDiff.GetScoresVector();
+ size_t n_core = 0, n_sparse = 0, n_sparse_hope = 0, n_sparse_fear = 0;
+ // NOTE(review): this loop variable i shadows the sentence index i of
+ // the enclosing loop; it is only used for features[i], but renaming
+ // it would avoid confusion
+ for (size_t i=0; i<features.coreSize(); ++i) {
+ if (features[i] != 0.0) {
+ ++n_core;
+ ScoreComponentCollection f;
+ f.Assign(i, features[i]);
+ nonZeroFeatures.push_back(f);
+ }
+ }
+ // 1 + 2
+ /*for (FVector::iterator i = features.begin(); i != features.end(); ++i) {
+ if (i->second != 0.0) {
+ ++n_sparse;
+ ScoreComponentCollection f;
+ f.Assign((i->first).name(), i->second);
+ nonZeroFeatures.push_back(f);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl;
+ }
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", non-zero features: " << nonZeroFeatures.size() << endl;*/
+
+ // 3
+ // scheme 3: split sparse features by sign into hope-side (positive
+ // difference) and fear-side (negative difference) constraints
+ vector<ScoreComponentCollection> nonZeroFeaturesHope;
+ vector<ScoreComponentCollection> nonZeroFeaturesFear;
+ for (FVector::iterator i = features.begin(); i != features.end(); ++i) {
+ if (i->second != 0.0) {
+ ScoreComponentCollection f;
+ f.Assign((i->first).name(), i->second);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl;
+
+ if (i->second > 0.0) {
+ ++n_sparse_hope;
+ nonZeroFeaturesHope.push_back(f);
+ }
+ else {
+ ++n_sparse_fear;
+ nonZeroFeaturesFear.push_back(f);
+ }
+ }
+ }
+
+ //1
+ /*float n = n_core + n_sparse;
+ for (size_t i=0; i<n; ++i)
+ lossMinusModelScoreDiffs.push_back(diff/n);
+
+ //2
+ float diff_10 = diff * 0.1;
+ float diff_90 = diff * 0.9;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", core diff: " << diff_10/n_core << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", sparse diff: " << diff_90/n_sparse << endl;
+ for (size_t i=0; i<n_core; ++i)
+ lossMinusModelScoreDiffs.push_back(diff_10/n_core);
+ for (size_t i=0; i<n_sparse; ++i)
+ lossMinusModelScoreDiffs.push_back(diff_90/n_sparse);*/
+
+ // 3
+ // distribute the violation evenly over all constraints, with a 10%
+ // larger share (factor 1.1) for hope-side sparse features
+ float n = n_core + n_sparse_hope + n_sparse_fear;
+ for (size_t i=0; i<n_core; ++i)
+ lossMinusModelScoreDiffs.push_back(diff/n);
+ for (size_t i=0; i<n_sparse_hope; ++i) {
+ nonZeroFeatures.push_back(nonZeroFeaturesHope[i]);
+ lossMinusModelScoreDiffs.push_back((diff/n)*1.1);
+ }
+ for (size_t i=0; i<n_sparse_fear; ++i) {
+ nonZeroFeatures.push_back(nonZeroFeaturesFear[i]);
+ lossMinusModelScoreDiffs.push_back(diff/n);
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", core diff: " << diff/n << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope diff: " << ((diff/n)*1.1) << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", fear diff: " << diff/n << endl;
+ }
+ }
+
+ assert(nonZeroFeatures.size() == lossMinusModelScoreDiffs.size());
+
+ // run optimisation: compute alphas for all given constraints
+ vector<float> alphas;
+ ScoreComponentCollection summedUpdate;
+ if (violatedConstraintsBefore > 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << nonZeroFeatures.size() << endl;
+ alphas = Hildreth::optimise(nonZeroFeatures, lossMinusModelScoreDiffs, m_slack);
+
+ // Update the weight vector according to the alphas and the feature value differences
+ // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
+ for (size_t k = 0; k < nonZeroFeatures.size(); ++k) {
+ float alpha = alphas[k];
+ cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
+ if (alpha != 0) {
+ ScoreComponentCollection update(nonZeroFeatures[k]);
+ update.MultiplyEquals(alpha);
+
+ // sum updates
+ summedUpdate.PlusEquals(update);
+ }
+ }
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ // return 0;
+ // return 1 signals "no update applied" to the caller
+ return 1;
+ }
+
+ // apply learning rate
+ if (learning_rate != 1) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl;
+ summedUpdate.MultiplyEquals(learning_rate);
+ }
+
+ // scale update by BLEU of oracle (for batch size 1 only)
+ if (featureValuesHope.size() == 1) {
+ if (m_scale_update) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling summed update with oracle bleu score " << bleuScoresHope[0][0] << endl;
+ summedUpdate.MultiplyEquals(bleuScoresHope[0][0]);
+ }
+ }
+
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
+ weightUpdate.PlusEquals(summedUpdate);
+ return 0;
+}
+
+// Summed hope/fear MIRA update: all hope/fear feature-value differences
+// are accumulated into a single averaged constraint, and a closed-form
+// alpha (Crammer & Singer 2006: alpha = min{C, loss/||x||^2}) is computed
+// and clipped to the slack m_slack before being applied to weightUpdate.
+//
+//   rescaleSlack  if true, rescale the model score diff and the running
+//                 feature diff by the loss diff (slack rescaling,
+//                 cf. Joachims 2009 OP5, per the comment below)
+//   rewardHope    if true, call RedistributeMass(0.1) on the averaged
+//                 diffs before computing alpha
+//   makePairs     if true, sum per-pair [j,j] hope/fear differences;
+//                 if false, compare the summed hope batch against the
+//                 summed fear batch as one constraint
+// Returns 0 if an update was applied, 1 if no constraint was violated.
+size_t MiraOptimiser::updateWeightsHopeFearSummed(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ bool rescaleSlack,
+ bool rewardHope,
+ bool makePairs) {
+
+ // vector of feature values differences for all created constraints
+ ScoreComponentCollection averagedFeatureDiffs;
+ float averagedViolations = 0;
+
+ // Make constraints for new hypothesis translations
+ float epsilon = 0.0001;
+ int violatedConstraintsBefore = 0;
+
+ if (!makePairs) {
+ ScoreComponentCollection featureValueDiff;
+ float lossHope = 0, lossFear = 0, modelScoreHope = 0, modelScoreFear = 0, hopeCount = 0, fearCount = 0;
+ // add all hope vectors
+ for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+ for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+ featureValueDiff.PlusEquals(featureValuesHope[i][j]);
+ lossHope += bleuScoresHope[i][j];
+ modelScoreHope += modelScoresHope[i][j];
+ ++hopeCount;
+ }
+ }
+ lossHope /= hopeCount;
+ modelScoreHope /= hopeCount;
+
+ // subtract all fear vectors
+ for (size_t i = 0; i < featureValuesFear.size(); ++i) {
+ for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
+ featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+ lossFear += bleuScoresFear[i][j];
+ modelScoreFear += modelScoresFear[i][j];
+ ++fearCount;
+ }
+ }
+ lossFear /= fearCount;
+ modelScoreFear /= fearCount;
+
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ return 1;
+ }
+
+ // check if constraint is violated
+ float lossDiff = lossHope - lossFear;
+ float modelScoreDiff = modelScoreHope - modelScoreFear;
+ float diff = 0;
+ if (lossDiff > modelScoreDiff)
+ diff = lossDiff - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " <<\
+ diff << ")" << endl;
+
+ // add constraint
+ averagedFeatureDiffs = featureValueDiff;
+ averagedViolations = diff;
+ }
+ else {
+ // iterate over input sentences (1 (online) or more (batch))
+ for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+ // Pick all pairs[j,j] of hope and fear translations for one input sentence and add them up
+ for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+ ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+ featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ // check if constraint is violated
+ float lossDiff = bleuScoresHope[i][j] - bleuScoresFear[i][j];
+ float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
+ if (rescaleSlack) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", modelScoreDiff scaled by lossDiff: " << modelScoreDiff << " --> " << modelScoreDiff*lossDiff << endl;
+ modelScoreDiff *= lossDiff;
+ }
+ float diff = 0;
+ if (lossDiff > modelScoreDiff)
+ diff = lossDiff - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl;
+
+ // add constraint
+ // NOTE(review): this multiplies the entire running sum, so pairs
+ // accumulated in earlier iterations are rescaled again by each new
+ // lossDiff — confirm this is intended rather than scaling only the
+ // current featureValueDiff
+ if (rescaleSlack) {
+ averagedFeatureDiffs.MultiplyEquals(lossDiff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", featureValueDiff scaled by lossDiff." << endl;
+ }
+ averagedFeatureDiffs.PlusEquals(featureValueDiff);
+ averagedViolations += diff;
+ }
+ }
+ }
+
+ // divide by number of constraints (1/n)
+ // NOTE(review): the divisor is the first sentence's hypothesis count,
+ // not the total number of accumulated constraints, and in the
+ // !makePairs branch the violation is not averaged — confirm intended
+ if (!makePairs) {
+ averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
+ }
+ else {
+ averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
+ averagedViolations /= featureValuesHope[0].size();
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged feature diffs: " << averagedFeatureDiffs << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl;
+
+ if (violatedConstraintsBefore > 0) {
+ if (rewardHope) {
+ averagedFeatureDiffs.RedistributeMass(0.1);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", redistributed feature diffs: " << averagedFeatureDiffs << endl;
+ }
+
+ // compute alpha for given constraint: (loss diff - model score diff) / || feature value diff ||^2
+ // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
+ // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
+ // adjusted for 1 slack according to Joachims 2009, OP4 (margin rescaling), OP5 (slack rescaling)
+ float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm();
+ float alpha = averagedViolations / squaredNorm;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
+ if (m_slack > 0 ) {
+ // clip alpha to [-m_slack, m_slack]
+ if (alpha > m_slack) {
+ alpha = m_slack;
+ }
+ else if (alpha < m_slack*(-1)) {
+ alpha = m_slack*(-1);
+ }
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl;
+
+ // compute update
+ averagedFeatureDiffs.MultiplyEquals(alpha);
+ weightUpdate.PlusEquals(averagedFeatureDiffs);
+ return 0;
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ return 1;
+ }
+}
+
+// Summed rank-model update: for each input sentence, builds all ordered
+// hypothesis pairs (j,k) where hypothesis j beats k by at least
+// minBleuDifference BLEU, averages their feature differences and
+// violations into a single constraint, and applies the closed-form
+// clipped alpha update (Crammer & Singer 2006) to weightUpdate.
+//   rank, epoch   used only for cerr logging
+// Returns 0 if an update was applied, 1 if no constraint was violated.
+size_t MiraOptimiser::updateWeightsRankModelSummed(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
+ const std::vector<std::vector<float> >& bleuScores,
+ const std::vector<std::vector<float> >& modelScores,
+ float learning_rate,
+ size_t rank,
+ size_t epoch) {
+ // vector of feature values differences for all created constraints
+ ScoreComponentCollection averagedFeatureDiffs;
+ float averagedViolations = 0;
+
+ // Make constraints for new hypothesis translations
+ float epsilon = 0.0001;
+ int violatedConstraintsBefore = 0;
+ float oldDistanceFromOptimum = 0;
+
+ // iterate over input sentences (1 (online) or more (batch))
+ float minBleuDifference = 0.005;
+ size_t numConstraints = 0;
+ for (size_t i = 0; i < featureValues.size(); ++i) {
+ // Build all pairs where the first has higher Bleu than the second
+ for (size_t j = 0; j < featureValues[i].size(); ++j) {
+ for (size_t k = 0; k < featureValues[i].size(); ++k) {
+ // skip self-pairs
+ if (j == k) continue;
+ if (bleuScores[i][j] - minBleuDifference < bleuScores[i][k]) // need at least a positive Bleu difference of specified amount
+ continue;
+
+ ScoreComponentCollection featureValueDiff = featureValues[i][j];
+ featureValueDiff.MinusEquals(featureValues[i][k]);
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ // check if constraint is violated
+ float loss = bleuScores[i][j] - bleuScores[i][k];
+ float modelScoreDiff = modelScores[i][j] - modelScores[i][k];
+ float diff = 0;
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
+ // add constraint
+ averagedFeatureDiffs.PlusEquals(featureValueDiff);
+ averagedViolations += diff;
+ ++numConstraints;
+ }
+ }
+ }
+
+ // divide by number of constraints
+ // NOTE(review): if no pair passed the filters above, numConstraints is 0
+ // and these divisions produce NaN/inf (logged below) before the
+ // "no constraint violated" early return — consider guarding
+ averagedFeatureDiffs.DivideEquals(numConstraints);
+ averagedViolations /= numConstraints;
+
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged feature diffs: " << averagedFeatureDiffs << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl;
+
+ if (violatedConstraintsBefore > 0) {
+ // compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
+ // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
+ // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
+ float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm();
+ float alpha = averagedViolations / squaredNorm;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
+ if (m_slack > 0 ) {
+ // clip alpha to [-m_slack, m_slack]
+ if (alpha > m_slack) {
+ alpha = m_slack;
+ }
+ else if (alpha < m_slack*(-1)) {
+ alpha = m_slack*(-1);
+ }
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl;
+
+ // compute update
+ averagedFeatureDiffs.MultiplyEquals(alpha);
+ weightUpdate.PlusEquals(averagedFeatureDiffs);
+ return 0;
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ return 1;
+ }
+}
+
}