github.com/moses-smt/mosesdecoder.git
author    Eva Hasler <ehasler@saxnot.inf.ed.ac.uk>  2012-05-18 21:58:42 +0400
committer Eva Hasler <ehasler@saxnot.inf.ed.ac.uk>  2012-05-18 21:58:42 +0400
commit    4f766434866345c3b59f2c8b690a67504ab7cc5a (patch)
tree      992179696f5df5d271e43cd7bc72668dca2dbc66 /mira
parent    c8f04a7e481113272f01c99b77cdf26097a5fe37 (diff)
update feature learning rate code
Diffstat (limited to 'mira')
-rwxr-xr-x  mira/Main.cpp           88
-rwxr-xr-x  mira/Main.h              2
-rwxr-xr-x  mira/MiraOptimiser.cpp   6
3 files changed, 73 insertions(+), 23 deletions(-)
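
At a glance, the commit replaces the single global feature learning rate with separate start rates for core and sparse features (core-r0, sparse-r0), applies those rates to the feature values before the optimiser runs rather than scaling the weight update afterwards, and makes averaging of confidence counts across MPI processes optional (avg-conf-counts). As a minimal standalone sketch of the fixed-rate path (a plain map stands in for Moses's ScoreComponentCollection; the names here are illustrative, not the library API):

#include <map>
#include <string>

// Hypothetical stand-in for ScoreComponentCollection: feature name -> value.
typedef std::map<std::string, float> FeatureVector;

// Mirror of the fixed-rate idea in applyLearningRates below: core features
// are scaled by core_r0, all other (sparse) features by sparse_r0.
void scaleByFixedRates(FeatureVector &features, float core_r0, float sparse_r0,
                       bool (*isCore)(const std::string &)) {
  for (std::map<std::string, float>::iterator it = features.begin();
       it != features.end(); ++it)
    it->second *= isCore(it->first) ? core_r0 : sparse_r0;
}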
diff --git a/mira/Main.cpp b/mira/Main.cpp
index 5901d5d1b..3b558e911 100755
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -126,14 +126,18 @@ int main(int argc, char** argv) {
bool l1_regularize, l2_regularize;
float l1_lambda, l2_lambda;
bool most_violated, all_violated, max_bleu_diff, one_against_all;
- bool feature_confidence, signed_counts;
- float decay;
+ bool feature_confidence, signed_counts, averageConfidenceCounts;
+ float decay, core_r0, sparse_r0;
po::options_description desc("Allowed options");
desc.add_options()
("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
+ ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
+ ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
+ ("avg-conf-counts", po::value<bool>(&averageConfidenceCounts)->default_value(true), "Divide confidence counts by number of processors")
+
("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
@@ -221,7 +225,7 @@ int main(int argc, char** argv) {
("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale update by precision of oracle")
("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
- ("signed-counts", po::value<bool>(&signed_counts)->default_value(true), "use signed counts for feature learning rates")
+ ("signed-counts", po::value<bool>(&signed_counts)->default_value(false), "Use signed counts for feature learning rates")
("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
@@ -636,7 +640,8 @@ int main(int argc, char** argv) {
// variables for feature confidence
ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
- featureLearningRates.UpdateLearningRates(decay, confidenceCounts); //initialise core learning rates
+ featureLearningRates.UpdateLearningRates(decay, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
+ cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
for (size_t epoch = 0; epoch < epochs && !stop; ++epoch) {
if (shuffle) {
@@ -1644,6 +1649,41 @@ int main(int argc, char** argv) {
}
}
+ // apply learning rates to feature vectors before optimization
+ if (feature_confidence) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decay " << decay << ": " << featureLearningRates << endl;
+ //weightUpdate.MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", scaled update: " << weightUpdate << endl;
+ if (sample) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", feature values before: " << featureValuesHopeSample[0][0] << endl;
+ applyPerFeatureLearningRates(featureValuesHopeSample, featureLearningRates, sparse_r0);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", feature values after: " << featureValuesHopeSample[0][0] << endl;
+ applyPerFeatureLearningRates(featureValuesFearSample, featureLearningRates, sparse_r0);
+ }
+ else {
+ applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
+ applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
+ applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
+ }
+ }
+ else {
+ // apply fixed learning rates
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
+ if (core_r0 != 1.0 || sparse_r0 != 1.0) {
+ if (sample) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", feature values before: " << featureValuesHopeSample[0][0] << endl;
+ applyLearningRates(featureValuesHopeSample, core_r0, sparse_r0);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", feature values after: " << featureValuesHopeSample[0][0] << endl;
+ applyLearningRates(featureValuesFearSample, core_r0, sparse_r0);
+ }
+ else {
+ applyLearningRates(featureValuesHope, core_r0, sparse_r0);
+ applyLearningRates(featureValuesFear, core_r0, sparse_r0);
+ applyLearningRates(featureValues, core_r0, sparse_r0);
+ }
+ }
+ }
+
// Run optimiser on batch:
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
size_t update_status = 1;
@@ -1753,17 +1793,13 @@ int main(int argc, char** argv) {
}
if (feature_confidence) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decay " << decay << ": " << featureLearningRates << endl;
- weightUpdate.MultiplyEqualsSafe(featureLearningRates);
- if (rank == 0)
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaled update: " << weightUpdate << endl;
-
- // update confidence counts
+ // update confidence counts based on weight update
confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);
-
+
// update feature learning rates
- featureLearningRates.UpdateLearningRates(decay, confidenceCounts);
+ featureLearningRates.UpdateLearningRates(decay, confidenceCounts, core_r0, sparse_r0);
}
+
mosesWeights.PlusEquals(weightUpdate);
if (normaliseWeights)
@@ -1846,8 +1882,11 @@ int main(int argc, char** argv) {
else
mixedWeights.DivideEquals(size);
- // divide confidence counts
- mixedConfidenceCounts.DivideEquals(size);
+ // divide confidence counts
+ if (averageConfidenceCounts) {
+ mixedConfidenceCounts.DivideEquals(size);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", average confidence counts." << endl;
+ }
// normalise weights after averaging
if (normaliseWeights) {
@@ -1893,16 +1932,13 @@ int main(int argc, char** argv) {
decoder->setWeights(mixedWeights);
mosesWeights = mixedWeights;
- // broadcast confidence counts
- if (rank == 0)
- cerr << "Rank " << rank << ", epoch " << epoch << ", confidence counts before: " << confidenceCounts << endl;
+ // broadcast summed confidence counts
mpi::broadcast(world, mixedConfidenceCounts, 0);
confidenceCounts = mixedConfidenceCounts;
- if (rank == 0)
- cerr << "Rank " << rank << ", epoch " << epoch << ", confidence counts after: " << confidenceCounts << endl;
+
#endif
#ifndef MPI_ENABLE
- // cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
+ //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
mixedWeights = mosesWeights;
#endif
} // end mixing
@@ -2322,6 +2358,18 @@ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename,
exit(0);
}
+void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0) {
+ for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
+ for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
+ featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
+}
+
+void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0) {
+ for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
+ for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
+ featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
+}
+
void scaleFeatureScore(ScoreProducer *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch) {
string name = sp->GetScoreProducerWeightShortName();
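
The per-feature path relies on ScoreComponentCollection::MultiplyEqualsBackoff, which, judging from its use here, multiplies each feature by its entry in featureLearningRates and falls back to sparse_r0 for features with no stored rate yet (typically sparse features seen for the first time). A rough self-contained sketch of that backoff behaviour, under the same map-based assumption as above:

#include <map>
#include <string>

typedef std::map<std::string, float> FeatureVector;  // hypothetical stand-in

// Multiply each feature by its learned rate; features absent from the rate
// table back off to the sparse start rate sparse_r0 (assumed semantics).
void multiplyEqualsBackoff(FeatureVector &features,
                           const FeatureVector &rates, float sparse_r0) {
  for (std::map<std::string, float>::iterator it = features.begin();
       it != features.end(); ++it) {
    FeatureVector::const_iterator r = rates.find(it->first);
    it->second *= (r == rates.end()) ? sparse_r0 : r->second;
  }
}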
diff --git a/mira/Main.h b/mira/Main.h
index 1bc95a4f2..690a475d6 100755
--- a/mira/Main.h
+++ b/mira/Main.h
@@ -51,6 +51,8 @@ void ignoreCoreFeatures(std::vector<std::vector<Moses::ScoreComponentCollection>
void takeLogs(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t base);
void deleteTranslations(std::vector<std::vector<const Moses::Word*> > &translations);
void decodeHopeOrFear(size_t rank, size_t size, size_t decode, std::string decode_filename, std::vector<std::string> &inputSentences, Mira::MosesDecoder* decoder, size_t n, float bleuWeight);
+void applyLearningRates(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0);
+void applyPerFeatureLearningRates(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, Moses::ScoreComponentCollection featureLearningRates, float sparse_r0);
void scaleFeatureScore(Moses::ScoreProducer *sp, float scaling_factor, std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch);
void scaleFeatureScores(Moses::ScoreProducer *sp, float scaling_factor, std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch);
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index 0cf57e39e..c3ccfb011 100755
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -581,10 +581,10 @@ size_t MiraOptimiser::updateWeightsRankModel(
weightUpdate.PlusEquals(summedUpdate);
// Sanity check: are there still violated constraints after optimisation?
- int violatedConstraintsAfter = 0;
+ /*int violatedConstraintsAfter = 0;
float newDistanceFromOptimum = 0;
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
- float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
+ float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
float loss = all_losses[i];
float diff = loss - modelScoreDiff;
if (diff > epsilon) {
@@ -593,7 +593,7 @@ size_t MiraOptimiser::updateWeightsRankModel(
}
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
- VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
+ VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/
// return violatedConstraintsAfter;
return 0;
}
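
Finally, the extended signature UpdateLearningRates(decay, confidenceCounts, core_r0, sparse_r0) suggests that each feature's rate starts at its r0 and shrinks as its confidence count grows. One plausible schedule (an assumption; the diff does not show the implementation) is exponential decay in the count:

#include <cmath>

// Hypothetical schedule: a frequently updated feature (high confidence
// count) gets a small rate, r0 * decay^count.
float learningRate(float r0, float decay, float confidenceCount) {
  return r0 * std::pow(decay, confidenceCount);
}
// e.g. r0 = 1.0, decay = 0.9, count = 10  ->  rate ~= 0.35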