github.com/moses-smt/mosesdecoder.git
author     Eva Hasler <ehasler@saxnot.inf.ed.ac.uk>    2012-06-01 04:49:42 +0400
committer  Eva Hasler <ehasler@saxnot.inf.ed.ac.uk>    2012-06-01 04:49:42 +0400
commit     6a6a35c65eaeb42b3f1aa44237332dd6f1bc652c (patch)
tree       8a02d798832fe36e9efb60f8a2e30d9368728a13 /mira
parent     62d10a2af3f9509bf7680b8549ff1ffd9774dd83 (diff)
fix start weights in experiment.perl, add hypothesis queue for picking hope and fear translations, add variations to 1slack formulation
Diffstat (limited to 'mira')
-rw-r--r-- [-rwxr-xr-x]  mira/Decoder.cpp                        6
-rw-r--r-- [-rwxr-xr-x]  mira/Decoder.h                          0
-rw-r--r-- [-rwxr-xr-x]  mira/Hildreth.cpp                       0
-rw-r--r-- [-rwxr-xr-x]  mira/Hildreth.h                         0
-rw-r--r-- [-rwxr-xr-x]  mira/HildrethTest.cpp                   0
-rw-r--r--               mira/HypothesisQueue.cpp               57
-rw-r--r--               mira/HypothesisQueue.h                 65
-rw-r--r-- [-rwxr-xr-x]  mira/Jamfile                            0
-rw-r--r-- [-rwxr-xr-x]  mira/Main.cpp                         900
-rw-r--r-- [-rwxr-xr-x]  mira/Main.h                             2
-rw-r--r-- [-rwxr-xr-x]  mira/Makefile.am                        0
-rw-r--r-- [-rwxr-xr-x]  mira/MiraOptimiser.cpp                421
-rw-r--r-- [-rwxr-xr-x]  mira/MiraTest.cpp                       0
-rw-r--r-- [-rwxr-xr-x]  mira/Optimiser.h                       47
-rw-r--r-- [-rwxr-xr-x]  mira/Perceptron.cpp                     1
-rw-r--r-- [-rwxr-xr-x]  mira/expt.cfg                           0
-rw-r--r-- [-rwxr-xr-x]  mira/mira.xcodeproj/project.pbxproj     0
-rwxr-xr-x               mira/training-expt.perl                73
18 files changed, 1023 insertions, 549 deletions
diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp
index ad71c9308..05459d30f 100755..100644
--- a/mira/Decoder.cpp
+++ b/mira/Decoder.cpp
@@ -176,7 +176,8 @@ namespace Mira {
cerr << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetTargetPhrase() << "\", score: "
<< scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
- cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
+ if (m_bleuScoreFeature->Enabled() && realBleu)
+ cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
// set bleu score to zero in the feature vector since we do not want to optimise its weight
setBleuScore(featureValues.back(), 0);
@@ -239,7 +240,8 @@ namespace Mira {
cerr << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetOutputPhrase() << "\", score: "
<< scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
- cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
+ if (m_bleuScoreFeature->Enabled() && realBleu)
+ cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
// set bleu score to zero in the feature vector since we do not want to optimise its weight
setBleuScore(featureValues.back(), 0);
diff --git a/mira/Decoder.h b/mira/Decoder.h
index 859f97094..859f97094 100755..100644
--- a/mira/Decoder.h
+++ b/mira/Decoder.h
diff --git a/mira/Hildreth.cpp b/mira/Hildreth.cpp
index 53d1e0881..53d1e0881 100755..100644
--- a/mira/Hildreth.cpp
+++ b/mira/Hildreth.cpp
diff --git a/mira/Hildreth.h b/mira/Hildreth.h
index 699115242..699115242 100755..100644
--- a/mira/Hildreth.h
+++ b/mira/Hildreth.h
diff --git a/mira/HildrethTest.cpp b/mira/HildrethTest.cpp
index a32dcd1d3..a32dcd1d3 100755..100644
--- a/mira/HildrethTest.cpp
+++ b/mira/HildrethTest.cpp
diff --git a/mira/HypothesisQueue.cpp b/mira/HypothesisQueue.cpp
new file mode 100644
index 000000000..67a42dfdd
--- /dev/null
+++ b/mira/HypothesisQueue.cpp
@@ -0,0 +1,57 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include <iostream>
+#include "HypothesisQueue.h"
+
+using namespace std;
+
+namespace Moses {
+
+HypothesisQueue::~HypothesisQueue() {
+ m_queue.clear();
+}
+
+void HypothesisQueue::Push(BleuIndexPair hypo) {
+ pair<set<BleuIndexPair>::iterator,bool> ret;
+
+ if (m_capacity == 0 || m_queue.size() < m_capacity) {
+ ret = m_queue.insert(hypo);
+ } else if (hypo.first > (*(m_queue.rbegin())).first) {
+ // Remove the worst-scoring item from the queue and insert hypo
+ // (only erase an item if the new item was successfully inserted).
+ ret = m_queue.insert(hypo);
+ if (ret.second) {
+ HypoQueueType::iterator p = m_queue.end();
+ --p;
+ m_queue.erase(p);
+ }
+ } else {
+ // The hypo is unusable: the queue is full and hypo has a worse (or
+ // equal) score than the worst-scoring item already held.
+ }
+}
+
+BleuIndexPair HypothesisQueue::Pop() {
+ HypoQueueType::iterator p = m_queue.begin();
+ BleuIndexPair top = *p;
+ m_queue.erase(p);
+ return top;
+}
+
+} // namespace Moses
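
The Push logic above implements a bounded best-first buffer: below capacity every pair is inserted; at capacity a new hypothesis is inserted only if it beats the current worst, which is then evicted. A minimal standalone sketch of that behaviour (the main() driver and the sample scores are illustrative only, not part of the commit; the class itself is declared in the header diff that follows):

#include <iostream>
#include "HypothesisQueue.h"

using namespace Moses;

int main() {
  HypothesisQueue queue(2);              // keep at most the 2 best (Bleu, index) pairs
  queue.Push(BleuIndexPair(0.25f, 0));   // inserted (below capacity)
  queue.Push(BleuIndexPair(0.40f, 1));   // inserted (below capacity)
  queue.Push(BleuIndexPair(0.10f, 2));   // discarded: no better than current worst (0.25)
  queue.Push(BleuIndexPair(0.55f, 3));   // inserted, evicts the 0.25 entry

  while (!queue.Empty()) {
    BleuIndexPair top = queue.Pop();     // best remaining Bleu first
    std::cout << top.second << ": " << top.first << std::endl;
  }
  // prints "3: 0.55" then "1: 0.4"
  return 0;
}

Note that the ordering functor compares only the Bleu component, so two pairs with equal Bleu are treated as duplicates by the underlying std::set and the second insert fails; that is exactly what the ret.second check in Push guards against.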
diff --git a/mira/HypothesisQueue.h b/mira/HypothesisQueue.h
new file mode 100644
index 000000000..89500eaa0
--- /dev/null
+++ b/mira/HypothesisQueue.h
@@ -0,0 +1,65 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include <set>
+
+namespace Moses {
+
+// pair of Bleu score and index
+typedef std::pair<float, size_t> BleuIndexPair;
+
+// A bounded priority queue of BleuIndexPairs. The top item is
+// the best-scoring hypothesis. Pairs are stored by value: when the
+// queue is full, a new pair displaces the current worst if it scores
+// better and is discarded otherwise.
+class HypothesisQueue {
+
+ public:
+ // Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
+ HypothesisQueue(size_t c) : m_capacity(c) {}
+ ~HypothesisQueue();
+
+ bool Empty() { return m_queue.empty(); }
+
+ // Add the hypo to the queue, or discard it if the queue is full and
+ // its score is no better than the queue's current worst score.
+ void Push(BleuIndexPair hypo);
+
+ // Remove the best-scoring hypothesis from the queue and return it.
+ BleuIndexPair Pop();
+
+ private:
+ struct HypothesisOrderer {
+ bool operator()(BleuIndexPair a,
+ BleuIndexPair b) {
+ return (a.first > b.first);
+ }
+ };
+
+ //typedef std::multiset<BleuIndexPair, HypothesisOrderer> HypoQueueType;
+ typedef std::set<BleuIndexPair, HypothesisOrderer> HypoQueueType;
+
+ HypoQueueType m_queue;
+ const size_t m_capacity;
+};
+
+} // namespace Moses
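
Main.cpp (further down in this diff) reuses this one queue type for both directions of the search: hope candidates are pushed with their Bleu score, while fear candidates are pushed with the negated score, so the same best-first ordering surfaces the worst-scoring hypotheses. A condensed, self-contained sketch of that pattern, with names mirroring the diff; the sample scores are made up and the batch bookkeeping around the real loop is elided:

#include <cstddef>
#include <iostream>
#include <vector>
#include "HypothesisQueue.h"

using namespace Moses;

int main() {
  std::vector<float> bleuScores;         // one Bleu score per n-best entry
  bleuScores.push_back(0.31f);
  bleuScores.push_back(0.18f);
  bleuScores.push_back(0.44f);

  size_t hope_n = 1, fear_n = 1;
  HypothesisQueue queueHope(hope_n);     // keeps the hope_n highest-Bleu entries
  HypothesisQueue queueFear(fear_n);     // negated scores: keeps the fear_n lowest-Bleu entries
  for (size_t i = 0; i < bleuScores.size(); ++i) {
    queueHope.Push(BleuIndexPair(bleuScores[i], i));
    queueFear.Push(BleuIndexPair(-1 * bleuScores[i], i));
  }

  BleuIndexPair hope = queueHope.Pop();
  BleuIndexPair fear = queueFear.Pop();
  std::cout << "hope: " << hope.first << " (" << hope.second << ")"
            << ", fear: " << -1 * fear.first << " (" << fear.second << ")"
            << std::endl;                // hope: 0.44 (2), fear: 0.18 (1)
  return 0;
}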
diff --git a/mira/Jamfile b/mira/Jamfile
index cede96233..cede96233 100755..100644
--- a/mira/Jamfile
+++ b/mira/Jamfile
diff --git a/mira/Main.cpp b/mira/Main.cpp
index 402ffff3f..b5586fe29 100755..100644
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -44,6 +44,7 @@ namespace mpi = boost::mpi;
#include "DummyScoreProducers.h"
#include "LexicalReordering.h"
#include "BleuScorer.h"
+#include "HypothesisQueue.h"
using namespace Mira;
using namespace std;
@@ -66,7 +67,7 @@ int main(int argc, char** argv) {
string inputFile;
vector<string> referenceFiles;
vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
- string coreWeightFile, startWeightFile;
+ // string coreWeightFile, startWeightFile;
size_t epochs;
string learner;
bool shuffle;
@@ -116,7 +117,6 @@ int main(int argc, char** argv) {
float scale_lm_factor, scale_wp_factor;
bool sample;
string moses_src;
- bool external_score = false;
float sigmoidParam;
float bleuWeight, bleuWeight_hope, bleuWeight_fear;
bool bleu_weight_lm, bleu_weight_lm_adjust;
@@ -126,25 +126,44 @@ int main(int argc, char** argv) {
bool l1_regularize, l2_regularize;
float l1_lambda, l2_lambda;
bool most_violated, all_violated, max_bleu_diff, one_against_all;
- bool feature_confidence, signed_counts, averageConfidenceCounts;
- float decay, core_r0, sparse_r0;
+ bool feature_confidence, signed_counts;
+ float decay_core, decay_sparse, core_r0, sparse_r0;
+ bool selective, summed, add2hope, skip_hope, skip_model, skip_fear;
+ float bleu_weight_fear_factor, scaling_constant;
+ bool hildreth;
+ float add2lm;
bool realBleu, disableBleuFeature;
+ bool rescaleSlack, rewardHope;
+ bool makePairs;
po::options_description desc("Allowed options");
desc.add_options()
+ ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
+ ("reward-hope", po::value<bool>(&rewardHope)->default_value(false), "Reward hope features over fear features")
+ ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
("disable-bleu-feature", po::value<bool>(&disableBleuFeature)->default_value(false), "Disable the Bleu feature")
- ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
+ ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
+ ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
+ ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
+ ("skip-hope", po::value<bool>(&skip_hope)->default_value(false), "Sample without hope translations")
+ ("skip-model", po::value<bool>(&skip_model)->default_value(false), "Sample without model translations")
+ ("skip-fear", po::value<bool>(&skip_fear)->default_value(false), "Sample without fear translations")
+ ("add2hope", po::value<bool>(&add2hope)->default_value(false), "Add 2 hope translations instead of 1")
+ ("scaling-constant", po::value<float>(&scaling_constant)->default_value(1.0), "Scale all core values by a constant at beginning of training")
+ ("selective", po::value<bool>(&selective)->default_value(false), "Build constraints for every feature")
+ ("summed", po::value<bool>(&summed)->default_value(false), "Sum up all constraints")
+
("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
- ("avg-conf-counts", po::value<bool>(&averageConfidenceCounts)->default_value(true), "Divide confidence counts by number of processors")
("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
-
+ ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
+
("scale-all", po::value<bool>(&scale_all)->default_value(false), "Scale all core features")
("scaling-factor", po::value<float>(&scale_all_factor)->default_value(2), "Scaling factor for all core features")
@@ -158,8 +177,9 @@ int main(int argc, char** argv) {
("clear-static", po::value<bool>(&clear_static)->default_value(false), "Clear static data before every translation")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
- ("core-weights", po::value<string>(&coreWeightFile)->default_value(""), "Weight file containing the core weights (already tuned, have to be non-zero)")
- ("decay", po::value<float>(&decay)->default_value(0.01), "Decay factor for updating feature learning rates")
+ //("core-weights", po::value<string>(&coreWeightFile)->default_value(""), "Weight file containing the core weights (already tuned, have to be non-zero)")
+ ("decay-core", po::value<float>(&decay_core)->default_value(0.001), "Decay factor for updating core feature learning rates")
+ ("decay-sparse", po::value<float>(&decay_sparse)->default_value(0.001), "Decay factor for updating sparse feature learning rates")
("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
@@ -171,18 +191,18 @@ int main(int argc, char** argv) {
("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
("feature-confidence", po::value<bool>(&feature_confidence)->default_value(false), "Use feature weight confidence in weight updates")
("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
- ("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
+ ("fear-n", po::value<int>(&fear_n)->default_value(1), "Number of fear translations used")
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
("hope-model", po::value<bool>(&hope_model)->default_value(false), "Use only hope and model translations for optimisation (use --fear-n to set number of model translations)")
- ("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
+ ("hope-n", po::value<int>(&hope_n)->default_value(2), "Number of hope translations used")
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
- ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.001), "Lambda for l1-regularization (w_i +/- lambda)")
- ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.1), "Lambda for l2-regularization (w_i * (1 - lambda))")
+ ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
+ ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
@@ -233,7 +253,7 @@ int main(int argc, char** argv) {
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
- ("start-weights", po::value<string>(&startWeightFile)->default_value(""), "Weight file containing start weights")
+ //("start-weights", po::value<string>(&startWeightFile)->default_value(""), "Weight file containing start weights")
("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
@@ -251,6 +271,10 @@ int main(int argc, char** argv) {
std::cout << desc << std::endl;
return 0;
}
+
+ cerr << "l1-reg: " << l1_regularize << endl;
+ cerr << "featureCutoff: " << featureCutoff << endl;
+ cerr << "featureConfidence: " << feature_confidence << endl;
const StaticData &staticData = StaticData::Instance();
@@ -261,6 +285,7 @@ int main(int argc, char** argv) {
trainWithMultipleFolds = true;
}
+ cerr << "test 1" << endl;
if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
exit(1);
@@ -271,6 +296,7 @@ int main(int argc, char** argv) {
exit(1);
}
+ cerr << "test 2" << endl;
if (trainWithMultipleFolds) {
if (!mosesConfigFilesFolds.size()) {
cerr << "Error: No moses ini files specified for training with folds" << endl;
@@ -305,6 +331,7 @@ int main(int argc, char** argv) {
}
// load input and references
+ cerr << "test 3" << endl;
vector<string> inputSentences;
size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0;
size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size();
@@ -382,6 +409,7 @@ int main(int argc, char** argv) {
// add initial Bleu weight and references to initialize Bleu feature
boost::trim(decoder_settings);
decoder_settings += " -mira -distinct-nbest -weight-bl 1 -references";
+ cerr << "test 4" << endl;
if (trainWithMultipleFolds) {
decoder_settings += " ";
decoder_settings += referenceFilesFolds[myFold];
@@ -393,17 +421,22 @@ int main(int argc, char** argv) {
}
}
+ cerr << "test 5" << endl;
vector<string> decoder_params;
boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));
string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile;
VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
+ cerr << "test 6" << endl;
MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
+ cerr << "test 7" << endl;
decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
scaleByInverseLength, scaleByAvgInverseLength,
scaleByX, historySmoothing, bleu_smoothing_scheme);
+ cerr << "test 8" << endl;
SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
bool chartDecoding = (searchAlgorithm == ChartDecoding);
+ cerr << "test 9" << endl;
// Optionally shuffle the sentences
vector<size_t> order;
@@ -426,6 +459,7 @@ int main(int argc, char** argv) {
if (rank == 0) {
cerr << "Optimising using Mira" << endl;
cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
+ cerr << "selective: " << selective << endl;
if (normaliseMargin)
cerr << "sigmoid parameter: " << sigmoidParam << endl;
}
@@ -459,19 +493,19 @@ int main(int argc, char** argv) {
}
if (hope_n == -1)
- hope_n = n;
+ hope_n = n;
if (fear_n == -1)
- fear_n = n;
+ fear_n = n;
if (rank_n == -1)
- rank_n = n;
+ rank_n = n;
if (sample)
- model_hope_fear = true;
+ model_hope_fear = true;
if (model_hope_fear || hope_model || rank_only || megam)
- hope_fear = false; // is true by default
+ hope_fear = false; // is true by default
if (learner == "mira" && !(hope_fear || hope_model || model_hope_fear || rank_only || megam)) {
- cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear for mira update." << endl;
- return 1;
+ cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear for mira update." << endl;
+ return 1;
}
#ifdef MPI_ENABLE
@@ -516,10 +550,10 @@ int main(int argc, char** argv) {
staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetFeatureFunctions();
//const vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
- ProducerWeightMap coreWeightMap, startWeightMap;
+ //ProducerWeightMap coreWeightMap, startWeightMap;
ScoreComponentCollection initialWeights = decoder->getWeights();
// read start weight file
- if (!startWeightFile.empty()) {
+ /*if (!startWeightFile.empty()) {
if (!loadCoreWeights(startWeightFile, startWeightMap, featureFunctions)) {
cerr << "Error: Failed to load start weights from " << startWeightFile << endl;
return 1;
@@ -550,13 +584,27 @@ int main(int argc, char** argv) {
for(p = coreWeightMap.begin(); p!=coreWeightMap.end(); ++p)
initialWeights.Assign(p->first, p->second);
}
- }
+ }*/
cerr << "Rank " << rank << ", initial weights: " << initialWeights << endl;
+ if (scaling_constant != 1.0) {
+ initialWeights.MultiplyEquals(scaling_constant);
+ cerr << "Rank " << rank << ", scaled initial weights: " << initialWeights << endl;
+ }
+
+ if (add2lm != 0) {
+ const LMList& lmList_new = staticData.GetLMList();
+ for (LMList::const_iterator i = lmList_new.begin(); i != lmList_new.end(); ++i) {
+ float lmWeight = initialWeights.GetScoreForProducer(*i) + add2lm;
+ initialWeights.Assign(*i, lmWeight);
+ cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
+ }
+ }
if (normaliseWeights) {
initialWeights.L1Normalise();
cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
}
+
decoder->setWeights(initialWeights);
if (scale_all) {
@@ -583,7 +631,9 @@ int main(int argc, char** argv) {
if (bleuWeight_fear == -1) {
bleuWeight_fear = bleuWeight;
}
+ bleuWeight_fear *= bleu_weight_fear_factor;
cerr << "Bleu weight: " << bleuWeight << endl;
+ cerr << "Bleu weight fear: " << bleuWeight_fear << endl;
if (decode_hope || decode_fear || decode_model) {
size_t decode = 1;
@@ -614,15 +664,15 @@ int main(int argc, char** argv) {
ScoreComponentCollection mixedAverageWeightsBeforePrevious;
// log feature counts and/or hope/fear translations with features
- string f1 = "decode_hope_epoch0";
+ /*string f1 = "decode_hope_epoch0";
string f2 = "decode_fear_epoch0";
- ofstream hopePlusFeatures(f1.c_str());
- ofstream fearPlusFeatures(f2.c_str());
- if (!hopePlusFeatures || !fearPlusFeatures) {
- ostringstream msg;
- msg << "Unable to open file";
- throw runtime_error(msg.str());
- }
+ ofstream hopePlusFeatures(f1.c_str());
+ ofstream fearPlusFeatures(f2.c_str());
+ if (!hopePlusFeatures || !fearPlusFeatures) {
+ ostringstream msg;
+ msg << "Unable to open file";
+ throw runtime_error(msg.str());
+ }*/
bool stop = false;
// int sumStillViolatedConstraints;
@@ -630,7 +680,7 @@ int main(int argc, char** argv) {
// variables for feature confidence
ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
- featureLearningRates.UpdateLearningRates(decay, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
+ featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
for (size_t epoch = 0; epoch < epochs && !stop; ++epoch) {
@@ -851,6 +901,7 @@ int main(int argc, char** argv) {
}
// select inference scheme
+ cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
if (hope_fear || perceptron_update) {
if (clear_static) {
delete decoder;
@@ -876,11 +927,11 @@ int main(int argc, char** argv) {
// count sparse features occurring in hope translation
featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();
- if (epoch == 0 && printNbestWithFeatures) {
+ /*if (epoch == 0 && printNbestWithFeatures) {
decoder->outputNBestList(input, *sid, hope_n, 1, bleuWeight_hope, distinctNbest,
avgRefLength, "", hopePlusFeatures);
decoder->cleanup(chartDecoding);
- }
+ }*/
float precision = bleuScoresHope[batchPosition][0];
@@ -950,11 +1001,11 @@ int main(int argc, char** argv) {
// count sparse features occurring in fear translation
featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();
- if (epoch == 0 && printNbestWithFeatures) {
- decoder->outputNBestList(input, *sid, fear_n, -1, bleuWeight_fear, distinctNbest,
- avgRefLength, "", fearPlusFeatures);
- decoder->cleanup(chartDecoding);
- }
+ /*if (epoch == 0 && printNbestWithFeatures) {
+ decoder->outputNBestList(input, *sid, fear_n, -1, bleuWeight_fear, distinctNbest,
+ avgRefLength, "", fearPlusFeatures);
+ decoder->cleanup(chartDecoding);
+ }*/
// Bleu-related example selection
bool skip = false;
@@ -1068,330 +1119,191 @@ int main(int argc, char** argv) {
examples_in_batch++;
}
- if (model_hope_fear) {
- ostringstream hope_nbest_filename, fear_nbest_filename, model_nbest_filename, ref_filename;
- if (sample && external_score) {
- hope_nbest_filename << "decode_hope_rank" << rank << "." << hope_n << "best";
- fear_nbest_filename << "decode_fear_rank" << rank << "." << fear_n << "best";
- model_nbest_filename << "decode_model_rank" << rank << "." << n << "best";
-
- // save reference
- ref_filename << "decode_ref_rank" << rank;
- referenceFileMegam = ref_filename.str();
- ofstream ref_out(referenceFileMegam.c_str());
- if (!ref_out) {
- ostringstream msg;
- msg << "Unable to open " << referenceFileMegam;
- throw runtime_error(msg.str());
- }
- ref_out << referenceSentences[decoder->getShortestReferenceIndex(*sid)][*sid] << "\n";
- ref_out.close();
- }
-
- // HOPE
- if (clear_static) {
- delete decoder;
- StaticData::ClearDataStatic();
- decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
- decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
- decoder->setWeights(mosesWeights);
- }
+ if (model_hope_fear) {
+ // HOPE
+ if (!skip_hope) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
+ size_t oraclePos = featureValues[batchPosition].size();
+ decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ //vector<const Word*> oracle = outputHope[0];
+ // needed for history
+ inputLengths.push_back(current_input_length);
+ ref_ids.push_back(*sid);
+ decoder->cleanup(chartDecoding);
+ //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
+ //float hope_length_ratio = (float)oracle.size()/ref_length;
+ cerr << endl;
+
+ oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
+ oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
+ oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);
+ }
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
- size_t oraclePos = featureValues[batchPosition].size();
- decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 0, realBleu, distinctNbest, avgRefLength, rank, epoch, hope_nbest_filename.str());
- // needed for history
- inputLengths.push_back(current_input_length);
- ref_ids.push_back(*sid);
- decoder->cleanup(chartDecoding);
- //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
- //float hope_length_ratio = (float)oracle.size()/ref_length;
- cerr << endl;
-
- oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
- oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
- oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);
+ // MODEL
+ if (!skip_model) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
+ if (historyBleu) {
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ vector<const Word*> bestModel = outputModel[0];
+ oneBests.push_back(bestModel);
+ }
+ else {
+ decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ }
+ decoder->cleanup(chartDecoding);
+ //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
+ //float model_length_ratio = (float)bestModel.size()/ref_length;
+ cerr << endl;
+ }
- // MODEL
- if (clear_static) {
- delete decoder;
- StaticData::ClearDataStatic();
- decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
- decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
- decoder->setWeights(mosesWeights);
- }
+ // FEAR
+ if (!skip_fear) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
+ decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ decoder->cleanup(chartDecoding);
+ //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
+ //float fear_length_ratio = (float)fear.size()/ref_length;
+ }
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
- if (historyBleu) {
- vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 1, realBleu, distinctNbest, avgRefLength, rank, epoch, model_nbest_filename.str());
- vector<const Word*> bestModel = outputModel[0];
- oneBests.push_back(bestModel);
- }
- else {
- decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 0, realBleu, distinctNbest, avgRefLength, rank, epoch, model_nbest_filename.str());
+ examples_in_batch++;
+
+ if (sample) {
+ float bleuHope = -1000;
+ float bleuFear = 1000;
+ size_t indexHope = -1;
+ size_t indexFear = -1;
+ vector<float> bleuHopeList;
+ vector<float> bleuFearList;
+ vector<float> indexHopeList;
+ vector<float> indexFearList;
+
+ HypothesisQueue queueHope(hope_n);
+ HypothesisQueue queueFear(fear_n);
+
+ cerr << endl;
+ if (most_violated || all_violated || one_against_all) {
+ bleuHope = -1000;
+ bleuFear = 1000;
+ indexHope = -1;
+ indexFear = -1;
+ if (most_violated)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
+ else if (all_violated)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
+ else
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";
+
+ // find best hope, then find fear that violates our constraint most
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
+ // better model score
+ bleuHope = bleuScores[batchPosition][i];
+ indexHope = i;
+ }
+ }
}
- decoder->cleanup(chartDecoding);
- //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
- //float model_length_ratio = (float)bestModel.size()/ref_length;
- cerr << endl;
-
- // FEAR
- if (clear_static) {
- delete decoder;
- StaticData::ClearDataStatic();
- decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
- decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
- decoder->setWeights(mosesWeights);
+ else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
+ bleuHope = bleuScores[batchPosition][i];
+ indexHope = i;
}
-
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
- decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 0, realBleu, distinctNbest, avgRefLength, rank, epoch, fear_nbest_filename.str());
- decoder->cleanup(chartDecoding);
- //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
- //float fear_length_ratio = (float)fear.size()/ref_length;
-
- examples_in_batch++;
-
- if (sample) {
- float bleuHope = -1000;
- float bleuFear = 1000;
- size_t indexHope = -1;
- size_t indexFear = -1;
- vector<float> bleuHopeList;
- vector<float> bleuFearList;
- vector<float> indexHopeList;
- vector<float> indexFearList;
-
- if (external_score) {
- // concatenate nbest files (use hope, model, fear lists to extract samples from)
- stringstream nbestStreamMegam, catCmd, sortCmd, scoreDataFile, featureDataFile;
- nbestStreamMegam << "decode_hypos_rank" << rank << "." << (hope_n+n+fear_n) << "best";
- nbestFileMegam = nbestStreamMegam.str();
- catCmd << "cat " << hope_nbest_filename.str() << " " << model_nbest_filename.str()
- << " " << fear_nbest_filename.str() << " > " << nbestFileMegam;
- system(catCmd.str().c_str());
-
- // extract features and scores
- scoreDataFile << "decode_hypos_rank" << rank << ".scores.dat";
- featureDataFile << "decode_hypos_rank" << rank << ".features.dat";
- stringstream extractorCmd;
- extractorCmd << moses_src << "/dist/bin/extractor"
- " --scconfig case:true --scfile " << scoreDataFile.str() << " --ffile " << featureDataFile.str() << " -r " << referenceFileMegam << " -n " << nbestFileMegam;
- system(extractorCmd.str().c_str());
+ }
- // NOTE: here we are just scoring the nbest lists created above.
- // We will use the (real, not dynamically computed) sentence bleu scores to select a pair of two
- // translations with maximal Bleu difference
- vector<float> bleuScoresNbest = BleuScorer::ScoreNbestList(scoreDataFile.str(), featureDataFile.str());
- for (size_t i=0; i < bleuScoresNbest.size(); ++i) {
- //cerr << "bleu: " << bleuScoresNbest[i]*current_input_length << endl;
- if (abs(bleuScoresNbest[i] - bleuHope) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
- bleuHope = bleuScoresNbest[i];
- indexHope = i;
- }
- }
- }
- else if (bleuScoresNbest[i] > bleuHope) { // better than current best
- bleuHope = bleuScoresNbest[i];
- indexHope = i;
- }
-
- if (abs(bleuScoresNbest[i] - bleuFear) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexFear]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexFear]) > epsilon) {
- bleuFear = bleuScoresNbest[i];
- indexFear = i;
- }
- }
- }
- else if (bleuScoresNbest[i] < bleuFear) { // worse than current worst
- bleuFear = bleuScoresNbest[i];
- indexFear = i;
- }
- }
+ float currentViolation = 0;
+ float minimum_bleu_diff = 0.01;
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ float bleuDiff = bleuHope - bleuScores[batchPosition][i];
+ float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
+ if (bleuDiff > epsilon) {
+ if (one_against_all && bleuDiff > minimum_bleu_diff) {
+ cerr << ".. adding pair";
+ bleuHopeList.push_back(bleuHope);
+ bleuFearList.push_back(bleuScores[batchPosition][i]);
+ indexHopeList.push_back(indexHope);
+ indexFearList.push_back(i);
}
- else {
- cerr << endl;
- if (most_violated || all_violated || one_against_all) {
- bleuHope = -1000;
- bleuFear = 1000;
- indexHope = -1;
- indexFear = -1;
- if (most_violated)
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint";
- else if (all_violated)
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
- else
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";
-
- // find best hope, then find fear that violates our constraint most
- for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
- if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
- // better model score
- bleuHope = bleuScores[batchPosition][i];
- indexHope = i;
- }
- }
- }
- else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
- bleuHope = bleuScores[batchPosition][i];
- indexHope = i;
- }
- }
-
- float currentViolation = 0;
- float minimum_bleu_diff = 0.01;
- for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
- float bleuDiff = bleuHope - bleuScores[batchPosition][i];
- float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
- if (bleuDiff > epsilon) {
- if (one_against_all && bleuDiff > minimum_bleu_diff) {
- cerr << ".. adding pair";
- bleuHopeList.push_back(bleuHope);
- bleuFearList.push_back(bleuScores[batchPosition][i]);
- indexHopeList.push_back(indexHope);
- indexFearList.push_back(i);
- }
- else if (modelDiff < bleuDiff) {
- float diff = bleuDiff - modelDiff;
- if (diff > epsilon) {
- if (all_violated) {
- cerr << ".. adding pair";
- bleuHopeList.push_back(bleuHope);
- bleuFearList.push_back(bleuScores[batchPosition][i]);
- indexHopeList.push_back(indexHope);
- indexFearList.push_back(i);
- }
- else if (most_violated && diff > currentViolation) {
- currentViolation = diff;
- bleuFear = bleuScores[batchPosition][i];
- indexFear = i;
- cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
- }
- }
- }
- }
- }
-
- if (most_violated) {
- if (currentViolation > 0) {
- cerr << ".. adding pair with violation " << currentViolation << endl;
- bleuHopeList.push_back(bleuHope);
- bleuFearList.push_back(bleuFear);
- indexHopeList.push_back(indexHope);
- indexFearList.push_back(indexFear);
- }
- else cerr << ".. none" << endl;
- }
- else cerr << endl;
- }
- if (max_bleu_diff) {
- bleuHope = -1000;
- bleuFear = 1000;
- indexHope = -1;
- indexFear = -1;
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff";
- // use dynamically calculated scores to find best and worst
- for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
- //cerr << "bleu: " << bleuScores[batchPosition][i] << endl;
- if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
- bleuHope = bleuScores[batchPosition][i];
- indexHope = i;
- }
- }
- }
- else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
- bleuHope = bleuScores[batchPosition][i];
- indexHope = i;
- }
-
- if (abs(bleuScores[batchPosition][i] - bleuFear) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexFear]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexFear]) > epsilon) {
- bleuFear = bleuScores[batchPosition][i];
- indexFear = i;
- }
- }
- }
- else if (bleuScores[batchPosition][i] < bleuFear) { // worse than current worst
- bleuFear = bleuScores[batchPosition][i];
- indexFear = i;
- }
- }
-
- if (bleuHope != -1000 && bleuFear != 1000 && (bleuHope - bleuFear) > epsilon) {
- cerr << ".. adding 1 pair" << endl;
+ else if (modelDiff < bleuDiff) {
+ float diff = bleuDiff - modelDiff;
+ if (diff > epsilon) {
+ if (all_violated) {
+ cerr << ".. adding pair";
bleuHopeList.push_back(bleuHope);
- bleuFearList.push_back(bleuFear);
+ bleuFearList.push_back(bleuScores[batchPosition][i]);
indexHopeList.push_back(indexHope);
- indexFearList.push_back(indexFear);
+ indexFearList.push_back(i);
}
- else cerr << "none" << endl;
- }
- }
-
- if (bleuHopeList.size() == 0 || bleuFearList.size() == 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", no appropriate hypotheses found.." << endl;
- skip_sample = true;
- }
- else {
- if (bleuHope != -1000 && bleuFear != 1000 && bleuHope <= bleuFear) {
- if (abs(bleuHope - bleuFear) < epsilon)
- cerr << "\nRank " << rank << ", epoch " << epoch << ", WARNING: HOPE and FEAR have equal Bleu." << endl;
- else
- cerr << "\nRank " << rank << ", epoch " << epoch << ", ERROR: FEAR has better Bleu than HOPE." << endl;
+ else if (most_violated && diff > currentViolation) {
+ currentViolation = diff;
+ bleuFear = bleuScores[batchPosition][i];
+ indexFear = i;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
+ }
}
- else {
- if (external_score) {
- // use actual sentence bleu (not dynamically computed)
- bleuScoresHopeSample[batchPosition].push_back(bleuHope*current_input_length);
- bleuScoresFearSample[batchPosition].push_back(bleuFear*current_input_length);
- featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][indexHope]);
- featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][indexFear]);
- modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][indexHope]);
- modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][indexFear]);
- cerr << "Rank " << rank << ", epoch " << epoch << ", Best: " << bleuHope*current_input_length << " (" << indexHope << ")" << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", Worst: " << bleuFear*current_input_length << " (" << indexFear << ")" << endl;
- }
- else {
- cerr << endl;
- for (size_t i=0; i<bleuHopeList.size(); ++i) {
- float bHope = bleuHopeList[i];
- float bFear = bleuFearList[i];
- size_t iHope = indexHopeList[i];
- size_t iFear = indexFearList[i];
- cerr << "Rank " << rank << ", epoch " << epoch << ", Hope[" << i << "]: " << bHope << " (" << iHope << ")" << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", Fear[" << i << "]: " << bFear << " (" << iFear << ")" << endl;
- bleuScoresHopeSample[batchPosition].push_back(bHope);
- bleuScoresFearSample[batchPosition].push_back(bFear);
- featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][iHope]);
- featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][iFear]);
- modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][iHope]);
- modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][iFear]);
-
- featureValues[batchPosition][iHope].IncrementSparseHopeFeatures();
- featureValues[batchPosition][iFear].IncrementSparseFearFeatures();
- }
- }
- }
}
+ }
+ }
+
+ if (most_violated) {
+ if (currentViolation > 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
+ bleuHopeList.push_back(bleuHope);
+ bleuFearList.push_back(bleuFear);
+ indexHopeList.push_back(indexHope);
+ indexFearList.push_back(indexFear);
}
+ else
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
+ }
+ else cerr << endl;
+ }
+ if (max_bleu_diff) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ BleuIndexPair hope(bleuScores[batchPosition][i], i);
+ queueHope.Push(hope);
+ BleuIndexPair fear(-1*(bleuScores[batchPosition][i]), i);
+ queueFear.Push(fear);
+ }
+ }
+
+ cerr << endl;
+
+ vector<BleuIndexPair> hopeList, fearList;
+ for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
+ for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
+
+ for (size_t i=0; i<hopeList.size(); ++i) {
+ float hopeBleu = hopeList[i].first;
+ size_t hopeIndex = hopeList[i].second;
+ for (size_t j=0; j<fearList.size(); ++j) {
+ float fearBleu = -1*(fearList[j].first);
+ size_t fearIndex = fearList[j].second;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << hopeBleu << " (" << hopeIndex << "), fear: " << fearBleu << " (" << fearIndex << ")" << endl;
+ bleuScoresHopeSample[batchPosition].push_back(hopeBleu);
+ bleuScoresFearSample[batchPosition].push_back(fearBleu);
+ featureValuesHopeSample[batchPosition].push_back(featureValues[batchPosition][hopeIndex]);
+ featureValuesFearSample[batchPosition].push_back(featureValues[batchPosition][fearIndex]);
+ modelScoresHopeSample[batchPosition].push_back(modelScores[batchPosition][hopeIndex]);
+ modelScoresFearSample[batchPosition].push_back(modelScores[batchPosition][fearIndex]);
+
+ featureValues[batchPosition][hopeIndex].IncrementSparseHopeFeatures();
+ featureValues[batchPosition][fearIndex].IncrementSparseFearFeatures();
+ }
+ }
+ if (!makePairs)
+ cerr << "Rank " << rank << ", epoch " << epoch << "summing up hope and fear vectors, no pairs" << endl;
+ }
}
-
+
// next input sentence
++sid;
++actualBatchSize;
@@ -1474,11 +1386,11 @@ int main(int argc, char** argv) {
}
else if (examples_in_batch == 0 || (sample && skip_sample)) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
}
else {
vector<vector<float> > losses(actualBatchSize);
- if (model_hope_fear) {
+ if (model_hope_fear && !skip_hope) {
// Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
@@ -1486,7 +1398,7 @@ int main(int argc, char** argv) {
}
}
}
-
+
// set weight for bleu feature to 0 before optimizing
vector<const ScoreProducer*>::const_iterator iter;
const vector<const ScoreProducer*> featureFunctions2 = staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetFeatureFunctions();
@@ -1499,13 +1411,9 @@ int main(int argc, char** argv) {
// scale LM feature (to avoid rapid changes)
if (scale_lm) {
+ cerr << "scale lm" << endl;
const LMList& lmList_new = staticData.GetLMList();
for (LMList::const_iterator iter = lmList_new.begin(); iter != lmList_new.end(); ++iter) {
- // scale up weight
- float lmWeight = mosesWeights.GetScoreForProducer(*iter);
- mosesWeights.Assign(*iter, lmWeight*scale_lm_factor);
- cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight scaled from " << lmWeight << " to " << lmWeight*scale_lm_factor << endl;
-
// scale down score
if (sample) {
scaleFeatureScore(*iter, scale_lm_factor, featureValuesHopeSample, rank, epoch);
@@ -1523,9 +1431,6 @@ int main(int argc, char** argv) {
if (scale_wp) {
// scale up weight
WordPenaltyProducer *wp = staticData.GetFirstWordPenaltyProducer();
- float wpWeight = mosesWeights.GetScoreForProducer(wp);
- mosesWeights.Assign(wp, wpWeight*scale_wp_factor);
- cerr << "Rank " << rank << ", epoch " << epoch << ", wp weight scaled from " << wpWeight << " to " << wpWeight*scale_wp_factor << endl;
// scale down score
if (sample) {
@@ -1542,10 +1447,7 @@ int main(int argc, char** argv) {
if (scale_all) {
// scale distortion
DistortionScoreProducer *dp = staticData.GetDistortionScoreProducer();
- float dWeight = mosesWeights.GetScoreForProducer(dp);
- mosesWeights.Assign(dp, dWeight*scale_all_factor);
- cerr << "Rank " << rank << ", epoch " << epoch << ", distortion weight scaled from " << dWeight << " to " << dWeight*scale_all_factor << endl;
-
+
// scale down score
if (sample) {
scaleFeatureScore(dp, scale_all_factor, featureValuesHopeSample, rank, epoch);
@@ -1561,15 +1463,7 @@ int main(int argc, char** argv) {
vector<LexicalReordering*> lrVec = staticData.GetLexicalReorderModels();
for (size_t i=0; i<lrVec.size(); ++i) {
LexicalReordering* lr = lrVec[i];
- // scale up weight
- vector<float> dWeights = mosesWeights.GetScoresForProducer(lr);
- for (size_t j=0; j<dWeights.size(); ++j) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", d weight scaled from " << dWeights[j];
- dWeights[j] *= scale_all_factor;
- cerr << " to " << dWeights[j] << endl;
- }
- mosesWeights.Assign(lr, dWeights);
-
+
// scale down score
if (sample) {
scaleFeatureScores(lr, scale_all_factor, featureValuesHopeSample, rank, epoch);
@@ -1586,14 +1480,6 @@ int main(int argc, char** argv) {
vector<PhraseDictionaryFeature*> pdVec = staticData.GetPhraseDictionaryModels();
for (size_t i=0; i<pdVec.size(); ++i) {
PhraseDictionaryFeature* pd = pdVec[i];
- // scale up weight
- vector<float> tWeights = mosesWeights.GetScoresForProducer(pd);
- for (size_t j=0; j<tWeights.size(); ++j) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", t weight scaled from " << tWeights[j];
- tWeights[j] *= scale_all_factor;
- cerr << " to " << tWeights[j] << endl;
- }
- mosesWeights.Assign(pd, tWeights);
// scale down score
if (sample) {
@@ -1607,20 +1493,7 @@ int main(int argc, char** argv) {
}
}
}
-
- // set core features to 0 to avoid updating the feature weights
- if (coreWeightMap.size() > 0) {
- if (sample) {
- ignoreCoreFeatures(featureValuesHopeSample, coreWeightMap);
- ignoreCoreFeatures(featureValuesFearSample, coreWeightMap);
- }
- else {
- ignoreCoreFeatures(featureValues, coreWeightMap);
- ignoreCoreFeatures(featureValuesHope, coreWeightMap);
- ignoreCoreFeatures(featureValuesFear, coreWeightMap);
- }
- }
-
+
// print out the feature values
if (print_feature_values) {
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
@@ -1641,9 +1514,7 @@ int main(int argc, char** argv) {
// apply learning rates to feature vectors before optimization
if (feature_confidence) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decay " << decay << ": " << featureLearningRates << endl;
- //weightUpdate.MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
- //cerr << "Rank " << rank << ", epoch " << epoch << ", scaled update: " << weightUpdate << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
if (sample) {
cerr << "Rank " << rank << ", epoch " << epoch << ", feature values before: " << featureValuesHopeSample[0][0] << endl;
applyPerFeatureLearningRates(featureValuesHopeSample, featureLearningRates, sparse_r0);
@@ -1674,120 +1545,122 @@ int main(int argc, char** argv) {
}
}
+ // if we scaled up the weights, scale down model scores now
+ if (scaling_constant != 1.0) {
+ if (hope_fear || hope_model || perceptron_update) {
+ for (size_t i = 0; i < modelScoresHope.size(); ++i)
+ for (size_t j = 0; j < modelScoresHope[i].size(); ++j) {
+ modelScoresHope[i][j] /= scaling_constant;
+ modelScoresFear[i][j] /= scaling_constant;
+ }
+ }
+ else if (model_hope_fear || rank_only) {
+ if (sample) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scale down model scores for sampling.. " << endl;
+ for (size_t i = 0; i < modelScoresHopeSample.size(); ++i)
+ for (size_t j = 0; j < modelScoresHopeSample[i].size(); ++j) {
+ modelScoresHopeSample[i][j] /= scaling_constant;
+ modelScoresFearSample[i][j] /= scaling_constant;
+ }
+ }
+ else {
+ for (size_t i = 0; i < modelScores.size(); ++i)
+ for (size_t j = 0; j < modelScores[i].size(); ++j)
+ modelScores[i][j] /= scaling_constant;
+ }
+ }
+ }
+
// Run optimiser on batch:
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
size_t update_status = 1;
ScoreComponentCollection weightUpdate;
if (perceptron_update) {
vector<vector<float> > dummy1;
- update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
+ update_status = optimiser->updateWeightsHopeFear( weightUpdate,
featureValuesHope, featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
}
else if (hope_fear || hope_model) {
if (bleuScoresHope[0][0] >= min_oracle_bleu) {
- if (hope_n == 1 && fear_n ==1 && batchSize == 1) {
- update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights, weightUpdate,
+ if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) {
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
}
- else {
- update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
+ else
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
- modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
- }
+ modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
}
else
update_status = 1;
}
else if (rank_only) {
- // learning ranking of model translations
- update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModel(mosesWeights, weightUpdate,
+ // learning ranking of model translations
+ if (summed)
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModelSummed(weightUpdate,
+ featureValues, bleuScores, modelScores, learning_rate, rank, epoch);
+ else
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModel(weightUpdate,
featureValues, bleuScores, modelScores, learning_rate, rank, epoch);
}
else {
// model_hope_fear
if (sample) {
- update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
- featureValuesHopeSample, featureValuesFearSample, bleuScoresHopeSample, bleuScoresFearSample,
+ if (selective)
+ update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective(weightUpdate,
+ featureValuesHopeSample, featureValuesFearSample,
+ bleuScoresHopeSample, bleuScoresFearSample, modelScoresHopeSample,
+ modelScoresFearSample, learning_rate, rank, epoch);
+ else if (summed)
+ update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed(weightUpdate,
+ featureValuesHopeSample, featureValuesFearSample,
+ bleuScoresHopeSample, bleuScoresFearSample, modelScoresHopeSample,
+ modelScoresFearSample, learning_rate, rank, epoch, rescaleSlack, rewardHope, makePairs);
+ else {
+ if (batchSize == 1 && featureValuesHopeSample[0].size() == 1 && !hildreth) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHopeSample[0][0] << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFearSample[0][0] << endl;
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
+ featureValuesHopeSample[0][0], featureValuesFearSample[0][0],
+ bleuScoresHopeSample[0][0], bleuScoresFearSample[0][0],
+ modelScoresHopeSample[0][0], modelScoresFearSample[0][0],
+ learning_rate, rank, epoch);
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHopeSample[0][0] << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFearSample[0][0] << endl;
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate,
+ featureValuesHopeSample, featureValuesFearSample,
+ bleuScoresHopeSample, bleuScoresFearSample,
modelScoresHopeSample, modelScoresFearSample, learning_rate, rank, epoch);
+ }
+ }
}
else {
- update_status = ((MiraOptimiser*) optimiser)->updateWeights(mosesWeights, weightUpdate,
- featureValues, losses, bleuScores, modelScores, oracleFeatureValues, oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
+ if (summed) {
+ // don't differentiate between hope and model/fear; treat all hypotheses alike and sum the constraints
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsRankModelSummed(weightUpdate,
+ featureValues, bleuScores, modelScores, learning_rate, rank, epoch);
+ }
+ else
+ update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
+ featureValues, losses, bleuScores, modelScores, oracleFeatureValues, oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
}
}
// sumStillViolatedConstraints += update_status;
- // rescale LM feature
- if (scale_lm) {
- const LMList& lmList_new = staticData.GetLMList();
- for (LMList::const_iterator iter = lmList_new.begin(); iter != lmList_new.end(); ++iter) {
- // scale weight back down
- float lmWeight = mosesWeights.GetScoreForProducer(*iter);
- mosesWeights.Assign(*iter, lmWeight/scale_lm_factor);
- cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight rescaled from " << lmWeight << " to " << lmWeight/scale_lm_factor << endl;
- }
- }
-
- // rescale WP feature
- if (scale_wp) {
- // scale weight back down
- WordPenaltyProducer *wp = staticData.GetFirstWordPenaltyProducer();
- float wpWeight = mosesWeights.GetScoreForProducer(wp);
- mosesWeights.Assign(wp, wpWeight/scale_wp_factor);
- cerr << "Rank " << rank << ", epoch " << epoch << ", wp weight rescaled from " << wpWeight << " to " << wpWeight/scale_wp_factor << endl;
- }
-
- if (scale_all) {
- // rescale distortion
- DistortionScoreProducer *dp = staticData.GetDistortionScoreProducer();
- float dWeight = mosesWeights.GetScoreForProducer(dp);
- mosesWeights.Assign(dp, dWeight/scale_all_factor);
- cerr << "Rank " << rank << ", epoch " << epoch << ", distortion weight rescaled from " << dWeight << " to " << dWeight/scale_all_factor << endl;
-
- // rescale lexical reordering
- vector<LexicalReordering*> lr = staticData.GetLexicalReorderModels();
- for (size_t i=0; i<lr.size(); ++i) {
- vector<float> dWeights = mosesWeights.GetScoresForProducer(lr[i]);
- for (size_t j=0; j<dWeights.size(); ++j) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", d weight rescaled from " << dWeights[j];
- dWeights[j] /=scale_all_factor;
- cerr << " to " << dWeights[j] << endl;
- }
- mosesWeights.Assign(lr[i], dWeights);
- }
-
- // rescale phrase models
- vector<PhraseDictionaryFeature*> pd = staticData.GetPhraseDictionaryModels();
- for (size_t i=0; i<pd.size(); ++i) {
- vector<float> tWeights = mosesWeights.GetScoresForProducer(pd[i]);
- for (size_t j=0; j<tWeights.size(); ++j) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", t weight rescaled from " << tWeights[j];
- tWeights[j] /=scale_all_factor;
- cerr << " to " << tWeights[j] << endl;
- }
- mosesWeights.Assign(pd[i], tWeights);
- }
- }
-
if (update_status == 0) { // if weights were updated
// apply weight update
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;
- if (l2_regularize) {
- weightUpdate.L2Regularize(l2_lambda);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << "l2-reg. on mosesWeights with lambda=" << l2_lambda << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", regularized update: " << weightUpdate << endl;
- }
-
if (feature_confidence) {
// update confidence counts based on weight update
confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);
// update feature learning rates
- featureLearningRates.UpdateLearningRates(decay, confidenceCounts, core_r0, sparse_r0);
+ featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
}
mosesWeights.PlusEquals(weightUpdate);
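The confidence-count mechanism above lives in ScoreComponentCollection, whose internals are outside this diff. A rough, hypothetical sketch of the idea -- per-feature learning rates that shrink as a feature accumulates confident updates -- with all names and the decay formula assumed rather than taken from the project:

#include <cmath>
#include <map>
#include <string>

// Hypothetical sketch only: the real UpdateConfidenceCounts/UpdateLearningRates
// are defined in ScoreComponentCollection, not in this diff.
struct FeatureLearningRates {
  std::map<std::string, float> confidence; // accumulated update mass per feature
  std::map<std::string, float> rate;       // current per-feature learning rate

  // Accumulate either signed updates or their magnitudes (assumed semantics).
  void UpdateConfidenceCounts(const std::map<std::string, float>& update, bool signedCounts) {
    for (std::map<std::string, float>::const_iterator it = update.begin(); it != update.end(); ++it)
      confidence[it->first] += signedCounts ? it->second : std::fabs(it->second);
  }

  // Features with high accumulated confidence get smaller rates (assumed decay form).
  void UpdateLearningRates(float decay, float r0) {
    for (std::map<std::string, float>::iterator it = confidence.begin(); it != confidence.end(); ++it)
      rate[it->first] = r0 / (1.0f + decay * std::fabs(it->second));
  }
};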
@@ -1817,6 +1690,7 @@ int main(int argc, char** argv) {
// set new Moses weights
decoder->setWeights(mosesWeights);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
// adjust bleu weight
if (bleu_weight_lm_adjust) {
@@ -1852,11 +1726,12 @@ int main(int argc, char** argv) {
// mix weights?
if (mix) {
#ifdef MPI_ENABLE
+ cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
// collect all weights in mixedWeights and divide by number of processes
mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);
// mix confidence counts
- mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
+ //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
ScoreComponentCollection totalBinary;
if (sparseAverage) {
ScoreComponentCollection binary;
@@ -1873,11 +1748,8 @@ int main(int argc, char** argv) {
mixedWeights.DivideEquals(size);
// divide confidence counts
- if (averageConfidenceCounts) {
- mixedConfidenceCounts.DivideEquals(size);
- cerr << "Rank " << rank << ", epoch " << epoch << ", average confidence counts." << endl;
- }
-
+ //mixedConfidenceCounts.DivideEquals(size);
+
// normalise weights after averaging
if (normaliseWeights) {
mixedWeights.L1Normalise();
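The mixing step above is the standard reduce-then-average pattern. A minimal, self-contained sketch of the same pattern with boost::mpi, using plain doubles instead of ScoreComponentCollection (SCCPlus plays the role of std::plus here); illustrative only:

#include <boost/mpi.hpp>
#include <functional>
namespace mpi = boost::mpi;

int main(int argc, char* argv[]) {
  mpi::environment env(argc, argv);
  mpi::communicator world;

  double localWeight = 1.0 * world.rank(); // stand-in for this process's weights
  double mixedWeight = 0.0;

  // sum all weights at rank 0, then average
  mpi::reduce(world, localWeight, mixedWeight, std::plus<double>(), 0);
  if (world.rank() == 0)
    mixedWeight /= world.size();

  // send the averaged value back to every process
  mpi::broadcast(world, mixedWeight, 0);
  localWeight = mixedWeight;
  return 0;
}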
@@ -1905,15 +1777,17 @@ int main(int argc, char** argv) {
<< pruned << " features pruned from cumulativeWeights." << endl;
}
- if (l1_regularize && weightMixingThisEpoch == mixingFrequency) {
- mixedWeights.L1Regularize(l1_lambda);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << "l1-reg. on mixedWeights with lambda=" << l1_lambda << endl;
-
- // subtract lambda from every weight in the sum --> multiply by number of updates
- cumulativeWeights.L1Regularize(l1_lambda*numberOfUpdatesThisEpoch);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << "l1-reg. on cumulativeWeights with lambda=" << l1_lambda*numberOfUpdatesThisEpoch << endl;
+ if (weightMixingThisEpoch == mixingFrequency) {
+ if (l1_regularize) {
+ size_t pruned = mixedWeights.SparseL1Regularize(l1_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
+ }
+ if (l2_regularize) {
+ mixedWeights.SparseL2Regularize(l2_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
+ }
}
}
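SparseL1Regularize and SparseL2Regularize are called here but defined elsewhere. A hedged sketch of what such methods conventionally do -- L1 as soft-thresholding that prunes weights crossing zero (matching the "pruned" count logged above), L2 as multiplicative shrinkage -- assuming a plain map in place of ScoreComponentCollection:

#include <cmath>
#include <cstddef>
#include <map>
#include <string>

// Assumed, conventional forms; the project's actual bodies are not in this diff.

// L1: soft-threshold each sparse weight by lambda; prune weights that reach zero.
std::size_t SparseL1Regularize(std::map<std::string, float>& w, float lambda) {
  std::size_t pruned = 0;
  for (std::map<std::string, float>::iterator it = w.begin(); it != w.end(); ) {
    float shrunk = std::fabs(it->second) - lambda;
    if (shrunk <= 0) { w.erase(it++); ++pruned; }                  // weight crossed zero
    else { it->second = it->second > 0 ? shrunk : -shrunk; ++it; }
  }
  return pruned;                                                   // matches the logged count
}

// L2: multiplicative shrinkage towards zero; never prunes.
void SparseL2Regularize(std::map<std::string, float>& w, float lambda) {
  for (std::map<std::string, float>::iterator it = w.begin(); it != w.end(); ++it)
    it->second *= (1.0f - lambda);
}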
@@ -1923,8 +1797,8 @@ int main(int argc, char** argv) {
mosesWeights = mixedWeights;
// broadcast summed confidence counts
- mpi::broadcast(world, mixedConfidenceCounts, 0);
- confidenceCounts = mixedConfidenceCounts;
+ //mpi::broadcast(world, mixedConfidenceCounts, 0);
+ //confidenceCounts = mixedConfidenceCounts;
#endif
#ifndef MPI_ENABLE
@@ -2021,6 +1895,25 @@ int main(int argc, char** argv) {
cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
mixedAverageWeights.Save(filename.str());
++weightEpochDump;
+
+ if (weightEpochDump == weightDumpFrequency) {
+ if (l1_regularize) {
+ size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
+ }
+ if (l2_regularize) {
+ mixedAverageWeights.SparseL2Regularize(l2_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
+ }
+
+ if (l1_regularize || l2_regularize) {
+ filename << "_reg";
+ cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
+ mixedAverageWeights.Save(filename.str());
+ }
+ }
if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
// print out all features with counts
@@ -2042,11 +1935,11 @@ int main(int argc, char** argv) {
} // end of shard loop, end of this epoch
- if (printNbestWithFeatures && rank == 0 && epoch == 0) {
- cerr << "Writing out hope/fear nbest list with features: " << f1 << ", " << f2 << endl;
- hopePlusFeatures.close();
- fearPlusFeatures.close();
- }
+ /*if (printNbestWithFeatures && rank == 0 && epoch == 0) {
+ cerr << "Writing out hope/fear nbest list with features: " << f1 << ", " << f2 << endl;
+ hopePlusFeatures.close();
+ fearPlusFeatures.close();
+ }*/
if (historyBleu) {
cerr << "Bleu feature history after epoch " << epoch << endl;
@@ -2133,6 +2026,13 @@ int main(int argc, char** argv) {
time(&now);
cerr << "Rank " << rank << ", " << ctime(&now);
+ if (rank == 0) {
+ ScoreComponentCollection dummy;
+ ostringstream endfilename;
+ endfilename << "finished";
+ dummy.Save(endfilename.str());
+ }
+
delete decoder;
exit(0);
}
@@ -2147,7 +2047,7 @@ bool loadSentences(const string& filename, vector<string>& sentences) {
return true;
}
-bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, const vector<const ScoreProducer*> &featureFunctions) {
+/*bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, const vector<const ScoreProducer*> &featureFunctions) {
ifstream in(filename.c_str());
if (!in)
return false;
@@ -2180,10 +2080,10 @@ bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, c
store_weights.push_back(weight);
if (store_weights.size() == featureFunctions[i]->GetNumScoreComponents()) {
coreWeightMap.insert(ProducerWeightPair(featureFunctions[i], store_weights));
- /*cerr << "insert " << store_weights.size() << " weights for " << featureFunctions[i]->GetScoreProducerDescription() << " (";
+ cerr << "insert " << store_weights.size() << " weights for " << featureFunctions[i]->GetScoreProducerDescription() << " (";
for (size_t j=0; j < store_weights.size(); ++j)
cerr << store_weights[j] << " ";
- cerr << ")" << endl;*/
+ cerr << ")" << endl;
store_weights.clear();
}
}
@@ -2191,7 +2091,7 @@ bool loadCoreWeights(const string& filename, ProducerWeightMap& coreWeightMap, c
}
}
return true;
-}
+}*/
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size) {
if (mix_or_dump_base == 0) return 0;
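Only the zero-base guard of evaluateModulo falls inside this hunk. As a hedged reconstruction of the apparent contract -- return true when the current batch crosses a multiple of the mix/dump interval -- with the body assumed, not copied from the source:

#include <cstddef>

// Hypothetical reconstruction -- only the zero-base guard is visible in the hunk.
bool evaluateModuloSketch(std::size_t shard_position, std::size_t mix_or_dump_base,
                          std::size_t actual_batch_size) {
  if (mix_or_dump_base == 0) return false;
  // fire if any position covered by the current batch lands on a multiple of the base
  for (std::size_t i = 0; i < actual_batch_size; ++i)
    if ((shard_position + i) % mix_or_dump_base == 0) return true;
  return false;
}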
@@ -2369,7 +2269,7 @@ void scaleFeatureScore(ScoreProducer *sp, float scaling_factor, vector<vector<Sc
for (size_t i=0; i<featureValues.size(); ++i) { // each item in batch
for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
featureScore = featureValues[i][j].GetScoreForProducer(sp);
- featureValues[i][j].Assign(sp, featureScore/scaling_factor);
+ featureValues[i][j].Assign(sp, featureScore*scaling_factor);
//cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
}
}
@@ -2383,7 +2283,7 @@ void scaleFeatureScores(ScoreProducer *sp, float scaling_factor, vector<vector<S
for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
for (size_t k=0; k<featureScores.size(); ++k)
- featureScores[k] /= scaling_factor;
+ featureScores[k] *= scaling_factor;
featureValues[i][j].Assign(sp, featureScores);
//cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
}
diff --git a/mira/Main.h b/mira/Main.h
index 690a475d6..ff6e18d7b 100755..100644
--- a/mira/Main.h
+++ b/mira/Main.h
@@ -44,7 +44,7 @@ struct RandomIndex {
//void OutputNBestList(const MosesChart::TrellisPathList &nBestList, const TranslationSystem* system, long translationId);
bool loadSentences(const std::string& filename, std::vector<std::string>& sentences);
-bool loadCoreWeights(const std::string& filename, ProducerWeightMap& coreWeightMap, const std::vector<const Moses::ScoreProducer*> &featureFunctions);
+//bool loadCoreWeights(const std::string& filename, ProducerWeightMap& coreWeightMap, const std::vector<const Moses::ScoreProducer*> &featureFunctions);
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size);
void printFeatureValues(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues);
void ignoreCoreFeatures(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, ProducerWeightMap &coreWeightMap);
diff --git a/mira/Makefile.am b/mira/Makefile.am
index cd490c853..cd490c853 100755..100644
--- a/mira/Makefile.am
+++ b/mira/Makefile.am
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index c3ccfb011..a0fb07238 100755..100644
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -8,7 +8,6 @@ using namespace std;
namespace Mira {
size_t MiraOptimiser::updateWeights(
- ScoreComponentCollection& currWeights,
ScoreComponentCollection& weightUpdate,
const vector<vector<ScoreComponentCollection> >& featureValues,
const vector<vector<float> >& losses,
@@ -152,7 +151,6 @@ size_t MiraOptimiser::updateWeights(
}
size_t MiraOptimiser::updateWeightsHopeFear(
- Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
@@ -315,7 +313,6 @@ size_t MiraOptimiser::updateWeightsHopeFear(
}
size_t MiraOptimiser::updateWeightsAnalytically(
- ScoreComponentCollection& currWeights,
ScoreComponentCollection& weightUpdate,
ScoreComponentCollection& featureValuesHope,
ScoreComponentCollection& featureValuesFear,
@@ -457,7 +454,6 @@ size_t MiraOptimiser::updateWeightsAnalytically(
}
size_t MiraOptimiser::updateWeightsRankModel(
- Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector<std::vector<float> >& bleuScores,
@@ -478,11 +474,12 @@ size_t MiraOptimiser::updateWeightsRankModel(
float oldDistanceFromOptimum = 0;
// iterate over input sentences (1 (online) or more (batch))
- float minBleuDifference = 1.0;
+ float minBleuDifference = 0.005;
for (size_t i = 0; i < featureValues.size(); ++i) {
// Build all pairs where the first has higher Bleu than the second
for (size_t j = 0; j < featureValues[i].size(); ++j) {
for (size_t k = 0; k < featureValues[i].size(); ++k) {
+ if (j == k) continue;
if (bleuScores[i][j] - minBleuDifference < bleuScores[i][k]) // need at least a positive Bleu difference of specified amount
continue;
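For clarity, the pair filter above restated as a standalone predicate (illustrative, not part of the patch): an ordered pair (j, k) yields a constraint only when j != k and hypothesis j beats hypothesis k by at least minBleuDifference in BLEU.

#include <cstddef>
#include <vector>

// Constraint filter used above, restated standalone (illustrative only).
bool makesConstraint(std::size_t j, std::size_t k,
                     const std::vector<float>& bleu, float minBleuDifference) {
  if (j == k) return false;                       // skip identical hypotheses
  return bleu[j] - minBleuDifference >= bleu[k];  // j must beat k by the margin
}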
@@ -598,5 +595,419 @@ size_t MiraOptimiser::updateWeightsRankModel(
return 0;
}
+size_t MiraOptimiser::updateWeightsHopeFearSelective(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition) {
+
+ // vector of feature value differences for all created constraints
+ vector<ScoreComponentCollection> nonZeroFeatures;
+ vector<float> lossMinusModelScoreDiffs;
+
+ // Make constraints for new hypothesis translations
+ float epsilon = 0.0001;
+ int violatedConstraintsBefore = 0;
+
+ // iterate over input sentences (1 (online) or more (batch))
+ for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+ if (updatePosition != -1) {
+ if (i < updatePosition)
+ continue;
+ else if (i > updatePosition)
+ break;
+ }
+
+ // Pair the j-th hope translation with the j-th fear translation for one input sentence
+ for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+ ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+ featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ // check if constraint is violated
+ float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j];
+ float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
+ float diff = 0;
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
+ // iterate over difference vector and add a constraint for every non-zero feature
+ FVector features = featureValueDiff.GetScoresVector();
+ size_t n_core = 0, n_sparse = 0, n_sparse_hope = 0, n_sparse_fear = 0;
+ for (size_t c=0; c<features.coreSize(); ++c) {
+ if (features[c] != 0.0) {
+ ++n_core;
+ ScoreComponentCollection f;
+ f.Assign(c, features[c]);
+ nonZeroFeatures.push_back(f);
+ }
+ }
+ // 1 + 2
+ /*for (FVector::iterator i = features.begin(); i != features.end(); ++i) {
+ if (i->second != 0.0) {
+ ++n_sparse;
+ ScoreComponentCollection f;
+ f.Assign((i->first).name(), i->second);
+ nonZeroFeatures.push_back(f);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl;
+ }
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", non-zero features: " << nonZeroFeatures.size() << endl;*/
+
+ // 3
+ vector<ScoreComponentCollection> nonZeroFeaturesHope;
+ vector<ScoreComponentCollection> nonZeroFeaturesFear;
+ for (FVector::iterator it = features.begin(); it != features.end(); ++it) {
+ if (it->second != 0.0) {
+ ScoreComponentCollection f;
+ f.Assign((it->first).name(), it->second);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl;
+
+ if (it->second > 0.0) {
+ ++n_sparse_hope;
+ nonZeroFeaturesHope.push_back(f);
+ }
+ else {
+ ++n_sparse_fear;
+ nonZeroFeaturesFear.push_back(f);
+ }
+ }
+ }
+
+ //1
+ /*float n = n_core + n_sparse;
+ for (size_t i=0; i<n; ++i)
+ lossMinusModelScoreDiffs.push_back(diff/n);
+
+ //2
+ float diff_10 = diff * 0.1;
+ float diff_90 = diff * 0.9;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", core diff: " << diff_10/n_core << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", sparse diff: " << diff_90/n_sparse << endl;
+ for (size_t i=0; i<n_core; ++i)
+ lossMinusModelScoreDiffs.push_back(diff_10/n_core);
+ for (size_t i=0; i<n_sparse; ++i)
+ lossMinusModelScoreDiffs.push_back(diff_90/n_sparse);*/
+
+ // 3
+ float n = n_core + n_sparse_hope + n_sparse_fear;
+ for (size_t i=0; i<n_core; ++i)
+ lossMinusModelScoreDiffs.push_back(diff/n);
+ for (size_t i=0; i<n_sparse_hope; ++i) {
+ nonZeroFeatures.push_back(nonZeroFeaturesHope[i]);
+ lossMinusModelScoreDiffs.push_back((diff/n)*1.1);
+ }
+ for (size_t i=0; i<n_sparse_fear; ++i) {
+ nonZeroFeatures.push_back(nonZeroFeaturesFear[i]);
+ lossMinusModelScoreDiffs.push_back(diff/n);
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", core diff: " << diff/n << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope diff: " << ((diff/n)*1.1) << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", fear diff: " << diff/n << endl;
+ }
+ }
+
+ assert(nonZeroFeatures.size() == lossMinusModelScoreDiffs.size());
+
+ // run optimisation: compute alphas for all given constraints
+ vector<float> alphas;
+ ScoreComponentCollection summedUpdate;
+ if (violatedConstraintsBefore > 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << nonZeroFeatures.size() << endl;
+ alphas = Hildreth::optimise(nonZeroFeatures, lossMinusModelScoreDiffs, m_slack);
+
+ // Update the weight vector according to the alphas and the feature value differences
+ // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
+ for (size_t k = 0; k < nonZeroFeatures.size(); ++k) {
+ float alpha = alphas[k];
+ cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
+ if (alpha != 0) {
+ ScoreComponentCollection update(nonZeroFeatures[k]);
+ update.MultiplyEquals(alpha);
+
+ // sum updates
+ summedUpdate.PlusEquals(update);
+ }
+ }
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ // return 0;
+ return 1;
+ }
+
+ // apply learning rate
+ if (learning_rate != 1) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl;
+ summedUpdate.MultiplyEquals(learning_rate);
+ }
+
+ // scale update by BLEU of oracle (for batch size 1 only)
+ if (featureValuesHope.size() == 1) {
+ if (m_scale_update) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling summed update with oracle bleu score " << bleuScoresHope[0][0] << endl;
+ summedUpdate.MultiplyEquals(bleuScoresHope[0][0]);
+ }
+ }
+
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
+ weightUpdate.PlusEquals(summedUpdate);
+ return 0;
+}
+
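A minimal sketch of the "selective" decomposition implemented above (variant 3 in the code): one violated hope-fear constraint is split into per-feature constraints, with hope-side (positive-difference) features given a 1.1x share of the violation. The sketch simplifies by not distinguishing core from sparse features; the struct and function names are illustrative, and only the splitting rule mirrors the diff:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Illustrative names; only the splitting rule mirrors the diff.
struct PerFeatureConstraint { std::string name; float diff; float loss; };

std::vector<PerFeatureConstraint> decompose(
    const std::vector<std::pair<std::string, float> >& featureDiff, // hope - fear
    float violation) {
  std::vector<PerFeatureConstraint> out;
  std::size_t nz = 0;
  for (std::size_t i = 0; i < featureDiff.size(); ++i)
    if (featureDiff[i].second != 0.0f) ++nz;             // count active features
  if (nz == 0) return out;                               // features equal --> skip
  for (std::size_t i = 0; i < featureDiff.size(); ++i) {
    if (featureDiff[i].second == 0.0f) continue;
    float share = violation / static_cast<float>(nz);    // equal split of the violation
    if (featureDiff[i].second > 0.0f) share *= 1.1f;     // hope-side features get a boost
    PerFeatureConstraint c = { featureDiff[i].first, featureDiff[i].second, share };
    out.push_back(c);
  }
  return out;
}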
+size_t MiraOptimiser::updateWeightsHopeFearSummed(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ bool rescaleSlack,
+ bool rewardHope,
+ bool makePairs) {
+
+ // summed feature value differences over all created constraints
+ ScoreComponentCollection averagedFeatureDiffs;
+ float averagedViolations = 0;
+
+ // Make constraints for new hypothesis translations
+ float epsilon = 0.0001;
+ int violatedConstraintsBefore = 0;
+
+ if (!makePairs) {
+ ScoreComponentCollection featureValueDiff;
+ float lossHope = 0, lossFear = 0, modelScoreHope = 0, modelScoreFear = 0, hopeCount = 0, fearCount = 0;
+ // add all hope vectors
+ for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+ for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+ featureValueDiff.PlusEquals(featureValuesHope[i][j]);
+ lossHope += bleuScoresHope[i][j];
+ modelScoreHope += modelScoresHope[i][j];
+ ++hopeCount;
+ }
+ }
+ lossHope /= hopeCount;
+ modelScoreHope /= hopeCount;
+
+ // subtract all fear vectors
+ for (size_t i = 0; i < featureValuesFear.size(); ++i) {
+ for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
+ featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+ lossFear += bleuScoresFear[i][j];
+ modelScoreFear += modelScoresFear[i][j];
+ ++fearCount;
+ }
+ }
+ lossFear /= fearCount;
+ modelScoreFear /= fearCount;
+
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ return 1;
+ }
+
+ // check if constraint is violated
+ float lossDiff = lossHope - lossFear;
+ float modelScoreDiff = modelScoreHope - modelScoreFear;
+ float diff = 0;
+ if (lossDiff > modelScoreDiff)
+ diff = lossDiff - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " <<\
+ diff << ")" << endl;
+
+ // add constraint
+ averagedFeatureDiffs = featureValueDiff;
+ averagedViolations = diff;
+ }
+ else {
+ // iterate over input sentences (1 (online) or more (batch))
+ for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+ // Pair the j-th hope translation with the j-th fear translation for one input sentence and sum the pairs
+ for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+ ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+ featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ // check if constraint is violated
+ float lossDiff = bleuScoresHope[i][j] - bleuScoresFear[i][j];
+ float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
+ if (rescaleSlack) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", modelScoreDiff scaled by lossDiff: " << modelScoreDiff << " --> " << modelScoreDiff*lossDiff << endl;
+ modelScoreDiff *= lossDiff;
+ }
+ float diff = 0;
+ if (lossDiff > modelScoreDiff)
+ diff = lossDiff - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl;
+
+ // add constraint
+ if (rescaleSlack) {
+ featureValueDiff.MultiplyEquals(lossDiff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", featureValueDiff scaled by lossDiff." << endl;
+ }
+ averagedFeatureDiffs.PlusEquals(featureValueDiff);
+ averagedViolations += diff;
+ }
+ }
+ }
+
+ // divide by number of constraints (1/n)
+ if (!makePairs) {
+ averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
+ }
+ else {
+ averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
+ averagedViolations /= featureValuesHope[0].size();
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged feature diffs: " << averagedFeatureDiffs << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl;
+
+ if (violatedConstraintsBefore > 0) {
+ if (rewardHope) {
+ averagedFeatureDiffs.RedistributeMass(0.1);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", redistributed feature diffs: " << averagedFeatureDiffs << endl;
+ }
+
+ // compute alpha for given constraint: (loss diff - model score diff) / || feature value diff ||^2
+ // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
+ // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
+ // adjusted for 1 slack according to Joachims 2009, OP4 (margin rescaling), OP5 (slack rescaling)
+ float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm();
+ float alpha = averagedViolations / squaredNorm;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
+ if (m_slack > 0 ) {
+ if (alpha > m_slack) {
+ alpha = m_slack;
+ }
+ else if (alpha < m_slack*(-1)) {
+ alpha = m_slack*(-1);
+ }
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl;
+
+ // compute update
+ averagedFeatureDiffs.MultiplyEquals(alpha);
+ weightUpdate.PlusEquals(averagedFeatureDiffs);
+ return 0;
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ return 1;
+ }
+}
+
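Both summed updates finish with the same analytic step, worth restating standalone: alpha = averaged violation / ||averaged feature diff||^2, clipped to the slack C (Crammer & Singer 2006), after which the update is weightUpdate += alpha * averagedFeatureDiffs. A small illustrative helper, not project code:

#include <cassert>

// alpha = violation / ||diff||^2, clipped to [-C, C]; illustrative helper only.
float clippedAlpha(float averagedViolations, float squaredNorm, float slack) {
  assert(squaredNorm > 0);
  float alpha = averagedViolations / squaredNorm;
  if (slack > 0) {
    if (alpha > slack) alpha = slack;          // clip from above
    else if (alpha < -slack) alpha = -slack;   // clip from below
  }
  return alpha;
}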
+size_t MiraOptimiser::updateWeightsRankModelSummed(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValues,
+ const std::vector<std::vector<float> >& bleuScores,
+ const std::vector<std::vector<float> >& modelScores,
+ float learning_rate,
+ size_t rank,
+ size_t epoch) {
+ // summed feature value differences over all created constraints
+ ScoreComponentCollection averagedFeatureDiffs;
+ float averagedViolations = 0;
+
+ // Make constraints for new hypothesis translations
+ float epsilon = 0.0001;
+ int violatedConstraintsBefore = 0;
+
+ // iterate over input sentences (1 (online) or more (batch))
+ float minBleuDifference = 0.005;
+ size_t numConstraints = 0;
+ for (size_t i = 0; i < featureValues.size(); ++i) {
+ // Build all pairs where the first has higher Bleu than the second
+ for (size_t j = 0; j < featureValues[i].size(); ++j) {
+ for (size_t k = 0; k < featureValues[i].size(); ++k) {
+ if (j == k) continue;
+ if (bleuScores[i][j] - minBleuDifference < bleuScores[i][k]) // need at least a positive Bleu difference of specified amount
+ continue;
+
+ ScoreComponentCollection featureValueDiff = featureValues[i][j];
+ featureValueDiff.MinusEquals(featureValues[i][k]);
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ // check if constraint is violated
+ float loss = bleuScores[i][j] - bleuScores[i][k];
+ float modelScoreDiff = modelScores[i][j] - modelScores[i][k];
+ float diff = 0;
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
+ // add constraint
+ averagedFeatureDiffs.PlusEquals(featureValueDiff);
+ averagedViolations += diff;
+ ++numConstraints;
+ }
+ }
+ }
+
+ // divide by number of constraints
+ averagedFeatureDiffs.DivideEquals(numConstraints);
+ averagedViolations /= numConstraints;
+
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged feature diffs: " << averagedFeatureDiffs << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl;
+
+ if (violatedConstraintsBefore > 0) {
+ // compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
+ // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
+ // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
+ float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm();
+ float alpha = averagedViolations / squaredNorm;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
+ if (m_slack > 0 ) {
+ if (alpha > m_slack) {
+ alpha = m_slack;
+ }
+ else if (alpha < m_slack*(-1)) {
+ alpha = m_slack*(-1);
+ }
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl;
+
+ // compute update
+ averagedFeatureDiffs.MultiplyEquals(alpha);
+ weightUpdate.PlusEquals(averagedFeatureDiffs);
+ return 0;
+ }
+ else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ return 1;
+ }
+}
+
}
diff --git a/mira/MiraTest.cpp b/mira/MiraTest.cpp
index 774b324f8..774b324f8 100755..100644
--- a/mira/MiraTest.cpp
+++ b/mira/MiraTest.cpp
diff --git a/mira/Optimiser.h b/mira/Optimiser.h
index 8ad38bd5a..b91ad2c4f 100755..100644
--- a/mira/Optimiser.h
+++ b/mira/Optimiser.h
@@ -31,7 +31,6 @@ namespace Mira {
Optimiser() {}
virtual size_t updateWeightsHopeFear(
- Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
@@ -48,7 +47,6 @@ namespace Mira {
class Perceptron : public Optimiser {
public:
virtual size_t updateWeightsHopeFear(
- Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
@@ -80,8 +78,7 @@ namespace Mira {
m_normaliseMargin(normaliseMargin),
m_sigmoidParam(sigmoidParam) { }
- size_t updateWeights(Moses::ScoreComponentCollection& currWeights,
- Moses::ScoreComponentCollection& weightUpdate,
+ size_t updateWeights(Moses::ScoreComponentCollection& weightUpdate,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector<std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
@@ -92,8 +89,7 @@ namespace Mira {
float learning_rate,
size_t rank,
size_t epoch);
- virtual size_t updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
- Moses::ScoreComponentCollection& weightUpdate,
+ virtual size_t updateWeightsHopeFear(Moses::ScoreComponentCollection& weightUpdate,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
@@ -104,8 +100,31 @@ namespace Mira {
size_t rank,
size_t epoch,
int updatePosition = -1);
- size_t updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
- Moses::ScoreComponentCollection& weightUpdate,
+ size_t updateWeightsHopeFearSelective(Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition = -1);
+ size_t updateWeightsHopeFearSummed(Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ bool rescaleSlack,
+ bool rewardHope,
+ bool makePairs);
+ size_t updateWeightsAnalytically(Moses::ScoreComponentCollection& weightUpdate,
Moses::ScoreComponentCollection& featureValuesHope,
Moses::ScoreComponentCollection& featureValuesFear,
float bleuScoreHope,
@@ -115,15 +134,21 @@ namespace Mira {
float learning_rate,
size_t rank,
size_t epoch);
- size_t updateWeightsRankModel(Moses::ScoreComponentCollection& currWeights,
- Moses::ScoreComponentCollection& weightUpdate,
+ size_t updateWeightsRankModel(Moses::ScoreComponentCollection& weightUpdate,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector<std::vector<float> >& bleuScores,
const std::vector<std::vector<float> >& modelScores,
float learning_rate,
size_t rank,
size_t epoch);
-
+ size_t updateWeightsRankModelSummed(Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
+ const std::vector<std::vector<float> >& bleuScores,
+ const std::vector<std::vector<float> >& modelScores,
+ float learning_rate,
+ size_t rank,
+ size_t epoch);
+
void setSlack(float slack) {
m_slack = slack;
}
diff --git a/mira/Perceptron.cpp b/mira/Perceptron.cpp
index 409d2ba34..569a83216 100755..100644
--- a/mira/Perceptron.cpp
+++ b/mira/Perceptron.cpp
@@ -25,7 +25,6 @@ using namespace std;
namespace Mira {
size_t Perceptron::updateWeightsHopeFear(
- ScoreComponentCollection& currWeights,
ScoreComponentCollection& weightUpdate,
const vector< vector<ScoreComponentCollection> >& featureValuesHope,
const vector< vector<ScoreComponentCollection> >& featureValuesFear,
diff --git a/mira/expt.cfg b/mira/expt.cfg
index 416eb1d3f..416eb1d3f 100755..100644
--- a/mira/expt.cfg
+++ b/mira/expt.cfg
diff --git a/mira/mira.xcodeproj/project.pbxproj b/mira/mira.xcodeproj/project.pbxproj
index 67662f4e0..67662f4e0 100755..100644
--- a/mira/mira.xcodeproj/project.pbxproj
+++ b/mira/mira.xcodeproj/project.pbxproj
diff --git a/mira/training-expt.perl b/mira/training-expt.perl
index 03d780ea7..948a12fca 100755
--- a/mira/training-expt.perl
+++ b/mira/training-expt.perl
@@ -177,7 +177,7 @@ else {
$mixing_frequency = 1;
}
- print "Warning: mixing frequency must not be larger than shard size, setting mixing frequency to $mixing_frequency\n";
+ print STDERR "Warning: mixing frequency must not be larger than shard size, setting mixing frequency to $mixing_frequency\n";
}
}
@@ -188,7 +188,7 @@ else {
$weight_dump_frequency = 1;
}
- print "Warning: weight dump frequency must not be larger than shard size, setting weight dump frequency to $weight_dump_frequency\n";
+ print STDERR "Warning: weight dump frequency must not be larger than shard size, setting weight dump frequency to $weight_dump_frequency\n";
}
}
@@ -199,7 +199,7 @@ else {
$mixing_frequency = 1;
}
- print "Warning: mixing frequency must not be larger than (shard size/batch size), setting mixing frequency to $mixing_frequency\n";
+ print STDERR "Warning: mixing frequency must not be larger than (shard size/batch size), setting mixing frequency to $mixing_frequency\n";
}
}
@@ -210,7 +210,7 @@ else {
$weight_dump_frequency = 1;
}
- print "Warning: weight dump frequency must not be larger than (shard size/batch size), setting weight dump frequency to $weight_dump_frequency\n";
+ print STDERR "Warning: weight dump frequency must not be larger than (shard size/batch size), setting weight dump frequency to $weight_dump_frequency\n";
}
}
}
@@ -315,7 +315,7 @@ else {
close TRAIN;
if (! $execute) {
- print "Written train file: $train_script_file\n";
+ print STDERR "Written train file: $train_script_file\n";
exit 0;
}
@@ -336,7 +336,7 @@ my $train_iteration = -1;
# optionally continue from a later epoch (if $continue_from_epoch > 0)
if ($continue_from_epoch > 0) {
$train_iteration = $continue_from_epoch - 1;
- print "Continuing training from epoch $continue_from_epoch, with weights from ini file $moses_ini_file.\n";
+ print STDERR "Continuing training from epoch $continue_from_epoch, with weights from ini file $moses_ini_file.\n";
}
while(1) {
@@ -344,7 +344,7 @@ while(1) {
$train_iteration += 1; # starts at 0
my $new_weight_file = "$working_dir/$weight_file_stem" . "_";
if ($weight_dump_frequency == 0) {
- print "No weights, no testing..\n";
+ print STDERR "No weights, no testing..\n";
exit(0);
}
@@ -369,34 +369,48 @@ while(1) {
my $expected_num_files = $epochs*$weight_dump_frequency;
if ($wait_for_bleu) {
- print "Expected number of BLEU files: $expected_num_files \n";
+ print STDERR "Expected number of BLEU files: $expected_num_files \n";
}
if (-e "$working_dir/stopping") {
wait_for_bleu($expected_num_files) if ($wait_for_bleu);
- print "Training finished at " . scalar(localtime()) . " because stopping criterion was reached.\n";
+ print STDERR "Training finished at " . scalar(localtime()) . " because stopping criterion was reached.\n";
exit 0;
}
else {
- print "Waiting for $new_weight_file\n";
+ print STDERR "Waiting for $new_weight_file\n";
if (!$skipTrain) {
while ((! -e $new_weight_file) && &check_running($train_job_id)) {
sleep 10;
}
}
if (! -e $new_weight_file ) {
- wait_for_bleu($expected_num_files) if ($wait_for_bleu);
-
- print "Training finished at " . scalar(localtime()) . "\n";
- exit 0;
+ if (-e "$working_dir/stopping" or -e "$working_dir/finished") {
+ wait_for_bleu($expected_num_files) if ($wait_for_bleu);
+
+ print STDERR "Training finished at " . scalar(localtime()) . "\n";
+ exit 0;
+ }
+ else {
+ # training finished with error
+ print STDERR "Error: training was aborted at " . scalar(localtime()) . "\n";
+ exit 1;
+ }
}
}
#new weight file written. create test script and submit
my $suffix = "";
- print "weight file exists? ".(-e $new_weight_file)."\n";
+ print STDERR "weight file exists? ".(-e $new_weight_file)."\n";
if (!$skip_devtest) {
createTestScriptAndSubmit($epoch, $epoch_slice, $new_weight_file, $suffix, "devtest", $devtest_ini_file, $devtest_input_file, $devtest_reference_files, $skip_submit_test);
+
+ my $regularized_weight_file = $new_weight_file."_reg";
+ if (-e $regularized_weight_file) {
+ print STDERR "Submitting test script for regularized weights. \n";
+ $epoch_slice .= "_reg";
+ createTestScriptAndSubmit($epoch, $epoch_slice, $regularized_weight_file, $suffix, "devtest", $devtest_ini_file, $devtest_input_file, $devtest_reference_files, $skip_submit_test);
+ }
}
if (!$skip_dev) {
createTestScriptAndSubmit($epoch, $epoch_slice, $new_weight_file, $suffix, "dev", $moses_ini_file, $input_file, $reference_files, $skip_submit_test);
@@ -405,13 +419,14 @@ while(1) {
sub wait_for_bleu() {
my $expected_num_files = $_[0];
- print "Waiting for $expected_num_files bleu files..\n";
+ print STDERR "Waiting for $expected_num_files bleu files..\n";
my @bleu_files = glob("*.bleu");
while (scalar(@bleu_files) < $expected_num_files) {
- sleep 10;
+ sleep 30;
@bleu_files = glob("*.bleu");
+ #print STDERR "currently have ".(scalar(@bleu_files))."\n";
}
- print "$expected_num_files BLEU files completed, continue.\n";
+ print STDERR "$expected_num_files BLEU files completed, continue.\n";
}
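wait_for_bleu is a plain poll-count-sleep loop over *.bleu files. For consistency with the other examples on this page, the same pattern sketched in C++17 (names assumed, purely illustrative):

#include <chrono>
#include <cstddef>
#include <filesystem>
#include <thread>

// Poll a directory until `expected` *.bleu files exist, sleeping between scans.
void waitForBleuFiles(std::size_t expected, const std::filesystem::path& dir) {
  for (;;) {
    std::size_t found = 0;
    for (const auto& entry : std::filesystem::directory_iterator(dir))
      if (entry.path().extension() == ".bleu") ++found;
    if (found >= expected) return;                          // all scores present
    std::this_thread::sleep_for(std::chrono::seconds(30));  // poll like the script
  }
}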
sub createTestScriptAndSubmit {
@@ -525,8 +540,8 @@ sub createTestScriptAndSubmit {
}
close WEIGHTS;
- print "Number of core weights read: ".$readCoreWeights."\n";
- print "Number of extra weights read: ".$readExtraWeights."\n";
+ print STDERR "Number of core weights read: ".$readCoreWeights."\n";
+ print STDERR "Number of extra weights read: ".$readExtraWeights."\n";
# If there is a core weight file, we have to load the core weights from that file (NOTE: this is not necessary if the core weights are also printed to the weights file)
# if (defined $core_weight_file) {
@@ -696,7 +711,7 @@ sub createTestScriptAndSubmit {
if(!$skip_submit) {
if ($have_sge) {
if ($extra_memory_devtest) {
- print "Extra memory for test job: $extra_memory_devtest \n";
+ print STDERR "Extra memory for test job: $extra_memory_devtest \n";
&submit_job_sge_extra_memory($test_script_file,$extra_memory_devtest);
}
else {
@@ -768,13 +783,13 @@ sub check_exists_noThrow {
sub submit_job_sge {
my($script_file) = @_;
my $qsub_result = `qsub -P $queue $script_file`;
- print "SUBMIT CMD: qsub -P $queue $script_file\n";
+ print STDERR "SUBMIT CMD: qsub -P $queue $script_file\n";
if ($qsub_result !~ /Your job (\d+)/) {
- print "Failed to qsub job: $qsub_result\n";
+ print STDERR "Failed to qsub job: $qsub_result\n";
return 0;
}
my $job_name = basename($script_file);
- print "Submitted job: $job_name id: $1 " .
+ print STDERR "Submitted job: $job_name id: $1 " .
scalar(localtime()) . "\n";
return $1;
}
@@ -782,13 +797,13 @@ sub submit_job_sge {
sub submit_job_sge_extra_memory {
my($script_file,$extra_memory) = @_;
my $qsub_result = `qsub -pe $extra_memory -P $queue $script_file`;
- print "SUBMIT CMD: qsub -pe $extra_memory -P $queue $script_file \n";
+ print STDERR "SUBMIT CMD: qsub -pe $extra_memory -P $queue $script_file \n";
if ($qsub_result !~ /Your job (\d+)/) {
- print "Failed to qsub job: $qsub_result\n";
+ print STDERR "Failed to qsub job: $qsub_result\n";
return 0;
}
my $job_name = basename($script_file);
- print "Submitted job: $job_name id: $1 " .
+ print STDERR "Submitted job: $job_name id: $1 " .
scalar(localtime()) . "\n";
return $1;
}
@@ -802,10 +817,10 @@ sub submit_job_no_sge {
my $pid = undef;
if ($pid = fork) {
my $job_name = basename($script_file);
- print "Launched : $job_name pid: $pid " . scalar(localtime()) . "\n";
+ print STDERR "Launched : $job_name pid: $pid " . scalar(localtime()) . "\n";
return $pid;
} elsif (defined $pid) {
- print "Executing script $script_file, writing to $out and $err.\n";
+ print STDERR "Executing script $script_file, writing to $out and $err.\n";
`cd $working_dir; sh $script_file 1>$out 2> $err`;
exit;
} else {