| author | Barry Haddow <barry.haddow@gmail.com> | 2013-04-12 19:07:26 +0400 |
|---|---|---|
| committer | Barry Haddow <barry.haddow@gmail.com> | 2013-04-12 19:07:26 +0400 |
| commit | 9d42c7f6f74bbb0079768a762fc4546d20d6b634 (patch) | |
| tree | ab1a2a2884a3b3b809a969ea0eb36fb98416347e | |
| parent | c5965b8587b37986ebab786905a8ef9f218403de (diff) | |
| parent | 517d6c7bb834e40bcf25e8cbc79985180cb7f29f (diff) | |

Merge branch 'master' of github.com:moses-smt/mosesdecoder
98 files changed, 2718 insertions, 348 deletions
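The most user-visible change in this merge is to biconcor: `Print()` is split into `Print()`/`PrintPretty()`, the translation and example limits become the `--translations` and `--examples` options, and a new `--stdio` mode answers queries over stdin/stdout between `-|||- BICONCOR START -|||-` and `-|||- BICONCOR END -|||-` sentinels. Below is a minimal client sketch for that mode — it is not part of the commit; the binary path, model file name, and query phrase are placeholders, while the option names, sentinel strings, and the `TOTAL:` reply line are taken from the biconcor.cpp diff that follows.

```python
# Minimal sketch of a client for the new "biconcor --stdio" mode.
# Assumptions: "./biconcor" and "model-file" are placeholder paths;
# sentinels and option names match biconcor.cpp in the diff below.
import subprocess

START = "-|||- BICONCOR START -|||-"
END = "-|||- BICONCOR END -|||-"

proc = subprocess.Popen(
    ["./biconcor", "--load", "model-file", "--stdio",
     "--translations", "20", "--examples", "50"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
    universal_newlines=True, bufsize=1)

# The binary announces readiness with the START sentinel once the
# suffix array, target corpus and alignment have been loaded.
line = proc.stdout.readline()
while line and line.strip() != START:
    line = proc.stdout.readline()

def query(phrase):
    """Send one source phrase; collect reply lines up to the END sentinel.
    The first reply line is "TOTAL: <n>", the number of matches found."""
    proc.stdin.write(phrase + "\n")
    proc.stdin.flush()
    reply = []
    while True:
        out = proc.stdout.readline()
        if not out or out.strip() == END:
            return reply
        reply.append(out.rstrip("\n"))

print(query("the house"))
proc.stdin.close()
```

One query per line is the whole protocol, so the same loop works whether the server was started with `--html`, `--pretty`, or plain output; only the lines between `TOTAL:` and the END sentinel change shape.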
diff --git a/.gitmodules b/.gitmodules index e69de29bb..d3a8cb4da 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "contrib/arrow-pipelines/python/libs/pypeline"] + path = contrib/arrow-pipelines/python/libs/pypeline + url = git://github.com/ianj-als/pypeline.git diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt index 318956ccd..3dac64f60 100644 --- a/BUILD-INSTRUCTIONS.txt +++ b/BUILD-INSTRUCTIONS.txt @@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES Generally, for trouble installing external libraries, you should get support directly from the library maker: -Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html +Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user @@ -1,3 +1,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations). This code includes data from czech wiktionary (also czech abbreviations). + + diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp index 5f6da5a33..5d4e0be8d 100644 --- a/OnDiskPt/Main.cpp +++ b/OnDiskPt/Main.cpp @@ -174,6 +174,7 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr break; } default: + cerr << "ERROR in line " << line << endl; assert(false); break; } diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp index 9c16be77c..038fa3a31 100644 --- a/biconcor/PhrasePair.cpp +++ b/biconcor/PhrasePair.cpp @@ -8,7 +8,42 @@ using namespace std; -void PhrasePair::Print( ostream* out, int width ) const +void PhrasePair::Print( ostream* out ) const +{ + // source + int sentence_start = m_source_position - m_source_start; + char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) ); + + for( char i=0; i<source_length; i++ ) { + if (i>0) *out << " "; + *out << m_suffixArray->GetWord( sentence_start + i ); + } + + // target + *out << " |||"; + for( char i=0; i<m_target_length; i++ ) { + *out << " " << m_targetCorpus->GetWord( m_sentence_id, i); + } + + // source span + *out << " ||| " << (int)m_source_start << " " << (int)m_source_end; + + // target span + *out << " ||| " << (int)m_target_start << " " << (int)m_target_end; + + // word alignment + *out << " |||"; + + INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id ); + for( INDEX i=0; i<ap_points; i++) { + *out << " " << m_alignment->GetSourceWord( m_sentence_id, i ) + << "-" << m_alignment->GetTargetWord( m_sentence_id, i ); + } + + *out << endl; +} + +void PhrasePair::PrintPretty( ostream* out, int width ) const { vector< WORD_ID >::const_iterator t; diff --git a/biconcor/PhrasePair.h b/biconcor/PhrasePair.h index f8a7881a0..f1dadb637 100644 --- a/biconcor/PhrasePair.h +++ b/biconcor/PhrasePair.h @@ -43,7 +43,8 @@ public: ~PhrasePair () {} void PrintTarget( std::ostream* out ) const; - void Print( std::ostream* out, int width ) const; + void Print( std::ostream* out ) const; + void PrintPretty( std::ostream* out, int width ) const; void PrintHTML( std::ostream* out ) const; void PrintClippedHTML( std::ostream* out, int width ) const; }; diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp index 17c95d24a..7497b2af8 100644 --- a/biconcor/PhrasePairCollection.cpp +++ b/biconcor/PhrasePairCollection.cpp @@ -13,31 +13,32 @@ using namespace std; -PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a 
) +PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example ) :m_suffixArray(sa) ,m_targetCorpus(tc) ,m_alignment(a) ,m_size(0) - ,m_max_lookup(10000) - ,m_max_pp_target(50) - ,m_max_pp(50) + ,m_max_lookup(10000) // maximum number of source occurrences sampled + ,m_max_translation(max_translation) // max number of different distinct translations returned + ,m_max_example(max_example) // max number of examples returned for each distinct translation {} PhrasePairCollection::~PhrasePairCollection() {} -bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) +int PhrasePairCollection::GetCollection( const vector< string >& sourceString ) { INDEX first_match, last_match; if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) { - return false; + return 0; } - cerr << "\tfirst match " << first_match << endl; - cerr << "\tlast match " << last_match << endl; + //cerr << "\tfirst match " << first_match << endl; + //cerr << "\tlast match " << last_match << endl; INDEX found = last_match - first_match +1; map< vector< WORD_ID >, INDEX > index; + int real_count = 0; for( INDEX i=first_match; i<=last_match; i++ ) { int position = m_suffixArray->GetPosition( i ); int source_start = m_suffixArray->GetWordInSentence( position ); @@ -45,23 +46,23 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) INDEX sentence_id = m_suffixArray->GetSentence( position ); int sentence_length = m_suffixArray->GetSentenceLength( sentence_id ); int target_length = m_targetCorpus->GetSentenceLength( sentence_id ); - cerr << "match " << (i-first_match) - << " in sentence " << sentence_id - << ", starting at word " << source_start - << " of " << sentence_length - << ". target sentence has " << target_length << " words."; + //cerr << "match " << (i-first_match) + //<< " in sentence " << sentence_id + //<< ", starting at word " << source_start + //<< " of " << sentence_length + //<< ". 
target sentence has " << target_length << " words."; int target_start, target_end, pre_null, post_null; if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { - cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; - cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; + //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; + //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; bool null_boundary_words = false; for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) { for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) { vector< WORD_ID > targetString; - cerr << "; "; + //cerr << "; "; for (int target = target_start - pre; target <= target_end + post; target++) { targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) ); - cerr << m_targetCorpus->GetWord( sentence_id, target) << " "; + //cerr << m_targetCorpus->GetWord( sentence_id, target) << " "; } PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post); // matchCollection.Add( sentence_id, ) @@ -76,37 +77,47 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) } } else { - cerr << "mismatch " << (i-first_match) - << " in sentence " << sentence_id - << ", starting at word " << source_start - << " of " << sentence_length - << ". target sentence has " << target_length << " words."; + //cerr << "mismatch " << (i-first_match) + // << " in sentence " << sentence_id + // << ", starting at word " << source_start + // << " of " << sentence_length + // << ". 
target sentence has " << target_length << " words."; Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); if (mismatch->Unaligned()) m_unaligned.push_back( mismatch ); else m_mismatch.push_back( mismatch ); } - cerr << endl; + //cerr << endl; if (found > (INDEX)m_max_lookup) { i += found/m_max_lookup-1; } + real_count++; } sort(m_collection.begin(), m_collection.end(), CompareBySize()); - return true; + return real_count; } -void PhrasePairCollection::Print() const +void PhrasePairCollection::Print(bool pretty) const { vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget; - for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) { + int i=0; + for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && i<m_max_translation; i++, ppWithSameTarget++ ) { (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); int count = ppWithSameTarget->size(); cout << "(" << count << ")" << endl; - vector< PhrasePair* >::const_iterator p; - for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) { - (*p)->Print( &cout, 100 ); + vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin(); + for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) { + if (pretty) { + (*p)->PrintPretty( &cout, 100 ); + } + else { + (*p)->Print( &cout ); + } + if (ppWithSameTarget->size() > m_max_example) { + p += ppWithSameTarget->size()/m_max_example-1; + } } } } @@ -117,7 +128,7 @@ void PhrasePairCollection::PrintHTML() const bool singleton = false; // loop over all translations vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget; - for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) { + for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) { int count = ppWithSameTarget->size(); if (!singleton) { @@ -143,9 +154,9 @@ void PhrasePairCollection::PrintHTML() const int i=0; for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_pp) { - p += count/m_max_pp-1; - pp += count/m_max_pp-1; + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; } } if (i == 10 && pp < count) { @@ -153,11 +164,11 @@ void PhrasePairCollection::PrintHTML() const cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>"; cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">"; cout << "<table align=\"center\">"; - for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { + for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_pp) { - p += count/m_max_pp-1; - pp += count/m_max_pp-1; + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; } } } @@ -172,7 +183,7 @@ void PhrasePairCollection::PrintHTML() const if (singleton) cout << 
"</table></div>\n"; else if (pp_target > 9) cout << "</div>"; - size_t max_mismatch = m_max_pp/3; + size_t max_mismatch = m_max_example/3; // unaligned phrases if (m_unaligned.size() > 0) { cout << "<p class=\"pp_singleton_header\">unaligned" diff --git a/biconcor/PhrasePairCollection.h b/biconcor/PhrasePairCollection.h index f88bfc10f..e076eba9b 100644 --- a/biconcor/PhrasePairCollection.h +++ b/biconcor/PhrasePairCollection.h @@ -22,19 +22,19 @@ private: std::vector< Mismatch* > m_mismatch, m_unaligned; int m_size; int m_max_lookup; - int m_max_pp_target; - int m_max_pp; + int m_max_translation; + int m_max_example; // No copying allowed. PhrasePairCollection(const PhrasePairCollection&); void operator=(const PhrasePairCollection&); public: - PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * ); + PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment *, int, int ); ~PhrasePairCollection (); - bool GetCollection( const std::vector<std::string >& sourceString ); - void Print() const; + int GetCollection( const std::vector<std::string >& sourceString ); + void Print(bool pretty) const; void PrintHTML() const; }; diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp index a25e63cb7..f4e7c03fb 100644 --- a/biconcor/biconcor.cpp +++ b/biconcor/biconcor.cpp @@ -19,8 +19,12 @@ int main(int argc, char* argv[]) int saveFlag = false; int createFlag = false; int queryFlag = false; - int htmlFlag = false; - string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n"; + int htmlFlag = false; // output as HTML + int prettyFlag = false; // output readable on screen + int stdioFlag = false; // receive requests from STDIN, respond to STDOUT + int max_translation = 20; + int max_example = 50; + string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n"; while(1) { static struct option long_options[] = { {"load", required_argument, 0, 'l'}, @@ -29,11 +33,15 @@ int main(int argc, char* argv[]) {"query", required_argument, 0, 'q'}, {"target", required_argument, 0, 't'}, {"alignment", required_argument, 0, 'a'}, - {"html", no_argument, &htmlFlag, 0}, + {"html", no_argument, 0, 'h'}, + {"pretty", no_argument, 0, 'p'}, + {"stdio", no_argument, 0, 'i'}, + {"translations", required_argument, 0, 'o'}, + {"examples", required_argument, 0, 'e'}, {0, 0, 0, 0} }; int option_index = 0; - int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index); + int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index); if (c == -1) break; switch (c) { case 'l': @@ -62,11 +70,29 @@ int main(int argc, char* argv[]) query = string(optarg); queryFlag = true; break; + case 'o': + max_translation = atoi(optarg); + break; + case 'e': + max_example = atoi(optarg); + break; + case 'p': + prettyFlag = true; + break; + case 'h': + htmlFlag = true; + break; + case 'i': + stdioFlag = true; + break; default: cerr << info; exit(1); } } + if (stdioFlag) { + queryFlag = true; + } // check if parameter settings are legal if (saveFlag && !createFlag) { @@ -111,12 +137,37 @@ int main(int argc, char* argv[]) targetCorpus.Load( fileNameSuffix ); alignment.Load( fileNameSuffix ); } - if (queryFlag) { + if (stdioFlag) { + cout << "-|||- BICONCOR START -|||-" << endl << flush; + 
while(true) { + string query; + if (getline(cin, query, '\n').eof()) { + return 0; + } + vector< string > queryString = alignment.Tokenize( query.c_str() ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); + int total = ppCollection.GetCollection( queryString ); + cout << "TOTAL: " << total << endl; + if (htmlFlag) { + ppCollection.PrintHTML(); + } + else { + ppCollection.Print(prettyFlag); + } + cout << "-|||- BICONCOR END -|||-" << endl << flush; + } + } + else if (queryFlag) { cerr << "query is " << query << endl; vector< string > queryString = alignment.Tokenize( query.c_str() ); - PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); ppCollection.GetCollection( queryString ); - ppCollection.PrintHTML(); + if (htmlFlag) { + ppCollection.PrintHTML(); + } + else { + ppCollection.Print(prettyFlag); + } } return 0; @@ -1,17 +1,17 @@ #!/bin/bash set -e +top="$(dirname "$0")" if bjam="$(which bjam 2>/dev/null)" && #exists [ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true ! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script "${bjam}" --sanity-test 2>/dev/null |grep Sane >/dev/null && #The test in jam-files/sanity.jam passes - (cd jam-files/fail && ! "${bjam}") >/dev/null #Returns non-zero on failure + (cd "${top}/jam-files/fail" && ! "${bjam}") >/dev/null #Returns non-zero on failure then #Delegate to system bjam exec "${bjam}" "$@" fi -top="$(dirname "$0")" if [ ! -x "$top"/jam-files/bjam ] || "$top"/jam-files/bjam -v |grep 2011.4 >/dev/null; then pushd "$top/jam-files/engine" ./build.sh diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia Binary files differnew file mode 100644 index 000000000..1d35a1dea --- /dev/null +++ b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README new file mode 100644 index 000000000..e1e12975c --- /dev/null +++ b/contrib/arrow-pipelines/python/README @@ -0,0 +1,32 @@ +Arrow Based Moses Training Pipeline +=================================== + +To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command: + +$ git submodule init + +This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline: + +$ cd libs/pypeline +$ python setup.py install + +Alternatively, you can set an appropriate PYTHONPATH enviornment variable to the Pypeline library. + +This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia. + +Three environment variables need to be set before the manager.py script can be run, they are: + + - MOSES_HOME : The directory where Moses has been cloned, or installed, + - IRSTLM : The installation directory of your IRSTLM, and + - GIZA_HOME : The installation directory of GIZA++. + +The manager.py script takes four positional command-line arguments: + + - The source language code, + - The target language code, + - The source corpus file. This file *must* be cleaned prior to use, and + - The target corpus file. 
This file *must* be cleaned prior to use. + +For example, run the manager.py script with: + +$ python manager.py en lt cleantrain.en cleantrain.lt diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline new file mode 160000 +Subproject a7084b686f5196f1bbac5d389b4a6cd7f15c83f diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py new file mode 100644 index 000000000..1c3ece111 --- /dev/null +++ b/contrib/arrow-pipelines/python/manager.py @@ -0,0 +1,192 @@ +import logging +import os + +from concurrent.futures import Future, ThreadPoolExecutor +from functools import partial +from pypeline.helpers.parallel_helpers import eval_pipeline, \ + cons_function_component, \ + cons_wire, \ + cons_split_wire, \ + cons_unsplit_wire, \ + cons_dictionary_wire + + +# +# Some logging please +# +FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s' +logging.basicConfig(format = FORMAT, level = logging.DEBUG) +logger = logging.getLogger("manager") + + +# Build the pipeline components +def build_components(components, configuration, executor): + pipeline_components = dict() + pipeline_configuration = dict() + + for component_id, module_name in components.items(): + logger.info("Loading [%s] component from [%s]..." % (component_id, module_name)) + + module = __import__(module_name, fromlist = ['configure', 'initialise']) + + # Component builds its own configuration object + config_func = getattr(module, 'configure') + component_config = config_func(configuration) + pipeline_configuration.update(component_config) + + # Now build the component + init_func = getattr(module, 'initialise') + component_function = init_func(component_config) + + # A wrapper for the component's function that submits to the executor + def get_component_function_wrapper(inner_function, comp_id, mod_name): + def component_function_wrapper(a, s): + logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \ + (comp_id, mod_name, a, s)) + return inner_function(a, s) + + return component_function_wrapper + + # Arrowize the component + component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name)) + + # And store + pipeline_components[component_id] = component + + return pipeline_components, pipeline_configuration + + +# Go! +def main(src_lang, trg_lang, src_filename, trg_filename): + # Global configuration + # One day, this configuration shall be constructed from + # command line options, or a properties file. + configuration = { + 'moses_installation_dir': os.environ['MOSES_HOME'], + 'irstlm_installation_dir': os.environ['IRSTLM'], + 'giza_installation_dir': os.environ['GIZA_HOME'], + 'src_lang': src_lang, + 'src_tokenisation_dir': './tokenisation', + 'trg_lang': trg_lang, + 'trg_tokenisation_dir': './tokenisation', + 'segment_length_limit': 60, + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': './language-model', + 'translation_model_directory': './translation-model', + 'mert_working_directory': './mert', + 'evaluation_data_size': 100, + 'development_data_size': 100 + } + + # The modules to load + # In the future, the components shall be specified in some kind + # pipeline description file. 
+ component_modules = { + 'src_tokenizer': 'training.components.tokenizer.src_tokenizer', + 'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer', + 'cleanup': 'training.components.cleanup.cleanup', + 'data_split': 'training.components.data_split.data_split', + 'irstlm_build': 'training.components.irstlm_build.irstlm_build', + 'model_training': 'training.components.model_training.model_training', + 'mert': 'training.components.mert.mert' + } + + # The thread pool + executor = ThreadPoolExecutor(max_workers = 3) + + # Phew, build the required components + components, component_config = build_components(component_modules, configuration, executor) + + # + # Wire up components + # Description of wiring should be, in the future, alongside the component + # specification in some kind of confuguration file. Components shall be + # declared then used, i.e., bind a component instance to a unique component + # identifier, then wire component instances together by identifier. + # + + # + # Tokenisation of source and target... + # + # IRSTLM Build components + irstlm_build_component = cons_split_wire() >> \ + (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \ + components['irstlm_build']).second() >> \ + cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'], + 'trg_language_model_filename': b['compiled_lm_filename']}) + + # The complete tokenisation component + tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \ + irstlm_build_component.second() >> \ + cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'], + 'trg_filename': b['tokenised_trg_filename'], + 'trg_language_model_filename': b['trg_language_model_filename']}) + + # + # Cleanup and Data Spliting... + # + + # + # A function that clips off the last '.' delimited string + # + def clip_last_bit(filename): + bn = os.path.basename(filename) + directory = os.path.dirname(filename) + bits = bn.split(".") + bits.pop() + return os.path.join(directory, ".".join(bits)) + + cleanup_datasplit_component = components['cleanup'] >> \ + cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'], + 'trg_filename': a['cleaned_trg_filename']}) >> \ + components['data_split'] >> \ + cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']), + 'eval_src_filename': a['eval_src_filename'], + 'eval_trg_filename': a['eval_trg_filename']}) + + # + # Translation model training + # + translation_model_component = cons_split_wire() >> \ + components['model_training'].first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': b['eval_src_filename']}) + + # + # The whole pipeline + # + pipeline = tokenisation_component >> \ + cons_split_wire() >> \ + (cleanup_datasplit_component >> translation_model_component).first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': clip_last_bit(t['development_data_filename']), + 'trg_language_model_filename': b['trg_language_model_filename'], + 'trg_language_model_order': 3, + 'trg_language_model_type': 9}) >> \ + components['mert'] + + + # + # The input to the pipeline + # + value = {'src_filename': src_filename, + 'trg_filename': trg_filename} + + # + # Evaluate the pipeline + # + logger.info("Evaluating pipeline with input [%s]..." 
% value) + new_value = eval_pipeline(executor, pipeline, value, component_config) + + # + # Wait for all components to finish + # + executor.shutdown(True) + + logger.info("Pipeline evaluated to %s" % new_value) + + +if __name__ == '__main__': + import sys + + main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/test/__init__.py diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py new file mode 100644 index 000000000..628796f7d --- /dev/null +++ b/contrib/arrow-pipelines/python/test/test.py @@ -0,0 +1,11 @@ +import subprocess + +def cat(filename, content): + fh = open(filename, "w") + for line in content: + #print(line, file=fh) + print >> fh, line + fh.close() + +def diff(filename1, filename2): + subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT) diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py new file mode 100644 index 000000000..cb2e057ce --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py @@ -0,0 +1,125 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print >>ofh1, l1, + print >>ofh2, l2, + + def _make_cleaned_filename(filename): + bits = filename.split(".") + bits[-1] = "clean" + return ".".join(bits) + + def _filter_main(value, config): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + print "Cleanup: Cleaning [%s] and [%s]..." 
% (input_src_filename, input_trg_filename) + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + + cleaned_src_filename = _make_cleaned_filename(input_src_filename) + cleaned_trg_filename = _make_cleaned_filename(input_trg_filename) + ofh1 = open(cleaned_src_filename, "w") + ofh2 = open(cleaned_trg_filename, "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': cleaned_src_filename, + 'cleaned_trg_filename': cleaned_trg_filename} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _filter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + try: + thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename']) + finally: + os.unlink(output['cleaned_src_filename']) + os.unlink(output['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py new file mode 100644 index 000000000..27625c612 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py @@ -0,0 +1,109 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print(l1, end='', file=ofh1) + print(l2, end='', file=ofh2) + + def _filter_main(config, value): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + ifh1 = open(value['src_filename'], "r") + ifh2 = open(value['trg_filename'], "r") + ofh1 = open(value['cleaned_src_filename'], "w") + ofh2 = open(value['cleaned_trg_filename'], "w") + + _filter(limit, ifh1, ofh1, 
ifh2, ofh2) + + return {'cleaned_src_filename': value['cleaned_src_filename'], + 'cleaned_trg_filename': value['cleaned_trg_filename']} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return cons_function_component(_filter_main) + + +if __name__ == '__main__': + import os + import tempfile + import training.components.shared.test as thelp + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_filename': src_filename[1] + ".clean", + 'cleaned_trg_filename': trg_filename[1] + ".clean", + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + + run_pipeline(box, box_config, box_eval) + thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py new file mode 100644 index 000000000..b8469cbf6 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py @@ -0,0 +1,146 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['evaluate_size'] = args['evaluation_data_size'] + result['development_size'] = args['development_data_size'] + return result + +def initialise(config): + + def _copy(size, inp, ofh1, ofh2): + try: + while size != 0: + (l1, l2) = inp.next() + print >>ofh1, l1, + print >>ofh2, l2, + size -= 1 + except StopIteration: + pass + + def _make_split_filename(filename, data_set): + bits = filename.split(".") + last = bits.pop() + lang_code = bits.pop() + + bits.append(last) + bits.append(data_set) + bits.append(lang_code) + + new_filename = ".".join(bits) + return new_filename + + def _splitter_main(value, config): + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + 
input_trg_filename = value['trg_filename'] + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + inp = iter(zip(ifh1, ifh2)) + + result = {} + for (data_set, size) in [ + ('devel', config['development_size']), + ('eval', config['evaluate_size']), + ('train', -1) + ]: + output_src_filename = _make_split_filename(input_src_filename, data_set) + output_trg_filename = _make_split_filename(input_trg_filename, data_set) + ofh1 = open(output_src_filename, "w") + ofh2 = open(output_trg_filename, "w") + + _copy(size, inp, ofh1, ofh2) + result[data_set + '_src_filename'] = output_src_filename + result[data_set + '_trg_filename'] = output_trg_filename + + return result + + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _splitter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = { + 'evaluation_data_size': 7, + 'development_data_size': 13, + } + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'devel_src_expected': src_filename[1] + ".devel.expected", + 'devel_trg_expected': trg_filename[1] + ".devel.expected", + 'eval_src_expected': src_filename[1] + ".eval.expected", + 'eval_trg_expected': trg_filename[1] + ".eval.expected", + 'train_src_expected': src_filename[1] + ".train.expected", + 'train_trg_expected': trg_filename[1] + ".train.expected", + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + for data_set in ['devel', 'eval', 'train']: + for lang in ['src', 'trg']: + filename = output[data_set + '_' + lang + '_filename'] + filename_expected = box_eval[data_set + '_' + lang + '_expected'] + thelp.diff(filename_expected, filename) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line(range(50))) + thelp.cat(box_eval['trg_filename'], _line(range(50))) + #expected output: + thelp.cat(box_eval['devel_src_expected'], _line(range(0,13))) + thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13))) + thelp.cat(box_eval['eval_src_expected'], _line(range(13,20))) + thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20))) + thelp.cat(box_eval['train_src_expected'], _line(range(20,50))) + thelp.cat(box_eval['train_trg_expected'], _line(range(20,50))) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py 
b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py new file mode 100644 index 000000000..f65d61973 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py @@ -0,0 +1,106 @@ +import os +import shutil +import subprocess +import tempfile + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + config = dict() + config['irstlm_install_directory'] = args['irstlm_installation_dir'] + config['smoothing_method'] = args['irstlm_smoothing_method'] + config['lm_directory'] = args['language_model_directory'] + return config + +def initialise(config): + def process(a, s): + # Create the LM directory if we need to + if os.path.exists(s['lm_directory']) is False: + os.makedirs(s['lm_directory']) + + # The filename of the file to chew through + start_end_input_filename = a['input_filename'] + if os.path.exists(start_end_input_filename) is False: + raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename) + + # Derive the output file name for the add start-end marker processor + filename_bits = os.path.basename(start_end_input_filename).split(".") + filename_bits[2] = "sb"; + start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the output file name of the LM build + filename_bits[2] = "lm" + lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the compiled LM file name + filename_bits[2] = "arpa" + compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # First thing to do is add start and end markers + start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")] + infile = open(start_end_input_filename, 'r') + outfile = open(start_end_output_filename, 'w') + print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline) + return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile) + if return_code: + raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \ + start_end_input_filename, start_end_output_filename, return_code) + + # Next build the language model + tmp_dir = tempfile.mkdtemp(dir = "/tmp") + try: + build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"), + "-i", start_end_output_filename, + "-t", tmp_dir, + "-p", + "-s", s['smoothing_method'], + "-o", lm_filename] + print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline) + return_code = subprocess.check_call(build_lm_cmdline) + if return_code: + raise Exception("IRST language model failed to build: return code = [%d]" % return_code) + finally: + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + # Compile the LM + lm_filename = lm_filename + ".gz" + compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"), + "--text", "yes", + lm_filename, + compiled_lm_filename] + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(compile_lm_cmdline) + return_code = subprocess.check_call(compile_lm_cmdline) + if return_code: + raise Exception("IRST language model compilation failed: return code = [%d]" % return_code) + + output = {'add_start_end_filename': start_end_output_filename, + 'lm_filename': lm_filename, + 'compiled_lm_filename': compiled_lm_filename} + + print "IRSTLM Build: Output = %s" % output + + return output + + return process + + +if __name__ == '__main__': + from pypeline.helpers.helpers import eval_pipeline + + lm_dir = os.environ["PWD"] + configuration = {'irstlm_root': os.environ["IRSTLM"], + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': lm_dir} + component_config = configure(configuration) + component = initialise(component_config) + + value = eval_pipeline(component, + {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'}, + component_config) + target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'), + 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'), + 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')} + print "Target: %s" % target + if value != target: + raise Exception("Massive fail!") diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py new file mode 100755 index 000000000..2b60b1720 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['mert_working_dir'] = args['mert_working_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['development_data_filename']) + lm_file = os.path.abspath(a['trg_language_model_filename']) + lm_order = int(a['trg_language_model_order']) + lm_type = int(a['trg_language_model_type']) + orig_moses_ini = os.path.abspath(a['moses_ini_file']) + + if not os.path.exists(orig_moses_ini): + raise Exception, "Error: Input moses.ini does not exist" + + workdir = os.path.abspath(config['mert_working_dir']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + moses_install_dir = os.path.abspath(config['moses_installation_dir']) + mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl') + bin_dir = os.path.join(moses_install_dir, 'bin') + moses_bin = os.path.join(moses_install_dir, 'bin', 'moses') + src_file = infilename + '.' + config['src_lang'] + ref_file = infilename + '.' 
+ config['trg_lang'] + logfile = os.path.join(workdir, 'log') + #change lm configuration in moses ini + moses_ini = os.path.join(workdir, 'trained-moses.ini') + cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s" + cmd = cmd % locals() + os.system(cmd) + + #the command + cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + new_mosesini = os.path.join(workdir, 'moses.ini') + if not os.path.exists(new_mosesini): + raise Exception, 'Failed MERT' + + return {'moses_ini_file':new_mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.path.abspath('../../../../'), + 'mert_working_dir':'../../../../../tuning'} + values = {'development_data_filename':'../../../../../corpus/tune', + 'moses_ini_file':'../../../../../model/model/moses.ini', + 'trg_language_model_filename':'../../../../../corpus/train.lt.lm', + 'trg_language_model_type':9, + 'trg_language_model_order':4} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py new file mode 100755 index 000000000..e990307d2 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['external_bin_dir'] = args['giza_installation_dir'] + result['model_directory'] = args['translation_model_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['training_data_filename']) + workdir = os.path.abspath(config['model_directory']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl' + src_lang = config['src_lang'].lower() + trg_lang = config['trg_lang'].lower() + external_bin = os.path.abspath(config['external_bin_dir']) + #create a dummy lm file + dummy_lmfile = workdir + os.sep + 'dummy.lm' + f = open(dummy_lmfile, 'w') + print >> f, "dummy lm file" + f.close() + logfile = workdir + os.sep + 'log' + + #the command + cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f 
%(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s' + + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini' + if not os.path.exists(mosesini): + raise Exception, 'Failed training model' + + return {'moses_ini_file':mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.environ['MOSES_HOME'], + 'giza_installation_dir':os.environ['GIZA_HOME'], + 'translation_model_directory':'model-dir'} + values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py new file mode 100755 index 000000000..57f8771df --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['src_tokenisation_dir'] = args['src_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['src_filename'] + outfilename = Tokenizer.batch_tokenise( + config['src_lang'], + config['moses_installation_dir'], + infilename, + config['src_tokenisation_dir']) + return {'tokenised_src_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'de', + 'src_tokenisation_dir':'tmptok', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'src_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de new file mode 100644 index 000000000..c6b41edbe --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de @@ -0,0 +1,3 @@ +asdfweoih +awfwoeijf awefo +what's this diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py new file mode 100644 index 000000000..354ec1abc --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import sys, os, subprocess + +class Tokenizer: + + @staticmethod + def 
batch_tokenise(lang, mosesdir, infilename, workdir): + print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir) + if not os.path.exists(workdir): + os.makedirs(workdir) + tok = Tokenizer(lang, mosesdir) + basefilename = os.path.basename(infilename) + outfilename = workdir + os.sep + basefilename + '.tok' + tok.file_tokenise(infilename, outfilename) + return outfilename + + def __init__(self, lang, mosesdir): + self.arrows = None + self.lang = lang + #check the perl tokenizer is here + #path = os.path.dirname(os.path.abspath(__file__)) + path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer' + self.perltok = path + os.sep + 'tokenizer.perl' + if not os.path.exists(path): + raise Exception, "Perl tokenizer does not exists" + + def file_tokenise(self, infilename, outfilename): + cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename) + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + +if __name__ == '__main__': + #do some test + pass + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py new file mode 100755 index 000000000..3852e296f --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['trg_lang'] = args['trg_lang'] + result['trg_tokenisation_dir'] = args['trg_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['trg_filename'] + outfilename = Tokenizer.batch_tokenise( + config['trg_lang'], + config['moses_installation_dir'], + infilename, + config['trg_tokenisation_dir']) + return {'tokenised_trg_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'trg_lang':'de', + 'trg_tokenisation_dir':'tmptoktrg', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'trg_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index e135b8886..f551380fd 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/> - <builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> + <builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug"> <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input"> @@ -133,8 +133,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject index 7529a7799..fc08b4c3d 100644 --- a/contrib/other-builds/extractor/.cproject +++ b/contrib/other-builds/extractor/.cproject @@ -18,11 +18,14 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/> + <option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath"> + <listOptionValue builtIn="false" value=""${workspace_loc}/../../boost/include""/> + </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug"> @@ -119,5 +122,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/extractor"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/extractor"/> + </configuration> + </storageModule> + <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject index 2036e6b18..e3e47fd7e 100644 --- a/contrib/other-builds/lm/.cproject +++ b/contrib/other-builds/lm/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/> - <builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> + <builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug"> <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input"> @@ -131,7 +131,14 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/lm"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/lm"/> + </configuration> + </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project index e75388ac1..a1bde37c2 100644 --- a/contrib/other-builds/lm/.project +++ b/contrib/other-builds/lm/.project @@ -142,11 +142,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI> </link> <link> - <name>build_binary.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI> - </link> - <link> <name>clean.sh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/clean.sh</locationURI> @@ -177,11 +172,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI> </link> <link> - <name>fragment.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI> - </link> - <link> <name>left.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/left.hh</locationURI> @@ -212,11 +202,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI> </link> <link> - <name>max_order.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI> - </link> - <link> <name>max_order.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/max_order.hh</locationURI> @@ -242,11 +227,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI> </link> <link> - <name>ngram_query.cc</name> - 
<type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI> - </link> - <link> <name>ngram_query.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.hh</locationURI> diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject index 41a471cd1..e1c19b822 100644 --- a/contrib/other-builds/mert_lib/.cproject +++ b/contrib/other-builds/mert_lib/.cproject @@ -7,7 +7,7 @@ <externalSetting> <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/> <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/> - <entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/> + <entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/> </externalSetting> </externalSettings> <extensions> @@ -23,13 +23,14 @@ <folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug"> <targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/> - <builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/> + <builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug"> <option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> <option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/> <option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath"> <listOptionValue builtIn="false" value=""${workspace_loc}/../../""/> + <listOptionValue builtIn="false" value=""${workspace_loc}/../../boost/include""/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> @@ -45,11 +46,8 @@ </tool> </toolChain> </folderInfo> - <fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460"> - <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/> - </fileInfo> 
<sourceEntries> - <entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/> + <entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/> </sourceEntries> </configuration> </storageModule> @@ -61,7 +59,7 @@ <externalSetting> <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/> <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/> - <entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/> + <entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/> </externalSetting> </externalSettings> <extensions> @@ -119,5 +117,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/mert_lib"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/mert_lib"/> + </configuration> + </storageModule> + <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index fedda926b..71462b5df 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -19,7 +19,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -46,6 +46,7 @@ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug"> <option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths"> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib""/> + <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../irstlm/lib""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/macosx""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686-m64""/> @@ -70,9 +71,11 @@ <listOptionValue builtIn="false" value="lm"/> <listOptionValue builtIn="false" value="util"/> <listOptionValue builtIn="false" value="z"/> + <listOptionValue builtIn="false" value="boost_filesystem-mt"/> + <listOptionValue builtIn="false" value="boost_iostreams-mt"/> <listOptionValue builtIn="false" value="boost_system-mt"/> <listOptionValue builtIn="false" value="boost_thread-mt"/> - <listOptionValue builtIn="false" value="boost_filesystem-mt"/> + <listOptionValue builtIn="false" value="bz2"/> <listOptionValue builtIn="false" value="rt"/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input"> @@ -154,8 +157,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/> + </configuration> + <configuration configurationName="Debug"> + <resource 
resourceType="PROJECT" workspacePath="/moses-chart-cmd"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index 10b6784d4..42d2100d8 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -19,7 +19,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -46,6 +46,8 @@ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug"> <option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths"> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../irstlm/lib""/> + <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib""/> + <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/macosx""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686-m64""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686""/> @@ -69,8 +71,11 @@ <listOptionValue builtIn="false" value="z"/> <listOptionValue builtIn="false" value="boost_system-mt"/> <listOptionValue builtIn="false" value="boost_thread-mt"/> + <listOptionValue builtIn="false" value="boost_iostreams-mt"/> + <listOptionValue builtIn="false" value="boost_filesystem-mt"/> <listOptionValue builtIn="false" value="lm"/> <listOptionValue builtIn="false" value="util"/> + <listOptionValue builtIn="false" value="bz2"/> <listOptionValue builtIn="false" value="rt"/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.983725033" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input"> @@ -155,8 +160,13 @@ <autodiscovery enabled="true" 
problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/moses-cmd"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/moses-cmd"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/moses-cmd"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index e54a1385b..787024533 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -1,7 +1,5 @@ <?xml version="1.0" encoding="UTF-8" standalone="no"?> -<?fileVersion 4.0.0?> - -<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage"> +<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage"> <storageModule moduleId="org.eclipse.cdt.core.settings"> <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512"> <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug"> @@ -9,7 +7,7 @@ <externalSetting> <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/> <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/> - <entry flags="RESOLVED" kind="libraryFile" name="moses"/> + <entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/> </externalSetting> </externalSettings> <extensions> @@ -26,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -152,8 +150,14 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/moses"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/moses"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/moses"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> + <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/search/.cproject b/contrib/other-builds/search/.cproject index 9ccb8f8e9..2de36fecd 100644 --- a/contrib/other-builds/search/.cproject +++ b/contrib/other-builds/search/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -127,6 +127,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/search"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/search"/> + </configuration> + </storageModule> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/search/.project b/contrib/other-builds/search/.project index efad842ea..95f074aae 100644 --- a/contrib/other-builds/search/.project +++ b/contrib/other-builds/search/.project @@ -157,11 +157,6 @@ <locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI> </link> <link> - <name>vertex_generator.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI> - </link> - <link> <name>vertex_generator.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.hh</locationURI> diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject index ab37362a4..2fd4d2dfb 100644 --- a/contrib/other-builds/util/.cproject +++ b/contrib/other-builds/util/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/> - <builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> + <builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug"> <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input"> @@ -136,8 +136,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/util"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/util"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/util"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/rpm/README b/contrib/rpm/README new file mode 100644 index 000000000..8ba7ef4da --- /dev/null +++ b/contrib/rpm/README @@ -0,0 +1,42 @@ +Building Moses RPM +================== + +*** WARNING *** +Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer. +*** WARNING *** + + +Building the RPM SPEC file +-------------------------- + +The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information: + + - The Git repository from which an installer will be built, + - The branch in the Git repository to build, and + - The version of the installed Moses distribution. + +For example, to build the RELEASE-1.0 branch in the mosesdecode repository (git://github.com/moses-smt/mosesdecoder.git): + +$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELASE-1.0 -v 1.0 + +This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS. 
+ + +Building the RPM +---------------- + +Change directory to $HOME/rpmbuild and build the binary RPM with: + +$ rpmbuild -bb SPECS/moses.spec + +This will download IRSTLM v5.70.04 and GIZA++ v2, then build them along with Moses and write the RPM to $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm. + +For example, building v1.0 on a 64-bit Intel architecture, the RPM would be called moses-1.0-1.x86_64.rpm. + + +Building a Debian package +------------------------- + +The Alien tool converts RPM packages to Debian packages. If a Debian package is required, follow the instructions on this web page: + +https://help.ubuntu.com/community/RPM/AlienHowto diff --git a/contrib/rpm/build_source.sh b/contrib/rpm/build_source.sh new file mode 100755 index 000000000..d0fac6a33 --- /dev/null +++ b/contrib/rpm/build_source.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +BRANCH="master" +declare -i NO_RPM_BUILD=0 +declare -r RPM_VERSION_TAG="___RPM_VERSION__" + +function usage() { + echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]" + exit 1 +} + +if [ $# -lt 4 ]; then + usage +fi + +while getopts r:b:v:nh OPTION +do + case "$OPTION" in + r) REPO="${OPTARG}";; + b) BRANCH="${OPTARG}";; + v) VERSION="${OPTARG}";; + n) NO_RPM_BUILD=1;; + [h\?]) usage;; + esac +done + +if [ ! -d ./rpmbuild ]; then + echo "RPM build directory not in current working directory" + exit 1 +fi + +declare -r MOSES_DIR="moses-${VERSION}" +git clone ${REPO} ${MOSES_DIR} +if [ $? -ne 0 ]; then + echo "Failed to clone Git repository ${REPO}" + exit 3 +fi + +cd ${MOSES_DIR} + +git checkout ${BRANCH} +if [ $? -ne 0 ]; then + echo "Failed to checkout branch ${BRANCH}" + exit 3 +fi + +cd .. + +tar -cf moses-${VERSION}.tar ${MOSES_DIR} +gzip -f9 moses-${VERSION}.tar + +if [ ${NO_RPM_BUILD} -eq 0 ]; then + if [ ! -d ${HOME}/rpmbuild/SPECS ]; then + mkdir -p ${HOME}/rpmbuild/SPECS + fi + eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec + if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then + mkdir -p ${HOME}/rpmbuild/SOURCES + fi + mv moses-${VERSION}.tar.gz ${HOME}/rpmbuild/SOURCES +fi + +rm -Rf ${MOSES_DIR} diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec new file mode 100644 index 000000000..0f4a6c6ec --- /dev/null +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -0,0 +1,65 @@ +Name: moses +Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. +Version: ___RPM_VERSION__ +Release: 1 +URL: http://www.statmt.org/moses/ +Source0: %{name}-%{version}.tar.gz +License: LGPL +Group: Development/Tools +Vendor: Capita Translation and Interpreting +Packager: Ian Johnson <ian.johnson@capita-ti.com> +Requires: boost >= 1.48, python >= 2.6, perl >= 5 +BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release} +%description +Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm quickly finds the highest-probability translation among the exponential number of choices. 
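The %prep stage that follows fetches IRSTLM 5.70.04 and GIZA++ 1.0.7 from the moses-suite Google Code downloads and builds them under the RPM build root before Moses itself is compiled in %build. A minimal Python sketch of just the download-and-unpack step, using the same URLs as the spec; the fetch_and_unpack helper is an assumption for illustration:

import os
import tarfile
import urllib

SOURCES = {
    'irstlm-5.70.04.tgz': 'http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz',
    'giza-pp-v1.0.7.tgz': 'http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz',
}

def fetch_and_unpack(build_dir):
    # Mirror the wget and tar calls in %prep: download each tool tarball
    # into the build directory and extract it in place.
    for filename, url in SOURCES.items():
        path = os.path.join(build_dir, filename)
        urllib.urlretrieve(url, path)
        tarfile.open(path, 'r:gz').extractall(build_dir)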
+%prep +%setup -q + +mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 + +wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz +wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz + +cd $RPM_BUILD_DIR + +tar -zxf irstlm-5.70.04.tgz +tar -zxf giza-pp-v1.0.7.tgz + +cd irstlm-5.70.04 +bash regenerate-makefiles.sh --force +./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 +make +make install + +cd ../giza-pp +make +cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 +%build +./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2 +%install +mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts +cp -R bin $RPM_BUILD_ROOT/opt/moses +cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts +%clean +%files +%defattr(-,root,root) +/opt/moses/bin/* +/opt/moses/scripts/analysis/* +/opt/moses/scripts/ems/* +/opt/moses/scripts/generic/* +/opt/moses/scripts/other/* +/opt/moses/scripts/recaser/* +/opt/moses/scripts/regression-testing/* +/opt/moses/scripts/share/* +/opt/moses/scripts/tokenizer/* +/opt/moses/scripts/training/* +/opt/moses/irstlm-5.70.04/* +/opt/moses/giza++-v1.0.7/* diff --git a/contrib/server/client.py b/contrib/server/client.py new file mode 100755 index 000000000..43e77555a --- /dev/null +++ b/contrib/server/client.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# python port of client.perl + +import xmlrpclib +import datetime + +url = "http://localhost:8080/RPC2" +proxy = xmlrpclib.ServerProxy(url) + +text = u"il a souhaité que la présidence trace à nice le chemin pour l' avenir ." 
+params = {"text":text, "align":"true", "report-all-factors":"true"} + +result = proxy.translate(params) +print result['text'] +if 'align' in result: + print "Phrase alignments:" + aligns = result['align'] + for align in aligns: + print "%s,%s,%s" %(align['tgt-start'], align['src-start'], align['src-end']) diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 98024c891..5d9c40a9b 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -1,6 +1,8 @@ #include "util/check.hh" #include <stdexcept> #include <iostream> +#include <vector> +#include <algorithm> #include "moses/ChartManager.h" @@ -54,7 +56,7 @@ public: PhraseDictionaryDynSuffixArray* pdsa = (PhraseDictionaryDynSuffixArray*) pdf->GetDictionary(); cerr << "Inserting into address " << pdsa << endl; pdsa->insertSnt(source_, target_, alignment_); - if(add2ORLM_) { + if(add2ORLM_) { updateORLM(); } cerr << "Done inserting\n"; @@ -83,8 +85,8 @@ public: const std::string sBOS = orlm->GetSentenceStart()->GetString(); const std::string sEOS = orlm->GetSentenceEnd()->GetString(); Utils::splitToStr(target_, vl, " "); - // insert BOS and EOS - vl.insert(vl.begin(), sBOS); + // insert BOS and EOS + vl.insert(vl.begin(), sBOS); vl.insert(vl.end(), sEOS); for(int j=0; j < vl.size(); ++j) { int i = (j<ngOrder) ? 0 : j-ngOrder+1; @@ -177,7 +179,7 @@ public: map<string, xmlrpc_c::value> retData; if (staticData.IsChart()) { - TreeInput tinput; + TreeInput tinput; const vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder(); stringstream in(source + "\n"); @@ -260,10 +262,16 @@ public: } + + bool compareSearchGraphNode(const SearchGraphNode& a, const SearchGraphNode b) { + return a.hypo->GetId() < b.hypo->GetId(); + } + void insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData) { vector<xmlrpc_c::value> searchGraphXml; vector<SearchGraphNode> searchGraph; manager.GetSearchGraph(searchGraph); + std::sort(searchGraph.begin(), searchGraph.end()); for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin(); i != searchGraph.end(); ++i) { map<string, xmlrpc_c::value> searchGraphXmlNode; searchGraphXmlNode["forward"] = xmlrpc_c::value_double(i->forward); diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp index f06d2b430..6ab1a5657 100644 --- a/contrib/sigtest-filter/filter-pt.cpp +++ b/contrib/sigtest-filter/filter-pt.cpp @@ -287,24 +287,24 @@ SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicati if (hierarchical) { // std::cerr << "splitting up phrase: " << phrase << "\n"; int pos = 0; - int endPos = 0; + int NTStartPos, NTEndPos; vector<std::string> phrases; - - while (rule.find("[X][X] ", pos) < rule.size()) { - endPos = rule.find("[X][X] ",pos) - 1; // -1 to cut space before NT - if (endPos < pos) { // no space: NT at start of rule (or two consecutive NTs) - pos += 7; + while (rule.find("] ", pos) < rule.size()) { + NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT + NTEndPos = rule.find("] ",pos); + if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs) + pos = NTEndPos + 2; continue; } - phrases.push_back(rule.substr(pos,endPos-pos)); - pos = endPos + 8; + phrases.push_back(rule.substr(pos,NTStartPos-pos)); + pos = NTEndPos + 2; } - // cut LHS of rule - endPos = rule.size()-4; - if (endPos > pos) { - phrases.push_back(rule.substr(pos,endPos-pos)); + NTStartPos = rule.find("[",pos) - 1; // LHS of rule + if (NTStartPos > pos) { + 
phrases.push_back(rule.substr(pos,NTStartPos-pos)); } + sa_set = lookup_multiple_phrases(phrases, my_sa, rule, cache); + } else { diff --git a/contrib/tmcombine/README.md b/contrib/tmcombine/README.md index 2cbc83299..7b8ebd45e 100644 --- a/contrib/tmcombine/README.md +++ b/contrib/tmcombine/README.md @@ -58,7 +58,7 @@ Regression tests (check if the output files (`test/phrase-table_testN`) differ f FURTHER NOTES ------------- - - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models. + - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models. - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). Sort the tables with `LC_ALL=C`. Phrase tables produced by Moses are sorted correctly. diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py index 0bbcf7c78..5b65cc590 100755 --- a/contrib/tmcombine/tmcombine.py +++ b/contrib/tmcombine/tmcombine.py @@ -15,7 +15,7 @@ # Some general things to note: -# - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models. +# - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models. # - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). sort with LC_ALL=C. # - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007). # - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files. @@ -306,7 +306,7 @@ class Moses(): # assuming that alignment is empty elif len(line) == 4: if self.require_alignment: - sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment\n') + sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. 
Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n') exit() self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')] diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp index e610cbdd0..af3f26bf2 100644 --- a/mert/InterpolatedScorer.cpp +++ b/mert/InterpolatedScorer.cpp @@ -164,7 +164,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats { stringstream buff; string align = text; - string sentence = ""; + string sentence = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp index 09e06fcf6..b65873881 100644 --- a/moses-chart-cmd/IOWrapper.cpp +++ b/moses-chart-cmd/IOWrapper.cpp @@ -620,10 +620,27 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size) template <class T> void ShiftOffsets(vector<T> &offsets, T shift) { + T currPos = shift; for (size_t i = 0; i < offsets.size(); ++i) { - shift += offsets[i]; - offsets[i] += shift; + if (offsets[i] == 0) { + offsets[i] = currPos; + ++currPos; + } + else { + currPos += offsets[i]; + } + } +} + +size_t CalcSourceSize(const Moses::ChartHypothesis *hypo) +{ + size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered(); + const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos(); + for (size_t i = 0; i < prevHypos.size(); ++i) { + size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered(); + ret -= (childSize - 1); } + return ret; } size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget) @@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT const TargetPhrase &tp = hypo->GetCurrTargetPhrase(); - vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0); + size_t thisSourceSize = CalcSourceSize(hypo); + + // position of each terminal word in translation rule, irrespective of alignment + // if non-term, number is undefined + vector<size_t> sourceOffsets(thisSourceSize, 0); vector<size_t> targetOffsets(tp.GetSize(), 0); const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren(); @@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT const ChartTrellisNode &prevNode = *prevNodes[sourceInd]; - // 1st. calc source size + // calc source size size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered(); sourceOffsets[sourcePos] = sourceSize; - // 2nd. calc target size. Recursively look thru child hypos + // calc target size. + // Recursively look thru child hypos size_t currStartTarget = startTarget + totalTargetSize; size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget); targetOffsets[targetPos] = targetSize; @@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT } } - // 3rd. 
shift offsets + // convert position within translation rule to absolute position within + // source sentence / output sentence ShiftOffsets(sourceOffsets, startSource); ShiftOffsets(targetOffsets, startTarget); // get alignments from this hypo - vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered()); const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm(); - OutputAlignment(retAlignmentsS2T, aiTerm); // add to output arg, offsetting by source & target - for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) { - const set<size_t> &targets = retAlignmentsS2T[source]; - set<size_t>::const_iterator iter; - for (iter = targets.begin(); iter != targets.end(); ++iter) { - size_t target = *iter; - pair<size_t, size_t> alignPoint(source + sourceOffsets[source] - ,target + targetOffsets[target]); - pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); - CHECK(ret.second); - - } + AlignmentInfo::const_iterator iter; + for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) { + const std::pair<size_t,size_t> &align = *iter; + size_t relSource = align.first; + size_t relTarget = align.second; + size_t absSource = sourceOffsets[relSource]; + size_t absTarget = targetOffsets[relTarget]; + + pair<size_t, size_t> alignPoint(absSource, absTarget); + pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); + CHECK(ret.second); } return totalTargetSize; @@ -702,14 +723,16 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe { ostringstream out; - Alignments retAlign; - OutputAlignment(retAlign, hypo, 0); + if (hypo) { + Alignments retAlign; + OutputAlignment(retAlign, hypo, 0); - // output alignments - Alignments::const_iterator iter; - for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { - const pair<size_t, size_t> &alignPoint = *iter; - out << alignPoint.first << "-" << alignPoint.second << " "; + // output alignments + Alignments::const_iterator iter; + for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { + const pair<size_t, size_t> &alignPoint = *iter; + out << alignPoint.first << "-" << alignPoint.second << " "; + } } out << endl; @@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth const TargetPhrase &tp = hypo->GetCurrTargetPhrase(); - vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0); + size_t thisSourceSize = CalcSourceSize(hypo); + + // position of each terminal word in translation rule, irrespective of alignment + // if non-term, number is undefined + vector<size_t> sourceOffsets(thisSourceSize, 0); vector<size_t> targetOffsets(tp.GetSize(), 0); const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos(); @@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth const ChartHypothesis *prevHypo = prevHypos[sourceInd]; - // 1st. calc source size + // calc source size size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered(); sourceOffsets[sourcePos] = sourceSize; - // 2nd. calc target size. Recursively look thru child hypos + // calc target size. + // Recursively look thru child hypos size_t currStartTarget = startTarget + totalTargetSize; size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget); targetOffsets[targetPos] = targetSize; @@ -760,27 +788,27 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth } } - // 3rd. 
shift offsets + // convert position within translation rule to absolute position within + // source sentence / output sentence ShiftOffsets(sourceOffsets, startSource); ShiftOffsets(targetOffsets, startTarget); // get alignments from this hypo - vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered()); const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm(); - OutputAlignment(retAlignmentsS2T, aiTerm); // add to output arg, offsetting by source & target - for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) { - const set<size_t> &targets = retAlignmentsS2T[source]; - set<size_t>::const_iterator iter; - for (iter = targets.begin(); iter != targets.end(); ++iter) { - size_t target = *iter; - pair<size_t, size_t> alignPoint(source + sourceOffsets[source] - ,target + targetOffsets[target]); - pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); - CHECK(ret.second); + AlignmentInfo::const_iterator iter; + for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) { + const std::pair<size_t,size_t> &align = *iter; + size_t relSource = align.first; + size_t relTarget = align.second; + size_t absSource = sourceOffsets[relSource]; + size_t absTarget = targetOffsets[relTarget]; + + pair<size_t, size_t> alignPoint(absSource, absTarget); + pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); + CHECK(ret.second); - } } return totalTargetSize; diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index f11516839..335a570a6 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -262,6 +262,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges) out << std::endl; } +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo) +{ + std::vector<const Hypothesis *> edges; + const Hypothesis *currentHypo = hypo; + while (currentHypo) { + edges.push_back(currentHypo); + currentHypo = currentHypo->GetPrevHypo(); + } + + OutputAlignment(out, edges); + +} + void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges) { ostringstream out; diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 8f164dfb3..8dbdeda9c 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -137,7 +137,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path); - +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo); } diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile index 04f395a81..bddc10911 100644 --- a/moses-cmd/Jamfile +++ b/moses-cmd/Jamfile @@ -1,4 +1,4 @@ -alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ../moses//moses ; +alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ; exe moses : Main.cpp deps ; exe lmbrgrid : LatticeMBRGrid.cpp deps ; diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index ac4527aae..b08ba532a 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -23,6 +23,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Moses main, for single-threaded and multi-threaded. 
**/ +#include <boost/algorithm/string/predicate.hpp> +#include <boost/filesystem.hpp> +#include <boost/iostreams/device/file.hpp> +#include <boost/iostreams/filter/bzip2.hpp> +#include <boost/iostreams/filter/gzip.hpp> +#include <boost/iostreams/filtering_stream.hpp> + #include <exception> #include <fstream> #include <sstream> @@ -83,14 +90,18 @@ public: OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector, OutputCollector* detailedTranslationCollector, OutputCollector* alignmentInfoCollector, - OutputCollector* unknownsCollector) : + OutputCollector* unknownsCollector, + bool outputSearchGraphSLF, + bool outputSearchGraphHypergraph) : m_source(source), m_lineNumber(lineNumber), m_outputCollector(outputCollector), m_nbestCollector(nbestCollector), m_latticeSamplesCollector(latticeSamplesCollector), m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector), m_detailedTranslationCollector(detailedTranslationCollector), m_alignmentInfoCollector(alignmentInfoCollector), - m_unknownsCollector(unknownsCollector) {} + m_unknownsCollector(unknownsCollector), + m_outputSearchGraphSLF(outputSearchGraphSLF), + m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {} /** Translate one sentence * gets called by main function implemented at end of this source file */ @@ -143,6 +154,96 @@ public: #endif } + // Output search graph in HTK standard lattice format (SLF) + if (m_outputSearchGraphSLF) { + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf"; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + if (file->is_open() && file->good()) { + ostringstream out; + fix(out,PRECISION); + manager.OutputSearchGraphAsSLF(m_lineNumber, out); + *file << out.str(); + file -> flush(); + } else { + TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + } + } + + // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder + if (m_outputSearchGraphHypergraph) { + + vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph"); + + bool appendSuffix; + if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") { + appendSuffix = true; + } else { + appendSuffix = false; + } + + string compression; + if (hypergraphParameters.size() > 1) { + compression = hypergraphParameters[1]; + } else { + compression = "txt"; + } + + string hypergraphDir; + if ( hypergraphParameters.size() > 2 ) { + hypergraphDir = hypergraphParameters[2]; + } else { + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { + boost::filesystem::path nbestPath(nbestFile); + hypergraphDir = nbestPath.parent_path().filename().native(); + } else { + stringstream hypergraphDirName; + hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; + hypergraphDir = hypergraphDirName.str(); + } + } + + if ( ! boost::filesystem::exists(hypergraphDir) ) { + boost::filesystem::create_directory(hypergraphDir); + } + + if ( ! boost::filesystem::exists(hypergraphDir) ) { + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl); + } else if ( ! 
boost::filesystem::is_directory(hypergraphDir) ) { + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl); + } else { + stringstream fileName; + fileName << hypergraphDir << "/" << m_lineNumber; + if ( appendSuffix ) { + fileName << "." << compression; + } + boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream; + + if ( compression == "gz" ) { + file->push( boost::iostreams::gzip_compressor() ); + } else if ( compression == "bz2" ) { + file->push( boost::iostreams::bzip2_compressor() ); + } else if ( compression != "txt" ) { + TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl); + compression = "txt"; + } + + file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) ); + + if (file->is_complete() && file->good()) { + fix(*file,PRECISION); + manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file); + file -> flush(); + } else { + TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl); + } + file -> pop(); + delete file; + } + } + // apply decision rule and output best translation(s) if (m_outputCollector) { ostringstream out; @@ -157,7 +258,7 @@ public: // MAP decoding: best hypothesis const Hypothesis* bestHypo = NULL; if (!staticData.UseMBR()) - { + { bestHypo = manager.GetBestHypothesis(); if (bestHypo) { if (staticData.IsPathRecoveryEnabled()) { @@ -174,13 +275,18 @@ public: staticData.GetOutputFactorOrder(), staticData.GetReportSegmentation(), staticData.GetReportAllFactors()); + if (staticData.PrintAlignmentInfo()) { + out << "||| "; + OutputAlignment(out, bestHypo); + } + OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo); IFVERBOSE(1) { debug << "BEST TRANSLATION: " << *bestHypo << endl; } } out << endl; - } + } // MBR decoding (n-best MBR, lattice MBR, consensus) else @@ -311,6 +417,8 @@ private: OutputCollector* m_detailedTranslationCollector; OutputCollector* m_alignmentInfoCollector; OutputCollector* m_unknownsCollector; + bool m_outputSearchGraphSLF; + bool m_outputSearchGraphHypergraph; std::ofstream *m_alignmentStream; @@ -323,7 +431,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff) vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); for (size_t i = 0; i < numScoreComps; ++i) cout << ff->GetScoreProducerDescription() << " " - << ff->GetScoreProducerWeightShortName() << " " + << ff->GetScoreProducerWeightShortName(i) << " " << values[i] << endl; } else { @@ -367,6 +475,63 @@ static void ShowWeights() } +size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) +{ + size_t numScoreComps = ff->GetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << i + << "=" << values[i] << endl; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << "=" << values[0] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl; + assert(false); + return 0; + } +} + +void 
OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream); + } + +} + + } //namespace /** main function of the command line version of the decoder **/ @@ -391,20 +556,20 @@ int main(int argc, char** argv) // load all the settings into the Parameter class // (stores them as strings, or array of strings) - Parameter* params = new Parameter(); - if (!params->LoadParam(argc,argv)) { + Parameter params; + if (!params.LoadParam(argc,argv)) { exit(1); } // initialize all "global" variables, which are stored in StaticData // note: this also loads models such as the language model, etc. - if (!StaticData::LoadDataStatic(params, argv[0])) { + if (!StaticData::LoadDataStatic(&params, argv[0])) { exit(1); } // setting "-show-weights" -> just dump out weights and exit - if (params->isParamSpecified("show-weights")) { + if (params.isParamSpecified("show-weights")) { ShowWeights(); exit(0); } @@ -430,6 +595,32 @@ int main(int argc, char** argv) TRACE_ERR(weights); TRACE_ERR("\n"); } + if (staticData.GetOutputSearchGraphHypergraph()) { + ofstream* weightsOut = new std::ofstream; + stringstream weightsFilename; + if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) { + weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3]; + } else { + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { + boost::filesystem::path nbestPath(nbestFile); + weightsFilename << nbestPath.parent_path().filename() << "/weights"; + } else { + weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights"; + } + } + boost::filesystem::path weightsFilePath(weightsFilename.str()); + if ( !
boost::filesystem::exists(weightsFilePath.parent_path()) ) { + boost::filesystem::create_directory(weightsFilePath.parent_path()); + } + TRACE_ERR("The weights file is " << weightsFilename.str() << "\n"); + weightsOut->open(weightsFilename.str().c_str()); + OutputFeatureWeightsForHypergraph(*weightsOut); + weightsOut->flush(); + weightsOut->close(); + delete weightsOut; + } + // initialize output streams // note: we can't just write to STDOUT or files @@ -533,7 +724,9 @@ int main(int argc, char** argv) searchGraphCollector.get(), detailedTranslationCollector.get(), alignmentInfoCollector.get(), - unknownsCollector.get() ); + unknownsCollector.get(), + staticData.GetOutputSearchGraphSLF(), + staticData.GetOutputSearchGraphHypergraph()); // execute task #ifdef WITH_THREADS pool.Submit(task); @@ -551,6 +744,8 @@ int main(int argc, char** argv) pool.Stop(true); //flush remaining jobs #endif + delete ioWrapper; + } catch (const std::exception &e) { std::cerr << "Exception: " << e.what() << std::endl; return EXIT_FAILURE; diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp index 5daba9ba1..53b83d8cd 100644 --- a/moses/AlignmentInfoCollection.cpp +++ b/moses/AlignmentInfoCollection.cpp @@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection() m_emptyAlignmentInfo = Add(pairs); } +AlignmentInfoCollection::~AlignmentInfoCollection() +{} + const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const { return *m_emptyAlignmentInfo; diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h index 9c7f75e13..de0949f8f 100644 --- a/moses/AlignmentInfoCollection.h +++ b/moses/AlignmentInfoCollection.h @@ -55,6 +55,7 @@ class AlignmentInfoCollection //! Only a single static variable should be created. 
AlignmentInfoCollection(); + ~AlignmentInfoCollection(); static AlignmentInfoCollection s_instance; diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index 506193d5b..5bd3a4e2b 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList() */ const StaticData &staticData = StaticData::Instance(); size_t nBestSize = staticData.GetNBestSize(); - bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ; + bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ; if (!distinctNBest && m_arcList->size() > nBestSize * 5) { // prune arc list only if there too many arcs diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp index 3418aefe2..c061d0fed 100644 --- a/moses/LM/SingleFactor.cpp +++ b/moses/LM/SingleFactor.cpp @@ -36,8 +36,9 @@ using namespace std; namespace Moses { -LanguageModelSingleFactor::~LanguageModelSingleFactor() {} - +LanguageModelSingleFactor::~LanguageModelSingleFactor() +{ +} struct PointerState : public FFState { const void* lmstate; @@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState() m_beginSentenceState = new PointerState(NULL); } -LanguageModelPointerState::~LanguageModelPointerState() {} +LanguageModelPointerState::~LanguageModelPointerState() +{ + delete m_nullContextState; + delete m_beginSentenceState; +} const FFState *LanguageModelPointerState::GetNullContextState() const { diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 468db0de3..2ca689bb0 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #endif #include <algorithm> -#include <limits> #include <cmath> +#include <limits> +#include <map> +#include <set> #include "Manager.h" #include "TypeDef.h" #include "Util.h" @@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "rule.pb.h" #endif +#include "util/exception.hh" + using namespace std; namespace Moses { Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system) - :m_lineNumber(lineNumber) - ,m_system(system) + :m_system(system) ,m_transOptColl(source.CreateTranslationOptionCollection(system)) ,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl)) ,interrupted_flag(0) ,m_hypoId(0) + ,m_lineNumber(lineNumber) ,m_source(source) { m_system->InitializeBeforeSentenceProcessing(source); @@ -628,6 +632,435 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const } +void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i 
< slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream); + } + +} + +void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + // outputSearchGraphStream << endl; + // outputSearchGraphStream << (*hypo) << endl; + // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); + // outputSearchGraphStream << scoreCollection << endl; + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream); + } + +} + +void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + 
slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream); + } + +} + + +size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + size_t numScoreComps = ff->GetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "# " << ff->GetScoreProducerDescription() + << " " << ff->GetScoreProducerWeightShortName() + << " " << (i+1) << " of " << numScoreComps << endl + << "x" << (index+i) << "scale=" << values[i] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + assert(false); + return 0; + } +} + +size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + + // { const FeatureFunction* sp = ff; + // const FVector& m_scores = scoreCollection.GetScoresVector(); + // FVector& scores = const_cast<FVector&>(m_scores); + // std::string prefix = sp->GetScoreProducerDescription() + FName::SEP; + // // std::cout << "prefix==" << prefix << endl; + // // cout << "m_scores==" << m_scores << endl; + // // cout << "m_scores.size()==" << m_scores.size() << endl; + // // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl; + // // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl; + + + // // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) { + // // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl; + // // } + // for(int i=0, n=v.size(); i<n; i+=1) { + // // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl; + + // } + // } + + // FVector featureValues = scoreCollection.GetVectorForProducer(ff); + // outputSearchGraphStream << featureValues << endl; + const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); + + vector<float> featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size();//featureValues.coreSize(); + // if (numScoreComps != ScoreProducer::unlimited) { + // vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 
0.0 : featureValues[i]) << " "; + } + return index+numScoreComps; + // } else { + // cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + // assert(false); + // return 0; + // } +} + +size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); + const Hypothesis *prevHypo = hypo->GetPrevHypo(); + if (prevHypo) { + scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() ); + } + vector<float> featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size(); + + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " "; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " "; + } + + return index+numScoreComps; +} + +/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */ +void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const +{ + + VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << translationId << std::endl) + + vector<SearchGraphNode> searchGraph; + GetSearchGraph(searchGraph); + + + map<int,int> mosesIDToHypergraphID; + // map<int,int> hypergraphIDToMosesID; + set<int> terminalNodes; + multimap<int,int> hypergraphIDToArcs; + + VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << translationId << std::endl) + + long numNodes = 0; + long endNode = 0; + { + long hypergraphHypothesisID = 0; + for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) { + + // Get an id number for the previous hypothesis + const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo(); + if (prevHypo!=NULL) { + int mosesPrevHypothesisID = prevHypo->GetId(); + if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) { + mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID; + hypergraphHypothesisID += 1; + } + } + + // Get an id number for this hypothesis + int mosesHypothesisID; + if (searchGraph[arcNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[arcNumber].hypo->GetId(); + } + + if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) { + + mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + // Final arc to end node, representing the end of the sentence </s> + terminalNodes.insert(hypergraphHypothesisID); + } + + hypergraphHypothesisID += 1; + } + + // Record that this arc ends at this node + hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber)); + + } + + // Unique end node + endNode = hypergraphHypothesisID; + // mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID; + numNodes = endNode + 1; + + } + + + long numArcs = searchGraph.size() + terminalNodes.size(); + + // Print number of nodes and arcs + outputSearchGraphStream << numNodes << " " << numArcs << endl; + + 
VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId + << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl) + + VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl) + + + for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) { + if (hypergraphHypothesisID % 100000 == 0) { + VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << translationId << std::endl); + } + // int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID]; + size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID); + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has " << count << " incoming arcs" << std::endl) + if (count > 0) { + outputSearchGraphStream << count << "\n"; + + pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range = + hypergraphIDToArcs.equal_range(hypergraphHypothesisID); + for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) { + int lineNumber = (*it).second; + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + int mosesHypothesisID;// = thisHypo->GetId(); + if (searchGraph[lineNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[lineNumber].hypo->GetId(); + } + // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID]; + UTIL_THROW_IF( + (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]), + util::Exception, + "Error while writing search lattice as hypergraph for sentence " << translationId << ". " << + "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID << + ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] << + ". There are " << numNodes << " nodes in the search lattice." + ); + + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo==NULL) { + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl) + outputSearchGraphStream << "<s> ||| \n"; + } else { + int startNode = mosesIDToHypergraphID[prevHypo->GetId()]; + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl) + UTIL_THROW_IF( + (startNode >= hypergraphHypothesisID), + util::Exception, + "Error while writing search lattice as hypergraph for sentence" << translationId << ". " << + "The nodes must be output in topological order. The code attempted to violate this restriction." + ); + + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + outputSearchGraphStream << "[" << startNode << "]"; + for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) { + outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex); + } + outputSearchGraphStream << " ||| "; + OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream); + outputSearchGraphStream << "\n"; + } + } + } + } + + // Print node and arc(s) for end of sentence </s> + outputSearchGraphStream << terminalNodes.size() << "\n"; + for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) { + outputSearchGraphStream << "[" << (*it) << "] </s> ||| \n"; + } + +} + + +/**! 
Output search graph in HTK standard lattice format (SLF) */ +void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const +{ + + vector<SearchGraphNode> searchGraph; + GetSearchGraph(searchGraph); + + long numArcs = 0; + long numNodes = 0; + + map<int,int> nodes; + set<int> terminalNodes; + + // Unique start node + nodes[0] = 0; + + for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { + + int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize(); + numArcs += targetWordCount; + + int hypothesisID = searchGraph[arcNumber].hypo->GetId(); + if (nodes.count(hypothesisID) == 0) { + + numNodes += targetWordCount; + nodes[hypothesisID] = numNodes; + //numNodes += 1; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + numArcs += 1; + } + } + + } + numNodes += 1; + + // Unique end node + nodes[numNodes] = numNodes; + + outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl; + outputSearchGraphStream << "VERSION=1.1" << endl; + outputSearchGraphStream << "base=2.71828182845905" << endl; + outputSearchGraphStream << "NODES=" << (numNodes+1) << endl; + outputSearchGraphStream << "LINKS=" << numArcs << endl; + + OutputFeatureWeightsForSLF(outputSearchGraphStream); + + for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) { + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo) { + + int startNode = nodes[prevHypo->GetId()]; + int endNode = nodes[thisHypo->GetId()]; + bool terminalNode = (searchGraph[lineNumber].forward == -1); + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) { + int x = (targetWordCount-targetWordIndex); + + outputSearchGraphStream << "J=" << arcNumber; + + if (targetWordIndex==0) { + outputSearchGraphStream << " S=" << startNode; + } else { + outputSearchGraphStream << " S=" << endNode - x; + } + + outputSearchGraphStream << " E=" << endNode - (x-1) + << " W=" << targetPhrase.GetWord(targetWordIndex); + + OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream); + + outputSearchGraphStream << endl; + + arcNumber += 1; + } + + if (terminalNode && terminalNodes.count(endNode) == 0) { + terminalNodes.insert(endNode); + outputSearchGraphStream << "J=" << arcNumber + << " S=" << endNode + << " E=" << numNodes + << endl; + arcNumber += 1; + } + } + } + +} + void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream, const SearchGraphNode& searchNode) { diff --git a/moses/Manager.h b/moses/Manager.h index dd011bc84..11762ec37 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -56,6 +56,10 @@ struct SearchGraphNode { hypo(theHypo), recombinationHypo(theRecombinationHypo), forward(theForward), fscore(theFscore) {} + bool operator<(const SearchGraphNode& sgn) const { + return this->hypo->GetId() < sgn.hypo->GetId(); + } + }; /** The Manager class implements a stack decoding algorithm for phrase-based decoding @@ -93,6 +97,19 @@ class Manager Manager(Manager const&); void operator=(Manager const&); const TranslationSystem* m_system; +private: + + // Helper functions to output search graph in HTK standard lattice format + void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureWeightsForSLF(size_t 
index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + + // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder + void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + + protected: // data // InputType const& m_source; /**< source sentence to be translated */ @@ -103,6 +120,7 @@ protected: size_t interrupted_flag; std::auto_ptr<SentenceStats> m_sentenceStats; int m_hypoId; //used to number the hypos as they are created. + size_t m_lineNumber; void GetConnectedGraph( std::map< int, bool >* pConnected, @@ -113,7 +131,6 @@ protected: public: - size_t m_lineNumber; InputType const& m_source; /**< source sentence to be translated */ Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system); ~Manager(); @@ -137,6 +154,8 @@ public: #endif void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const; + void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const; + void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const; void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const; const InputType& GetSource() const { return m_source; diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h index 25131b98a..5680b8ecb 100644 --- a/moses/PDTAimp.h +++ b/moses/PDTAimp.h @@ -11,6 +11,7 @@ #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" #include "SparsePhraseDictionaryFeature.h" #include "Util.h" +#include "util/tokenize_piece.hh" namespace Moses { @@ -284,11 +285,10 @@ protected: FactorCollection &factorCollection = FactorCollection::Instance(); for(size_t k=0; k<factorStrings.size(); ++k) { - std::vector<std::string> factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter()); - CHECK(factors.size()==m_output.size()); + util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter()); Word& w=targetPhrase.AddWord(); - for(size_t l=0; l<m_output.size(); ++l) { - w[m_output[l]]= factorCollection.AddFactor(Output, m_output[l], factors[l]); + for(size_t l=0; l<m_output.size(); ++l, ++word) { + w[m_output[l]]= factorCollection.AddFactor(*word); } } diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 103277d34..6a9745ade 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -107,6 +107,7 @@ Parameter::Parameter() AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation"); AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); AddParam("distortion", "configurations for each factorized/lexicalized reordering model."); + AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. 
Default is no"); AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'"); AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" ); AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation"); @@ -130,6 +131,8 @@ Parameter::Parameter() AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename"); AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format"); AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses"); + AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)"); + AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)"); AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)"); #ifdef HAVE_PROTOBUF AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path."); @@ -177,6 +180,7 @@ Parameter::Parameter() AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory"); AddParam("minphr-memory", "Load phrase table in minphr format into memory"); + AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false"); AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false"); AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. 
Default is false"); AddParam("alignment-output-file", "print output word alignments into given file"); diff --git a/moses/SourceWordDeletionFeature.cpp b/moses/SourceWordDeletionFeature.cpp index c5a61111f..c312a3b03 100644 --- a/moses/SourceWordDeletionFeature.cpp +++ b/moses/SourceWordDeletionFeature.cpp @@ -55,12 +55,7 @@ void SourceWordDeletionFeature::ComputeFeatures(const TargetPhrase& targetPhrase // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize(); - if (targetLength == 1 && sourceLength == 1) { - const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1); - if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) { - return; - } - } + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index df05b64d3..449187da7 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter) } } - if(m_parameter->GetParam("sort-word-alignment").size()) { - m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]); - } - // factor delimiter if (m_parameter->GetParam("factor-delimiter").size() > 0) { m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0]; @@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter) SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false ); //word-to-word alignment + // alignments + SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false ); + if (m_PrintAlignmentInfo) { + m_needAlignmentInfo = true; + } + + if(m_parameter->GetParam("sort-word-alignment").size()) { + m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]); + } + SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false ); if (m_PrintAlignmentInfoNbest) { m_needAlignmentInfo = true; @@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter) } m_outputSearchGraph = true; m_outputSearchGraphExtended = true; - } else + } else { m_outputSearchGraph = false; + } + if (m_parameter->GetParam("output-search-graph-slf").size() > 0) { + m_outputSearchGraphSLF = true; + } else { + m_outputSearchGraphSLF = false; + } + if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) { + m_outputSearchGraphHypergraph = true; + } else { + m_outputSearchGraphHypergraph = false; + } #ifdef HAVE_PROTOBUF if (m_parameter->GetParam("output-search-graph-pb").size() > 0) { if (m_parameter->GetParam("output-search-graph-pb").size() != 1) { diff --git a/moses/StaticData.h b/moses/StaticData.h index 448f1a4e7..20d36e4b8 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -171,6 +171,7 @@ protected: bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; bool m_onlyDistinctNBest; + bool m_PrintAlignmentInfo; bool m_needAlignmentInfo; bool m_PrintAlignmentInfoNbest; @@ -216,6 +217,8 @@ protected: bool m_outputWordGraph; //! whether to output word graph bool m_outputSearchGraph; //! whether to output search graph bool m_outputSearchGraphExtended; //! ... in extended format + bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF) + bool m_outputSearchGraphHypergraph; //! 
whether to output search graph in hypergraph #ifdef HAVE_PROTOBUF bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf #endif @@ -458,7 +461,7 @@ public: return m_nBestFilePath; } bool IsNBestEnabled() const { - return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() + return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif @@ -631,6 +634,12 @@ public: bool GetOutputSearchGraphExtended() const { return m_outputSearchGraphExtended; } + bool GetOutputSearchGraphSLF() const { + return m_outputSearchGraphSLF; + } + bool GetOutputSearchGraphHypergraph() const { + return m_outputSearchGraphHypergraph; + } #ifdef HAVE_PROTOBUF bool GetOutputSearchGraphPB() const { return m_outputSearchGraphPB; @@ -722,6 +731,9 @@ public: const std::string &GetAlignmentOutputFile() const { return m_alignmentOutputFile; } + bool PrintAlignmentInfo() const { + return m_PrintAlignmentInfo; + } bool PrintAlignmentInfoInNbest() const { return m_PrintAlignmentInfoNbest; } diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index b1d99ab50..6f14657a3 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -326,8 +326,10 @@ TO_STRING_BODY(TargetPhrase); std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp) { - os << static_cast<const Phrase&>(tp) << ":" << tp.GetAlignNonTerm(); - os << ": c=" << tp.m_fullScore; + os << static_cast<const Phrase&>(tp) << ":" << flush; + os << tp.GetAlignNonTerm() << flush; + os << ": c=" << tp.m_fullScore << flush; + os << " " << tp.m_scoreBreakdown << flush; return os; } diff --git a/moses/TargetWordInsertionFeature.cpp b/moses/TargetWordInsertionFeature.cpp index 537c5c9cb..3b9bf36ba 100644 --- a/moses/TargetWordInsertionFeature.cpp +++ b/moses/TargetWordInsertionFeature.cpp @@ -56,12 +56,7 @@ void TargetWordInsertionFeature::ComputeFeatures(const TargetPhrase& targetPhras // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize(); - if (targetLength == 1 && sourceLength == 1) { - const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1); - if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) { - return; - } - } + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp index 515d2f649..675656112 100644 --- a/moses/TranslationModel/PhraseDictionaryTree.cpp +++ b/moses/TranslationModel/PhraseDictionaryTree.cpp @@ -156,22 +156,6 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const typedef LVoc<std::string> WordVoc; -static WordVoc* ReadVoc(const std::string& filename) -{ - static std::map<std::string,WordVoc*> vocs; -#ifdef WITH_THREADS - boost::mutex mutex; - boost::mutex::scoped_lock lock(mutex); -#endif - std::map<std::string,WordVoc*>::iterator vi = vocs.find(filename); - if (vi == vocs.end()) { - WordVoc* voc = new WordVoc(); - voc->Read(filename); - vocs[filename] = voc; - } - return vocs[filename]; -} - class PDTimp { public: @@ -184,8 +168,8 @@ public: std::vector<OFF_T> srcOffsets; FILE *os,*ot; - WordVoc* sv; - WordVoc* 
tv; + WordVoc sv; + WordVoc tv; ObjectPool<PPimp> pPool; // a comparison with the Boost MemPools might be useful @@ -269,12 +253,12 @@ public: rv.back().tokens.reserve(iphrase.size()); for(size_t j=0; j<iphrase.size(); ++j) { - rv.back().tokens.push_back(&tv->symbol(iphrase[j])); + rv.back().tokens.push_back(&tv.symbol(iphrase[j])); } rv.back().scores = i->GetScores(); const IPhrase& fnames = i->GetFeatureNames(); for (size_t j = 0; j < fnames.size(); ++j) { - rv.back().fnames.push_back(&tv->symbol(fnames[j])); + rv.back().fnames.push_back(&tv.symbol(fnames[j])); } rv.back().fvalues = i->GetFeatureValues(); if (wa) wa->push_back(i->GetAlignment()); @@ -289,7 +273,7 @@ public: CHECK(p); if(w.empty() || w==EPSILON) return p; - LabelId wi=sv->index(w); + LabelId wi=sv.index(w); if(wi==InvalidLabelId) return PPtr(); // unknown word else if(p.imp->isRoot()) { @@ -304,6 +288,8 @@ public: return PPtr(); } + + WordVoc* ReadVoc(const std::string& filename); }; @@ -350,10 +336,8 @@ int PDTimp::Read(const std::string& fn) for(size_t i=0; i<data.size(); ++i) data[i]=CPT(os,srcOffsets[i]); - sv = ReadVoc(ifsv); - tv = ReadVoc(iftv); - //sv.Read(ifsv); - //tv.Read(iftv); + sv.Read(ifsv); + tv.Read(iftv); TRACE_ERR("binary phrasefile loaded, default OFF_T: "<<PTF::getDefault() <<"\n"); @@ -370,7 +354,7 @@ void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const const IPhrase& iphr=tcand[i].GetPhrase(); out << i << " -- " << sc << " -- "; - for(size_t j=0; j<iphr.size(); ++j) out << tv->symbol(iphr[j])<<" "; + for(size_t j=0; j<iphr.size(); ++j) out << tv.symbol(iphr[j])<<" "; out<< " -- " << trgAlign; out << std::endl; } @@ -423,7 +407,7 @@ GetTargetCandidates(const std::vector<std::string>& src, { IPhrase f(src.size()); for(size_t i=0; i<src.size(); ++i) { - f[i]=imp->sv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) return; } @@ -439,7 +423,7 @@ GetTargetCandidates(const std::vector<std::string>& src, { IPhrase f(src.size()); for(size_t i=0; i<src.size(); ++i) { - f[i]=imp->sv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) return; } @@ -455,7 +439,7 @@ PrintTargetCandidates(const std::vector<std::string>& src, { IPhrase f(src.size()); for(size_t i=0; i<src.size(); ++i) { - f[i]=imp->sv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) { TRACE_ERR("the source phrase '"<<src<<"' contains an unknown word '" <<src[i]<<"'\n"); @@ -497,8 +481,6 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) std::vector<OFF_T> vo; size_t lnc=0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info - imp->sv = new WordVoc(); - imp->tv = new WordVoc(); size_t missingAlignmentCount = 0; while(getline(inFile, line)) { @@ -532,11 +514,11 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) std::vector<std::string> wordVec = Tokenize(sourcePhraseString); for (size_t i = 0 ; i < wordVec.size() ; ++i) - f.push_back(imp->sv->add(wordVec[i])); + f.push_back(imp->sv.add(wordVec[i])); wordVec = Tokenize(targetPhraseString); for (size_t i = 0 ; i < wordVec.size() ; ++i) - e.push_back(imp->tv->add(wordVec[i])); + e.push_back(imp->tv.add(wordVec[i])); // while(is>>w && w!="|||") sc.push_back(atof(w.c_str())); // Mauro: to handle 0 probs in phrase tables @@ -576,7 +558,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) abort(); } for (size_t i = 0; i < sparseTokens.size(); i+=2) { - 
fnames.push_back(imp->tv->add(sparseTokens[i])); + fnames.push_back(imp->tv.add(sparseTokens[i])); fvalues.push_back(Scan<FValue>(sparseTokens[i+1])); } } @@ -663,8 +645,8 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) fWriteVector(oi,vo); fClose(oi); - imp->sv->Write(ofsv); - imp->tv->Write(oftv); + imp->sv.Write(ofsv); + imp->tv.Write(oftv); return 1; } diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index c680d7245..065368ca7 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -552,7 +552,9 @@ namespace tmmt bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const { +#ifdef WITH_THREADS boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock); +#endif map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key ); if (lookup != m_lsed.end()) { value = lookup->second; @@ -564,7 +566,9 @@ namespace tmmt void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value) { +#ifdef WITH_THREADS boost::unique_lock<boost::shared_mutex> lock(m_accessLock); +#endif m_lsed[ key ] = value; } diff --git a/moses/Util.cpp b/moses/Util.cpp index 98de1241e..495e05124 100644 --- a/moses/Util.cpp +++ b/moses/Util.cpp @@ -35,6 +35,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "TypeDef.h" #include "Util.h" #include "Timer.h" +#include "util/exception.hh" #include "util/file.hh" using namespace std; @@ -65,6 +66,8 @@ const std::string ToLower(const std::string& str) return lc; } +class BoolValueException : public util::Exception {}; + template<> bool Scan<bool>(const std::string &input) { @@ -73,8 +76,7 @@ bool Scan<bool>(const std::string &input) return true; if (lc == "no" || lc == "n" || lc =="false" || lc == "0") return false; - TRACE_ERR( "Scan<bool>: didn't understand '" << lc << "', returning false" << std::endl); - return false; + UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. 
After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0."); } bool FileExists(const std::string& filePath) diff --git a/moses/Word.cpp b/moses/Word.cpp index c23e8de8c..2c1ac09ea 100644 --- a/moses/Word.cpp +++ b/moses/Word.cpp @@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "Word.h" #include "TypeDef.h" #include "StaticData.h" // needed to determine the FactorDelimiter +#include "util/exception.hh" #include "util/tokenize_piece.hh" using namespace std; @@ -95,6 +96,8 @@ std::string Word::GetString(FactorType factorType) const return NULL; } +class StrayFactorException : public util::Exception {}; + void Word::CreateFromString(FactorDirection direction , const std::vector<FactorType> &factorOrder , const StringPiece &str @@ -106,7 +109,7 @@ void Word::CreateFromString(FactorDirection direction for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) { m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit); } - CHECK(!fit); + UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times."); // assume term/non-term same for all factors m_isNonTerminal = isNonTerminal; diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 70de9678b..fd33907de 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC if (kneserNeyFlag) { float D = kneserNey_D3; if (countEF < 2) D = kneserNey_D1; - if (countEF < 3) D = kneserNey_D2; + else if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 92c8a470e..cab91e92d 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) { if (m_options.isOrientationFlag()) outextractstrOrientation << orientationInfo; + if (m_options.isIncludeSentenceIdFlag()) { + outextractstr << " ||| " << sentence.sentenceID; + } + if (m_options.getInstanceWeightsFile().length()) { if (m_options.isTranslationFlag()) { outextractstr << " ||| " << sentence.weightString; @@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) { } } - if (m_options.isIncludeSentenceIdFlag()) { - outextractstr << " ||| " << sentence.sentenceID; - } if (m_options.isTranslationFlag()) outextractstr << "\n"; if (m_options.isTranslationFlag()) outextractstrInv << "\n"; diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 214569206..769fc0ebf 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -1000,6 +1000,7 @@ lowercase-reference out: reference default-name: evaluation/reference pass-unless: output-lowercaser + pass-if: recaser multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-lowercaser < IN > OUT nist-bleu diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index 29962ca71..a2f9580a9 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -745,7 +745,8 @@ sub hierarchical_segmentation { open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!"; 
open(NODE,">$dir/node") or die "Cannot open: $!"; while(<TRACE>) { - /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_"); + /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || + /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_"); my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7); if ($last_sentence >= 0 && $sentence != $last_sentence) { &hs_process($last_sentence,\@DERIVATION,\%STATS); @@ -1137,9 +1138,17 @@ sub process_search_graph { `mkdir -p $dir/search-graph`; my $last_sentence = -1; while(<OSG>) { - /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</ || die("ERROR: buggy search graph line: $_"); - my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) - = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); + my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score); + if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</) { + ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); + } + elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) { + ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); + $heuristic_rule_score = $rule_score; # hmmmm.... 
+ } + else { + die("ERROR: buggy search graph line: $_"); + } chop($alignment) if $alignment; chop($children) if $children; $recomb = 0 unless $recomb; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index e941aa95b..4ef6a1de6 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -13,10 +13,10 @@ chomp(@OUT); while(<SRC>) { chomp; if (/^<srcset/) { - s/<srcset/<tstset trglang="$language"/; + s/<srcset/<tstset trglang="$language"/i; } elsif (/^<\/srcset/) { - s/<\/srcset/<\/tstset/; + s/<\/srcset/<\/tstset/i; } elsif (/^<doc/i) { s/ *sysid="[^\"]+"//; @@ -26,10 +26,10 @@ while(<SRC>) { my $line = shift(@OUT); $line = "" if $line =~ /NO BEST TRANSLATION/; if (/<\/seg>/) { - s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/; + s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i; } else { - s/(<seg[^>]+> *)[^<]*/$1$line/; + s/(<seg[^>]+> *)[^<]*/$1$line/i; } } print $_."\n"; diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index 8f82ab8d9..beca70eb0 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -16,15 +16,15 @@ $HELP = 1 unless &GetOptions('corpus=s' => \$CORPUS, 'model=s' => \$MODEL, 'filler=s' => \$FILLER, - 'factored' => \$FACTORED, + 'factored' => \$FACTORED, 'min-size=i' => \$MIN_SIZE, 'min-count=i' => \$MIN_COUNT, 'max-count=i' => \$MAX_COUNT, 'help' => \$HELP, 'verbose' => \$VERBOSE, - 'syntax' => \$SYNTAX, - 'binarize' => \$BINARIZE, - 'mark-split' => \$MARK_SPLIT, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, 'train' => \$TRAIN); if ($HELP || @@ -155,34 +155,37 @@ sub apply { next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; $COUNT{$lc} = $count; $TRUECASE{$lc} = $factored_word; - $LABEL{$lc} = $label if $SYNTAX; + $LABEL{$lc} = $label if $SYNTAX; } close(MODEL); while(<STDIN>) { my $first = 1; chop; s/\s+/ /g; s/^ //; s/ $//; - my @BUFFER; # for xml tags + my @BUFFER; # for xml tags foreach my $factored_word (split) { print " " unless $first; $first = 0; - # syntax: don't split xml - if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) { - push @BUFFER,$factored_word; - $first = 1; - next; - } - - # get case class - my $word = $factored_word; - $word =~ s/\|.+//g; # just first factor - my $lc = lc($word); - + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + + print STDERR "considering $word ($lc)...\n" if $VERBOSE; # don't split frequent words - if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) { - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || + $lc !~ /[a-zA-Z]/) {; # has to have at least one letter + print join(" ",@BUFFER)." 
" if scalar(@BUFFER); @BUFFER = (); # clear buffer print $factored_word; + print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; next; } diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 7533b39e0..192169c86 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -153,9 +153,9 @@ if (defined($baselineExtract)) { $catOCmd .= "$baselineExtract.o$sorted.gz "; } -$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.sorted.gz \n"; -$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.inv.sorted.gz \n"; -$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n"; +$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.sorted.gz 2>> /dev/stderr \n"; +$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.inv.sorted.gz 2>> /dev/stderr \n"; +$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.o.sorted.gz 2>> /dev/stderr \n"; @children = (); diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index d1840fc55..b8d393e71 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -64,6 +64,7 @@ my $wordgraphfile=undef; my $wordgraphflag=0; my $robust=5; # resubmit crashed jobs robust-times my $alifile=undef; +my $detailsfile=undef; my $logfile=""; my $logflag=""; my $searchgraphlist=""; @@ -93,6 +94,7 @@ sub init(){ 'output-search-graph|osg=s'=> \$searchgraphlist, 'output-word-graph|owg=s'=> \$wordgraphlist, 'alignment-output-file=s'=> \$alifile, + 'translation-details|T=s'=> \$detailsfile, 'qsub-prefix=s'=> \$qsubname, 'queue-parameters=s'=> \$queueparameters, 'inputtype=i'=> \$inputtype, @@ -539,6 +541,7 @@ while ($robust && scalar @idx_todo) { concatenate_1best(); concatenate_logs() if $logflag; concatenate_ali() if defined $alifile; +concatenate_details() if defined $detailsfile; concatenate_nbest() if $nbestflag; safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-'; @@ -580,6 +583,11 @@ sub preparing_script(){ $tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx"; } + my $tmpdetailsoutfile = ""; + if (defined $detailsfile){ + $tmpdetailsoutfile="-translation-details $tmpdir/$detailsfile.$splitpfx$idx"; + } + my $tmpsearchgraphlist=""; if ($searchgraphflag){ $tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx"; @@ -592,13 +600,17 @@ sub preparing_script(){ my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId"; - print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; + print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; print OUT "echo exit status \$\?\n\n"; if (defined $alifile){ print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } + if (defined $detailsfile){ + print OUT "\\mv -f $tmpdir/${detailsfile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } if ($nbestflag){ print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; @@ -827,6 +839,18 @@ sub 
concatenate_ali(){ close(OUT); } +sub concatenate_details(){ + open (OUT, "> ${detailsfile}"); + foreach my $idx (@idxlist){ + my @in=(); + open (IN, "$detailsfile.$splitpfx$idx"); + @in=<IN>; + print OUT "@in"; + close(IN); + } + close(OUT); +} + sub check_exit_status(){ print STDERR "check_exit_status\n"; @@ -925,6 +949,7 @@ sub remove_temporary_files(){ unlink("${inputfile}.${splitpfx}${idx}.trans"); unlink("${inputfile}.${splitpfx}${idx}"); if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); } + if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); } if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); } if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); } if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); } diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 879212e6e..f1f8f9ef6 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span sub extract_sgml_tag_attribute { my ($name, $data) = @_; - ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); + ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : (); } ################################# diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 520fbddbe..3f763e5d9 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -163,7 +163,7 @@ else $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR "; } - $cmd .= " | gzip -c > $ptHalf"; + $cmd .= " | gzip -c > $ptHalf 2>> /dev/stderr "; } print STDERR $cmd; systemCheck($cmd); diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index 49c89c299..012c143ac 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -6,11 +6,12 @@ use Getopt::Long "GetOptions"; binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); - -my ($SRC,$INFILE); +my ($SRC,$INFILE,$UNBUFFERED); die("detruecase.perl < in > out") unless &GetOptions('headline=s' => \$SRC, - 'in=s' => \$INFILE); + 'in=s' => \$INFILE, + 'b|unbuffered' => \$UNBUFFERED); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index c83c30daa..2858cda61 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -4,7 +4,7 @@ use strict; use Getopt::Long "GetOptions"; -my ($SRC,$INFILE,$RECASE_MODEL); +my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED); my $MOSES = "moses"; my $LANGUAGE = "en"; # English by default; die("recase.perl --in file --model ini-file > out") @@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out") 'headline=s' => \$SRC, 'lang=s' => \$LANGUAGE, 'moses=s' => \$MOSES, - 'model=s' => \$RECASE_MODEL) + 'model=s' => \$RECASE_MODEL, + 'b|unbuffered' => \$UNBUFFERED) && defined($INFILE) && defined($RECASE_MODEL); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my %treated_languages = map { ($_,1) } qw/en cs/; die "I don't know any rules for $LANGUAGE. Use 'en' as the default." 
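A note between the script hunks: detruecase.perl and recase.perl above, and truecase.perl in the next file, all gain a -b (unbuffered) option that sets Perl's $| flag so each printed line is flushed immediately, which matters when the recaser runs inside a pipeline behind the decoder. A minimal sketch of the equivalent idiom in the project's own C++, purely illustrative and not code from this commit:

#include <iostream>

int main() {
  // std::unitbuf flushes the stream after every insertion, the C++
  // counterpart of setting $| = 1 in the Perl scripts.
  std::cout << std::unitbuf;
  std::cout << "flushed line by line" << std::endl;
  return 0;
}
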
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index 0e2df27a2..517f5c7a1 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 
 # apply switches
-my $MODEL;
-die("truecase.perl --model truecaser < in > out")
-  unless &GetOptions('model=s' => \$MODEL);
+my ($MODEL, $UNBUFFERED);
+die("truecase.perl --model MODEL [-b] < in > out")
+  unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
+  && defined($MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 my (%BEST,%KNOWN);
 open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
new file mode 100644
index 000000000..c6b9af8ca
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
@@ -0,0 +1,103 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Á
+É
+Í
+Ó
+Ö
+Ő
+Ú
+Ü
+Ű
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Dr
+dr
+kb
+Kb
+vö
+Vö
+pl
+Pl
+ca
+Ca
+min
+Min
+max
+Max
+ún
+Ún
+prof
+Prof
+de
+De
+du
+Du
+Szt
+St
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+
+# Month name abbreviations
+jan #NUMERIC_ONLY#
+Jan #NUMERIC_ONLY#
+Feb #NUMERIC_ONLY#
+feb #NUMERIC_ONLY#
+márc #NUMERIC_ONLY#
+Márc #NUMERIC_ONLY#
+ápr #NUMERIC_ONLY#
+Ápr #NUMERIC_ONLY#
+máj #NUMERIC_ONLY#
+Máj #NUMERIC_ONLY#
+jún #NUMERIC_ONLY#
+Jún #NUMERIC_ONLY#
+Júl #NUMERIC_ONLY#
+júl #NUMERIC_ONLY#
+aug #NUMERIC_ONLY#
+Aug #NUMERIC_ONLY#
+Szept #NUMERIC_ONLY#
+szept #NUMERIC_ONLY#
+okt #NUMERIC_ONLY#
+Okt #NUMERIC_ONLY#
+nov #NUMERIC_ONLY#
+Nov #NUMERIC_ONLY#
+dec #NUMERIC_ONLY#
+Dec #NUMERIC_ONLY#
+
+# Other abbreviations
+tel #NUMERIC_ONLY#
+Tel #NUMERIC_ONLY#
+Fax #NUMERIC_ONLY#
+fax #NUMERIC_ONLY#
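
Note: in these prefix files, a bare entry suppresses a sentence break after "entry.", while an entry tagged #NUMERIC_ONLY# suppresses the break only when a digit follows. A rough Perl sketch of how a splitter might load and consult such a file (not the actual split-sentences.perl code; names are illustrative):

  use strict;

  # Load a nonbreaking-prefix file: plain entries never end a sentence;
  # entries tagged #NUMERIC_ONLY# only block a break when a digit follows.
  my %NONBREAKING;
  open(my $fh, '<:encoding(UTF-8)', 'nonbreaking_prefix.hu') or die $!;
  while (<$fh>) {
      chomp;
      next if /^#/ || /^\s*$/;            # skip comments and blank lines
      if (/^(.+?)\s+\#NUMERIC_ONLY\#/) {
          $NONBREAKING{$1} = 2;           # numeric-only prefix
      } else {
          $NONBREAKING{$_} = 1;           # unconditional prefix
      }
  }
  close($fh);

  # Decide whether "$word." followed by $next should suppress a sentence break.
  sub is_nonbreaking {
      my ($word, $next) = @_;
      my $type = $NONBREAKING{$word} or return 0;
      return 1 if $type == 1;
      return ($next =~ /^[0-9]/) ? 1 : 0; # NUMERIC_ONLY: needs a digit after
  }
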
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
new file mode 100644
index 000000000..81754a17a
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
@@ -0,0 +1,100 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+Ā
+B
+C
+Č
+D
+E
+Ē
+F
+G
+Ģ
+H
+I
+Ī
+J
+K
+Ķ
+L
+Ļ
+M
+N
+Ņ
+O
+P
+Q
+R
+S
+Š
+T
+U
+Ū
+V
+W
+X
+Y
+Z
+Ž
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+dr
+Dr
+med
+prof
+Prof
+inž
+Inž
+ist.loc
+Ist.loc
+kor.loc
+Kor.loc
+v.i
+vietn
+Vietn
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+a.l
+t.p
+pārb
+Pārb
+vec
+Vec
+inv
+Inv
+sk
+Sk
+spec
+Spec
+vienk
+Vienk
+virz
+Virz
+māksl
+Māksl
+mūz
+Mūz
+akad
+Akad
+soc
+Soc
+galv
+Galv
+vad
+Vad
+sertif
+Sertif
+folkl
+Folkl
+hum
+Hum
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index f59cd5f86..986a2dfb5 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -171,7 +171,7 @@ if ($TIMING)
 
 # tokenize a batch of texts saved in an array
 # input: an array containing a batch of texts
-# return: another array cotaining a batch of tokenized texts for the input array
+# return: another array containing a batch of tokenized texts for the input array
 sub tokenize_batch
 {
   my(@text_list) = @_;
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index bea32052a..2865fe391 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
 if (-e $l1input) {
   $opn = $l1input;
 } elsif (-e $l1input.".gz") {
-  $opn = "zcat $l1input.gz |";
+  $opn = "gunzip -c $l1input.gz |";
 } else {
   die "Error: $l1input does not exist";
 }
@@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
 if (-e $l2input) {
   $opn = $l2input;
 } elsif (-e $l2input.".gz") {
-  $opn = "zcat $l2input.gz |";
+  $opn = "gunzip -c $l2input.gz |";
 } else {
   die "Error: $l2input does not exist";
 }
@@ -160,3 +160,4 @@ sub word_count {
   my @w = split(/ /,$line);
   return scalar @w;
 }
+
diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py
index 8bef034de..86c8b300e 100755
--- a/scripts/training/filter-rule-table.py
+++ b/scripts/training/filter-rule-table.py
@@ -40,7 +40,8 @@ def printUsage():
 def main():
     parser = optparse.OptionParser()
     parser.add_option("-c", "--min-non-initial-rule-count",
-                      action="store", dest="minCount", type="int", default="1",
+                      action="store", dest="minCount",
+                      type="float", default="0.0",
                       help="prune non-initial rules where count is below N",
                       metavar="N")
     (options, args) = parser.parse_args()
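
Note: the zcat → gunzip -c switch in clean-corpus-n.perl avoids a portability trap: on some systems (notably OS X) zcat expects legacy .Z input and refuses .gz files. A hedged sketch of the open-maybe-gzipped idiom, using a hypothetical helper name open_maybe_gz:

  use strict;

  # Open a corpus file that may or may not be gzip-compressed.
  # "gunzip -c" is preferred over "zcat", which on some platforms
  # only accepts legacy .Z files and fails on .gz input.
  sub open_maybe_gz {
      my ($path) = @_;
      my $spec = -e $path      ? $path
               : -e "$path.gz" ? "gunzip -c $path.gz |"
               : die "Error: $path does not exist\n";
      open(my $fh, $spec) or die "Cannot open $spec: $!\n";
      return $fh;
  }

  my $fh = open_maybe_gz("corpus.en");   # illustrative file name
  my $lines = 0;
  $lines++ while <$fh>;
  print "$lines lines\n";
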
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 688e8ce55..9f5f25f15 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/perl -w
 # $Id$
 # Usage:
 # mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
@@ -371,7 +371,7 @@ my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt");  # or set t
 if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
   print "Could not find $pro_optimizer, installing it in $mertdir\n";
-  my $megam_url = "http://www.umiacs.umd.edu/~hal/megam/";
+  my $megam_url = "http://hal3.name/megam";
   if (&is_mac_osx()) {
     die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
   }
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 5b0553581..e4292007e 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -38,8 +38,9 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
    @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
-   $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_CORPUS, $_BASELINE_ALIGNMENT,
+   $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
    $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $IGNORE);
+my $_BASELINE_CORPUS = "";
 my $_CORES = 1;
 
 my $debug = 0; # debug this script, do not delete any files in debug mode
diff --git a/util/file.cc b/util/file.cc
index 86d9b12de..c7d8e23b2 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -111,15 +111,26 @@ void ResizeOrThrow(int fd, uint64_t to) {
   UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes");
 }
 
+namespace {
+std::size_t GuardLarge(std::size_t size) {
+  // The following operating systems have broken read/write/pread/pwrite that
+  // only supports up to 2^31.
+#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID)
+  return std::min(static_cast<std::size_t>(INT_MAX), size);
+#else
+  return size;
+#endif
+}
+}
+
 std::size_t PartialRead(int fd, void *to, std::size_t amount) {
 #if defined(_WIN32) || defined(_WIN64)
-  amount = min(static_cast<std::size_t>(INT_MAX), amount);
-  int ret = _read(fd, to, amount);
+  int ret = _read(fd, to, GuardLarge(amount));
 #else
   errno = 0;
   ssize_t ret;
   do {
-    ret = read(fd, to, amount);
+    ret = read(fd, to, GuardLarge(amount));
   } while (ret == -1 && errno == EINTR);
 #endif
   UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
@@ -169,11 +180,13 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
     ssize_t ret;
     errno = 0;
     do {
+      ret =
 #ifdef OS_ANDROID
-      ret = pread64(fd, to, size, off);
+        pread64
 #else
-      ret = pread(fd, to, size, off);
+        pread
 #endif
+        (fd, to, GuardLarge(size), off);
     } while (ret == -1 && errno == EINTR);
     if (ret <= 0) {
       UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
@@ -190,14 +203,20 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
   const uint8_t *data = static_cast<const uint8_t*>(data_void);
   while (size) {
 #if defined(_WIN32) || defined(_WIN64)
-    int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
+    int ret;
 #else
-    errno = 0;
     ssize_t ret;
+#endif
+    errno = 0;
     do {
-      ret = write(fd, data, size);
-    } while (ret == -1 && errno == EINTR);
+      ret =
+#if defined(_WIN32) || defined(_WIN64)
+        _write
+#else
+        write
 #endif
+        (fd, data, GuardLarge(size));
+    } while (ret == -1 && errno == EINTR);
     UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes");
     data += ret;
     size -= ret;
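
Note: the util/file.cc refactor centralizes two defensive I/O patterns: GuardLarge clamps each request to INT_MAX on platforms whose read/write/pread/pwrite misbehave above 2^31 bytes, and the do/while loops retry calls interrupted by EINTR, with WriteOrThrow also looping over short writes. For illustration only, the same clamp-and-retry pattern transposed to Perl ($MAX_CHUNK and write_or_die are my names, not from the patch):

  use strict;
  use POSIX qw(EINTR);

  my $MAX_CHUNK = 2**31 - 1;   # analogue of the INT_MAX clamp in GuardLarge()

  # Write a buffer fully, retrying short writes and EINTR interruptions.
  sub write_or_die {
      my ($fh, $data) = @_;
      my $off = 0;
      while ($off < length $data) {
          my $want = length($data) - $off;
          $want = $MAX_CHUNK if $want > $MAX_CHUNK;  # clamp oversized requests
          my $ret = syswrite($fh, $data, $want, $off);
          if (!defined $ret) {
              next if $! == EINTR;   # interrupted by a signal: just retry
              die "write failed: $!";
          }
          $off += $ret;              # short write: loop for the remainder
      }
  }
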
diff --git a/util/read_compressed.cc b/util/read_compressed.cc
index b81549e42..b62a6e833 100644
--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@@ -180,12 +180,73 @@ class GZip : public ReadBase {
 };
 #endif // HAVE_ZLIB
 
+const uint8_t kBZMagic[3] = {'B', 'Z', 'h'};
+
 #ifdef HAVE_BZLIB
 class BZip : public ReadBase {
   public:
-    explicit BZip(int fd, void *already_data, std::size_t already_size) {
+    BZip(int fd, void *already_data, std::size_t already_size) {
       scoped_fd hold(fd);
       closer_.reset(FDOpenReadOrThrow(hold));
+      file_ = NULL;
+      Open(already_data, already_size);
+    }
+
+    BZip(FILE *file, void *already_data, std::size_t already_size) {
+      closer_.reset(file);
+      file_ = NULL;
+      Open(already_data, already_size);
+    }
+
+    ~BZip() {
+      Close(file_);
+    }
+
+    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+      assert(file_);
+      int bzerror = BZ_OK;
+      int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
+      long pos = ftell(closer_.get());
+      if (pos != -1) ReadCount(thunk) = pos;
+      switch (bzerror) {
+        case BZ_STREAM_END:
+          /* bzip2 files can be concatenated by e.g. pbzip2.  Annoyingly, the
+           * library doesn't handle this internally.  This gets the trailing
+           * data, grows it up to magic as needed, validates the magic, and
+           * reopens.
+           */
+          {
+            bzerror = BZ_OK;
+            void *trailing_data;
+            int trailing_size;
+            BZ2_bzReadGetUnused(&bzerror, file_, &trailing_data, &trailing_size);
+            UTIL_THROW_IF(bzerror != BZ_OK, BZException, "bzip2 error in BZ2_bzReadGetUnused " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+            std::string trailing(static_cast<const char*>(trailing_data), trailing_size);
+            Close(file_);
+
+            if (trailing_size < (int)sizeof(kBZMagic)) {
+              trailing.resize(sizeof(kBZMagic));
+              if (1 != fread(&trailing[trailing_size], sizeof(kBZMagic) - trailing_size, 1, closer_.get())) {
+                UTIL_THROW_IF(trailing_size, BZException, "File has trailing cruft");
+                // Legitimate end of file.
+                ReplaceThis(new Complete(), thunk);
+                return ret;
+              }
+            }
+            UTIL_THROW_IF(memcmp(trailing.data(), kBZMagic, sizeof(kBZMagic)), BZException, "Trailing cruft is not another bzip2 stream");
+            Open(&trailing[0], trailing.size());
+          }
+          return ret;
+        case BZ_OK:
+          return ret;
+        default:
+          UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+      }
+    }
+
+  private:
+    void Open(void *already_data, std::size_t already_size) {
+      assert(!file_);
       int bzerror = BZ_OK;
       file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size);
       switch (bzerror) {
@@ -199,38 +260,23 @@ class BZip : public ReadBase {
           UTIL_THROW(BZException, "IO error reading file");
         case BZ_MEM_ERROR:
           throw std::bad_alloc();
+        default:
+          UTIL_THROW(BZException, "Unknown bzip2 error code " << bzerror);
       }
+      assert(file_);
     }
 
-    ~BZip() {
+    static void Close(BZFILE *&file) {
+      if (file == NULL) return;
       int bzerror = BZ_OK;
-      BZ2_bzReadClose(&bzerror, file_);
+      BZ2_bzReadClose(&bzerror, file);
       if (bzerror != BZ_OK) {
-        std::cerr << "bz2 readclose error" << std::endl;
+        std::cerr << "bz2 readclose error number " << bzerror << std::endl;
         abort();
       }
+      file = NULL;
     }
 
-    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
-      int bzerror = BZ_OK;
-      int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
-      long pos;
-      switch (bzerror) {
-        case BZ_STREAM_END:
-          pos = ftell(closer_.get());
-          if (pos != -1) ReadCount(thunk) = pos;
-          ReplaceThis(new Complete(), thunk);
-          return ret;
-        case BZ_OK:
-          pos = ftell(closer_.get());
-          if (pos != -1) ReadCount(thunk) = pos;
-          return ret;
-        default:
-          UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
-      }
-    }
-
-  private:
     scoped_FILE closer_;
     BZFILE *file_;
 };
@@ -346,11 +392,11 @@ MagicResult DetectMagic(const void *from_void) {
   if (header[0] == 0x1f && header[1] == 0x8b) {
     return GZIP;
   }
-  if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') {
+  if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) {
    return BZIP;
   }
-  const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
-  if (!memcmp(header, xzmagic, 6)) {
+  const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
+  if (!memcmp(header, kXZMagic, sizeof(kXZMagic))) {
     return XZIP;
   }
   return UNKNOWN;
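
Note: the rewritten BZip reader handles streams concatenated by tools like pbzip2: on BZ_STREAM_END it collects the unused trailing bytes, tops them up to sizeof(kBZMagic), and reopens if they begin with another "BZh" header — the same magic that DetectMagic checks up front. Purely as illustration, the equivalent magic-byte sniffing in Perl (the file handling is mine, not from util/):

  use strict;

  # Identify a compressed file by its leading magic bytes, mirroring the
  # gzip/bzip2/xz checks in DetectMagic above.
  sub detect_magic {
      my ($path) = @_;
      open(my $fh, '<:raw', $path) or die "open $path: $!";
      my $header = '';
      read($fh, $header, 6);   # short files simply fail all comparisons
      close($fh);
      return 'gzip'  if substr($header, 0, 2) eq "\x1f\x8b";
      return 'bzip2' if substr($header, 0, 3) eq 'BZh';
      return 'xz'    if substr($header, 0, 6) eq "\xFD" . "7zXZ" . "\x00";
      return 'unknown';
  }

  print detect_magic($ARGV[0]), "\n";
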