| author | Barry Haddow <barry.haddow@gmail.com> | 2013-04-12 19:07:26 +0400 |
|---|---|---|
| committer | Barry Haddow <barry.haddow@gmail.com> | 2013-04-12 19:07:26 +0400 |
| commit | 9d42c7f6f74bbb0079768a762fc4546d20d6b634 (patch) | |
| tree | ab1a2a2884a3b3b809a969ea0eb36fb98416347e | |
| parent | c5965b8587b37986ebab786905a8ef9f218403de (diff) | |
| parent | 517d6c7bb834e40bcf25e8cbc79985180cb7f29f (diff) | |

Merge branch 'master' of github.com:moses-smt/mosesdecoder
98 files changed, 2718 insertions, 348 deletions
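The most user-visible change in this merge is to biconcor: `Print()` is split into `Print()`/`PrintPretty()`, the translation and example limits become the `--translations` and `--examples` options, and a new `--stdio` mode answers queries over stdin/stdout between `-|||- BICONCOR START -|||-` and `-|||- BICONCOR END -|||-` sentinels. Below is a minimal client sketch for that mode — it is not part of the commit; the binary path, model file name, and query phrase are placeholders, while the option names, sentinel strings, and the `TOTAL:` reply line are taken from the biconcor.cpp diff that follows.

```python
# Minimal sketch of a client for the new "biconcor --stdio" mode.
# Assumptions: "./biconcor" and "model-file" are placeholder paths;
# sentinels and option names match biconcor.cpp in the diff below.
import subprocess

START = "-|||- BICONCOR START -|||-"
END = "-|||- BICONCOR END -|||-"

proc = subprocess.Popen(
    ["./biconcor", "--load", "model-file", "--stdio",
     "--translations", "20", "--examples", "50"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
    universal_newlines=True, bufsize=1)

# The binary announces readiness with the START sentinel once the
# suffix array, target corpus and alignment have been loaded.
line = proc.stdout.readline()
while line and line.strip() != START:
    line = proc.stdout.readline()

def query(phrase):
    """Send one source phrase; collect reply lines up to the END sentinel.
    The first reply line is "TOTAL: <n>", the number of matches found."""
    proc.stdin.write(phrase + "\n")
    proc.stdin.flush()
    reply = []
    while True:
        out = proc.stdout.readline()
        if not out or out.strip() == END:
            return reply
        reply.append(out.rstrip("\n"))

print(query("the house"))
proc.stdin.close()
```

One query per line is the whole protocol, so the same loop works whether the server was started with `--html`, `--pretty`, or plain output; only the lines between `TOTAL:` and the END sentinel change shape.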
diff --git a/.gitmodules b/.gitmodules index e69de29bb..d3a8cb4da 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "contrib/arrow-pipelines/python/libs/pypeline"] + path = contrib/arrow-pipelines/python/libs/pypeline + url = git://github.com/ianj-als/pypeline.git diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt index 318956ccd..3dac64f60 100644 --- a/BUILD-INSTRUCTIONS.txt +++ b/BUILD-INSTRUCTIONS.txt @@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES Generally, for trouble installing external libraries, you should get support directly from the library maker: -Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html +Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user @@ -1,3 +1,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations). This code includes data from czech wiktionary (also czech abbreviations). + + diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp index 5f6da5a33..5d4e0be8d 100644 --- a/OnDiskPt/Main.cpp +++ b/OnDiskPt/Main.cpp @@ -174,6 +174,7 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr break; } default: + cerr << "ERROR in line " << line << endl; assert(false); break; } diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp index 9c16be77c..038fa3a31 100644 --- a/biconcor/PhrasePair.cpp +++ b/biconcor/PhrasePair.cpp @@ -8,7 +8,42 @@ using namespace std; -void PhrasePair::Print( ostream* out, int width ) const +void PhrasePair::Print( ostream* out ) const +{ + // source + int sentence_start = m_source_position - m_source_start; + char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) ); + + for( char i=0; i<source_length; i++ ) { + if (i>0) *out << " "; + *out << m_suffixArray->GetWord( sentence_start + i ); + } + + // target + *out << " |||"; + for( char i=0; i<m_target_length; i++ ) { + *out << " " << m_targetCorpus->GetWord( m_sentence_id, i); + } + + // source span + *out << " ||| " << (int)m_source_start << " " << (int)m_source_end; + + // target span + *out << " ||| " << (int)m_target_start << " " << (int)m_target_end; + + // word alignment + *out << " |||"; + + INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id ); + for( INDEX i=0; i<ap_points; i++) { + *out << " " << m_alignment->GetSourceWord( m_sentence_id, i ) + << "-" << m_alignment->GetTargetWord( m_sentence_id, i ); + } + + *out << endl; +} + +void PhrasePair::PrintPretty( ostream* out, int width ) const { vector< WORD_ID >::const_iterator t; diff --git a/biconcor/PhrasePair.h b/biconcor/PhrasePair.h index f8a7881a0..f1dadb637 100644 --- a/biconcor/PhrasePair.h +++ b/biconcor/PhrasePair.h @@ -43,7 +43,8 @@ public: ~PhrasePair () {} void PrintTarget( std::ostream* out ) const; - void Print( std::ostream* out, int width ) const; + void Print( std::ostream* out ) const; + void PrintPretty( std::ostream* out, int width ) const; void PrintHTML( std::ostream* out ) const; void PrintClippedHTML( std::ostream* out, int width ) const; }; diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp index 17c95d24a..7497b2af8 100644 --- a/biconcor/PhrasePairCollection.cpp +++ b/biconcor/PhrasePairCollection.cpp @@ -13,31 +13,32 @@ using namespace std; -PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a 
) +PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example ) :m_suffixArray(sa) ,m_targetCorpus(tc) ,m_alignment(a) ,m_size(0) - ,m_max_lookup(10000) - ,m_max_pp_target(50) - ,m_max_pp(50) + ,m_max_lookup(10000) // maximum number of source occurrences sampled + ,m_max_translation(max_translation) // max number of different distinct translations returned + ,m_max_example(max_example) // max number of examples returned for each distinct translation {} PhrasePairCollection::~PhrasePairCollection() {} -bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) +int PhrasePairCollection::GetCollection( const vector< string >& sourceString ) { INDEX first_match, last_match; if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) { - return false; + return 0; } - cerr << "\tfirst match " << first_match << endl; - cerr << "\tlast match " << last_match << endl; + //cerr << "\tfirst match " << first_match << endl; + //cerr << "\tlast match " << last_match << endl; INDEX found = last_match - first_match +1; map< vector< WORD_ID >, INDEX > index; + int real_count = 0; for( INDEX i=first_match; i<=last_match; i++ ) { int position = m_suffixArray->GetPosition( i ); int source_start = m_suffixArray->GetWordInSentence( position ); @@ -45,23 +46,23 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) INDEX sentence_id = m_suffixArray->GetSentence( position ); int sentence_length = m_suffixArray->GetSentenceLength( sentence_id ); int target_length = m_targetCorpus->GetSentenceLength( sentence_id ); - cerr << "match " << (i-first_match) - << " in sentence " << sentence_id - << ", starting at word " << source_start - << " of " << sentence_length - << ". target sentence has " << target_length << " words."; + //cerr << "match " << (i-first_match) + //<< " in sentence " << sentence_id + //<< ", starting at word " << source_start + //<< " of " << sentence_length + //<< ". 
target sentence has " << target_length << " words."; int target_start, target_end, pre_null, post_null; if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { - cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; - cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; + //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; + //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; bool null_boundary_words = false; for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) { for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) { vector< WORD_ID > targetString; - cerr << "; "; + //cerr << "; "; for (int target = target_start - pre; target <= target_end + post; target++) { targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) ); - cerr << m_targetCorpus->GetWord( sentence_id, target) << " "; + //cerr << m_targetCorpus->GetWord( sentence_id, target) << " "; } PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post); // matchCollection.Add( sentence_id, ) @@ -76,37 +77,47 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString ) } } else { - cerr << "mismatch " << (i-first_match) - << " in sentence " << sentence_id - << ", starting at word " << source_start - << " of " << sentence_length - << ". target sentence has " << target_length << " words."; + //cerr << "mismatch " << (i-first_match) + // << " in sentence " << sentence_id + // << ", starting at word " << source_start + // << " of " << sentence_length + // << ". 
target sentence has " << target_length << " words."; Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); if (mismatch->Unaligned()) m_unaligned.push_back( mismatch ); else m_mismatch.push_back( mismatch ); } - cerr << endl; + //cerr << endl; if (found > (INDEX)m_max_lookup) { i += found/m_max_lookup-1; } + real_count++; } sort(m_collection.begin(), m_collection.end(), CompareBySize()); - return true; + return real_count; } -void PhrasePairCollection::Print() const +void PhrasePairCollection::Print(bool pretty) const { vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget; - for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) { + int i=0; + for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && i<m_max_translation; i++, ppWithSameTarget++ ) { (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); int count = ppWithSameTarget->size(); cout << "(" << count << ")" << endl; - vector< PhrasePair* >::const_iterator p; - for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) { - (*p)->Print( &cout, 100 ); + vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin(); + for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) { + if (pretty) { + (*p)->PrintPretty( &cout, 100 ); + } + else { + (*p)->Print( &cout ); + } + if (ppWithSameTarget->size() > m_max_example) { + p += ppWithSameTarget->size()/m_max_example-1; + } } } } @@ -117,7 +128,7 @@ void PhrasePairCollection::PrintHTML() const bool singleton = false; // loop over all translations vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget; - for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) { + for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) { int count = ppWithSameTarget->size(); if (!singleton) { @@ -143,9 +154,9 @@ void PhrasePairCollection::PrintHTML() const int i=0; for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_pp) { - p += count/m_max_pp-1; - pp += count/m_max_pp-1; + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; } } if (i == 10 && pp < count) { @@ -153,11 +164,11 @@ void PhrasePairCollection::PrintHTML() const cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>"; cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">"; cout << "<table align=\"center\">"; - for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { + for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_pp) { - p += count/m_max_pp-1; - pp += count/m_max_pp-1; + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; } } } @@ -172,7 +183,7 @@ void PhrasePairCollection::PrintHTML() const if (singleton) cout << 
"</table></div>\n"; else if (pp_target > 9) cout << "</div>"; - size_t max_mismatch = m_max_pp/3; + size_t max_mismatch = m_max_example/3; // unaligned phrases if (m_unaligned.size() > 0) { cout << "<p class=\"pp_singleton_header\">unaligned" diff --git a/biconcor/PhrasePairCollection.h b/biconcor/PhrasePairCollection.h index f88bfc10f..e076eba9b 100644 --- a/biconcor/PhrasePairCollection.h +++ b/biconcor/PhrasePairCollection.h @@ -22,19 +22,19 @@ private: std::vector< Mismatch* > m_mismatch, m_unaligned; int m_size; int m_max_lookup; - int m_max_pp_target; - int m_max_pp; + int m_max_translation; + int m_max_example; // No copying allowed. PhrasePairCollection(const PhrasePairCollection&); void operator=(const PhrasePairCollection&); public: - PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * ); + PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment *, int, int ); ~PhrasePairCollection (); - bool GetCollection( const std::vector<std::string >& sourceString ); - void Print() const; + int GetCollection( const std::vector<std::string >& sourceString ); + void Print(bool pretty) const; void PrintHTML() const; }; diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp index a25e63cb7..f4e7c03fb 100644 --- a/biconcor/biconcor.cpp +++ b/biconcor/biconcor.cpp @@ -19,8 +19,12 @@ int main(int argc, char* argv[]) int saveFlag = false; int createFlag = false; int queryFlag = false; - int htmlFlag = false; - string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n"; + int htmlFlag = false; // output as HTML + int prettyFlag = false; // output readable on screen + int stdioFlag = false; // receive requests from STDIN, respond to STDOUT + int max_translation = 20; + int max_example = 50; + string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n"; while(1) { static struct option long_options[] = { {"load", required_argument, 0, 'l'}, @@ -29,11 +33,15 @@ int main(int argc, char* argv[]) {"query", required_argument, 0, 'q'}, {"target", required_argument, 0, 't'}, {"alignment", required_argument, 0, 'a'}, - {"html", no_argument, &htmlFlag, 0}, + {"html", no_argument, 0, 'h'}, + {"pretty", no_argument, 0, 'p'}, + {"stdio", no_argument, 0, 'i'}, + {"translations", required_argument, 0, 'o'}, + {"examples", required_argument, 0, 'e'}, {0, 0, 0, 0} }; int option_index = 0; - int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index); + int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index); if (c == -1) break; switch (c) { case 'l': @@ -62,11 +70,29 @@ int main(int argc, char* argv[]) query = string(optarg); queryFlag = true; break; + case 'o': + max_translation = atoi(optarg); + break; + case 'e': + max_example = atoi(optarg); + break; + case 'p': + prettyFlag = true; + break; + case 'h': + htmlFlag = true; + break; + case 'i': + stdioFlag = true; + break; default: cerr << info; exit(1); } } + if (stdioFlag) { + queryFlag = true; + } // check if parameter settings are legal if (saveFlag && !createFlag) { @@ -111,12 +137,37 @@ int main(int argc, char* argv[]) targetCorpus.Load( fileNameSuffix ); alignment.Load( fileNameSuffix ); } - if (queryFlag) { + if (stdioFlag) { + cout << "-|||- BICONCOR START -|||-" << endl << flush; + 
while(true) { + string query; + if (getline(cin, query, '\n').eof()) { + return 0; + } + vector< string > queryString = alignment.Tokenize( query.c_str() ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); + int total = ppCollection.GetCollection( queryString ); + cout << "TOTAL: " << total << endl; + if (htmlFlag) { + ppCollection.PrintHTML(); + } + else { + ppCollection.Print(prettyFlag); + } + cout << "-|||- BICONCOR END -|||-" << endl << flush; + } + } + else if (queryFlag) { cerr << "query is " << query << endl; vector< string > queryString = alignment.Tokenize( query.c_str() ); - PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment ); + PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); ppCollection.GetCollection( queryString ); - ppCollection.PrintHTML(); + if (htmlFlag) { + ppCollection.PrintHTML(); + } + else { + ppCollection.Print(prettyFlag); + } } return 0; @@ -1,17 +1,17 @@ #!/bin/bash set -e +top="$(dirname "$0")" if bjam="$(which bjam 2>/dev/null)" && #exists [ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true ! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script "${bjam}" --sanity-test 2>/dev/null |grep Sane >/dev/null && #The test in jam-files/sanity.jam passes - (cd jam-files/fail && ! "${bjam}") >/dev/null #Returns non-zero on failure + (cd "${top}/jam-files/fail" && ! "${bjam}") >/dev/null #Returns non-zero on failure then #Delegate to system bjam exec "${bjam}" "$@" fi -top="$(dirname "$0")" if [ ! -x "$top"/jam-files/bjam ] || "$top"/jam-files/bjam -v |grep 2011.4 >/dev/null; then pushd "$top/jam-files/engine" ./build.sh diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia Binary files differnew file mode 100644 index 000000000..1d35a1dea --- /dev/null +++ b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README new file mode 100644 index 000000000..e1e12975c --- /dev/null +++ b/contrib/arrow-pipelines/python/README @@ -0,0 +1,32 @@ +Arrow Based Moses Training Pipeline +=================================== + +To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command: + +$ git submodule init + +This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline: + +$ cd libs/pypeline +$ python setup.py install + +Alternatively, you can set an appropriate PYTHONPATH enviornment variable to the Pypeline library. + +This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia. + +Three environment variables need to be set before the manager.py script can be run, they are: + + - MOSES_HOME : The directory where Moses has been cloned, or installed, + - IRSTLM : The installation directory of your IRSTLM, and + - GIZA_HOME : The installation directory of GIZA++. + +The manager.py script takes four positional command-line arguments: + + - The source language code, + - The target language code, + - The source corpus file. This file *must* be cleaned prior to use, and + - The target corpus file. 
This file *must* be cleaned prior to use. + +For example, run the manager.py script with: + +$ python manager.py en lt cleantrain.en cleantrain.lt diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline new file mode 160000 +Subproject a7084b686f5196f1bbac5d389b4a6cd7f15c83f diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py new file mode 100644 index 000000000..1c3ece111 --- /dev/null +++ b/contrib/arrow-pipelines/python/manager.py @@ -0,0 +1,192 @@ +import logging +import os + +from concurrent.futures import Future, ThreadPoolExecutor +from functools import partial +from pypeline.helpers.parallel_helpers import eval_pipeline, \ + cons_function_component, \ + cons_wire, \ + cons_split_wire, \ + cons_unsplit_wire, \ + cons_dictionary_wire + + +# +# Some logging please +# +FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s' +logging.basicConfig(format = FORMAT, level = logging.DEBUG) +logger = logging.getLogger("manager") + + +# Build the pipeline components +def build_components(components, configuration, executor): + pipeline_components = dict() + pipeline_configuration = dict() + + for component_id, module_name in components.items(): + logger.info("Loading [%s] component from [%s]..." % (component_id, module_name)) + + module = __import__(module_name, fromlist = ['configure', 'initialise']) + + # Component builds its own configuration object + config_func = getattr(module, 'configure') + component_config = config_func(configuration) + pipeline_configuration.update(component_config) + + # Now build the component + init_func = getattr(module, 'initialise') + component_function = init_func(component_config) + + # A wrapper for the component's function that submits to the executor + def get_component_function_wrapper(inner_function, comp_id, mod_name): + def component_function_wrapper(a, s): + logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \ + (comp_id, mod_name, a, s)) + return inner_function(a, s) + + return component_function_wrapper + + # Arrowize the component + component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name)) + + # And store + pipeline_components[component_id] = component + + return pipeline_components, pipeline_configuration + + +# Go! +def main(src_lang, trg_lang, src_filename, trg_filename): + # Global configuration + # One day, this configuration shall be constructed from + # command line options, or a properties file. + configuration = { + 'moses_installation_dir': os.environ['MOSES_HOME'], + 'irstlm_installation_dir': os.environ['IRSTLM'], + 'giza_installation_dir': os.environ['GIZA_HOME'], + 'src_lang': src_lang, + 'src_tokenisation_dir': './tokenisation', + 'trg_lang': trg_lang, + 'trg_tokenisation_dir': './tokenisation', + 'segment_length_limit': 60, + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': './language-model', + 'translation_model_directory': './translation-model', + 'mert_working_directory': './mert', + 'evaluation_data_size': 100, + 'development_data_size': 100 + } + + # The modules to load + # In the future, the components shall be specified in some kind + # pipeline description file. 
+ component_modules = { + 'src_tokenizer': 'training.components.tokenizer.src_tokenizer', + 'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer', + 'cleanup': 'training.components.cleanup.cleanup', + 'data_split': 'training.components.data_split.data_split', + 'irstlm_build': 'training.components.irstlm_build.irstlm_build', + 'model_training': 'training.components.model_training.model_training', + 'mert': 'training.components.mert.mert' + } + + # The thread pool + executor = ThreadPoolExecutor(max_workers = 3) + + # Phew, build the required components + components, component_config = build_components(component_modules, configuration, executor) + + # + # Wire up components + # Description of wiring should be, in the future, alongside the component + # specification in some kind of confuguration file. Components shall be + # declared then used, i.e., bind a component instance to a unique component + # identifier, then wire component instances together by identifier. + # + + # + # Tokenisation of source and target... + # + # IRSTLM Build components + irstlm_build_component = cons_split_wire() >> \ + (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \ + components['irstlm_build']).second() >> \ + cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'], + 'trg_language_model_filename': b['compiled_lm_filename']}) + + # The complete tokenisation component + tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \ + irstlm_build_component.second() >> \ + cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'], + 'trg_filename': b['tokenised_trg_filename'], + 'trg_language_model_filename': b['trg_language_model_filename']}) + + # + # Cleanup and Data Spliting... + # + + # + # A function that clips off the last '.' delimited string + # + def clip_last_bit(filename): + bn = os.path.basename(filename) + directory = os.path.dirname(filename) + bits = bn.split(".") + bits.pop() + return os.path.join(directory, ".".join(bits)) + + cleanup_datasplit_component = components['cleanup'] >> \ + cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'], + 'trg_filename': a['cleaned_trg_filename']}) >> \ + components['data_split'] >> \ + cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']), + 'eval_src_filename': a['eval_src_filename'], + 'eval_trg_filename': a['eval_trg_filename']}) + + # + # Translation model training + # + translation_model_component = cons_split_wire() >> \ + components['model_training'].first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': b['eval_src_filename']}) + + # + # The whole pipeline + # + pipeline = tokenisation_component >> \ + cons_split_wire() >> \ + (cleanup_datasplit_component >> translation_model_component).first() >> \ + cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'], + 'development_data_filename': clip_last_bit(t['development_data_filename']), + 'trg_language_model_filename': b['trg_language_model_filename'], + 'trg_language_model_order': 3, + 'trg_language_model_type': 9}) >> \ + components['mert'] + + + # + # The input to the pipeline + # + value = {'src_filename': src_filename, + 'trg_filename': trg_filename} + + # + # Evaluate the pipeline + # + logger.info("Evaluating pipeline with input [%s]..." 
% value) + new_value = eval_pipeline(executor, pipeline, value, component_config) + + # + # Wait for all components to finish + # + executor.shutdown(True) + + logger.info("Pipeline evaluated to %s" % new_value) + + +if __name__ == '__main__': + import sys + + main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/test/__init__.py diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py new file mode 100644 index 000000000..628796f7d --- /dev/null +++ b/contrib/arrow-pipelines/python/test/test.py @@ -0,0 +1,11 @@ +import subprocess + +def cat(filename, content): + fh = open(filename, "w") + for line in content: + #print(line, file=fh) + print >> fh, line + fh.close() + +def diff(filename1, filename2): + subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT) diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py new file mode 100644 index 000000000..cb2e057ce --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py @@ -0,0 +1,125 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print >>ofh1, l1, + print >>ofh2, l2, + + def _make_cleaned_filename(filename): + bits = filename.split(".") + bits[-1] = "clean" + return ".".join(bits) + + def _filter_main(value, config): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + input_trg_filename = value['trg_filename'] + + print "Cleanup: Cleaning [%s] and [%s]..." 
% (input_src_filename, input_trg_filename) + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + + cleaned_src_filename = _make_cleaned_filename(input_src_filename) + cleaned_trg_filename = _make_cleaned_filename(input_trg_filename) + ofh1 = open(cleaned_src_filename, "w") + ofh2 = open(cleaned_trg_filename, "w") + + _filter(limit, ifh1, ofh1, ifh2, ofh2) + + return {'cleaned_src_filename': cleaned_src_filename, + 'cleaned_trg_filename': cleaned_trg_filename} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _filter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + try: + thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename']) + finally: + os.unlink(output['cleaned_src_filename']) + os.unlink(output['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py new file mode 100644 index 000000000..27625c612 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py @@ -0,0 +1,109 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['segment_length'] = args['segment_length_limit'] + return result + +def initialise(config): + def _filter(limit, ifh1, ofh1, ifh2, ofh2): + def _short(line): + n = 0 + for c in line: + if c == " ": + n += 1 + #print(line, ":", n) + return n < limit + + for (l1, l2) in zip(ifh1, ifh2): + if _short(l1) and _short(l2): + print(l1, end='', file=ofh1) + print(l2, end='', file=ofh2) + + def _filter_main(config, value): + limit = config['segment_length'] + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + ifh1 = open(value['src_filename'], "r") + ifh2 = open(value['trg_filename'], "r") + ofh1 = open(value['cleaned_src_filename'], "w") + ofh2 = open(value['cleaned_trg_filename'], "w") + + _filter(limit, ifh1, ofh1, 
ifh2, ofh2) + + return {'cleaned_src_filename': value['cleaned_src_filename'], + 'cleaned_trg_filename': value['cleaned_trg_filename']} + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return cons_function_component(_filter_main) + + +if __name__ == '__main__': + import os + import tempfile + import training.components.shared.test as thelp + + + def _test_main(): + configuration = {'segment_length_limit': 20} + + src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'cleaned_src_filename': src_filename[1] + ".clean", + 'cleaned_trg_filename': trg_filename[1] + ".clean", + 'cleaned_src_file_expected': src_filename[1] + ".expected", + 'cleaned_trg_file_expected': trg_filename[1] + ".expected" + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + + run_pipeline(box, box_config, box_eval) + thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename']) + thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename']) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21])) + thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21])) + #expected output: + thelp.cat(box_eval['cleaned_src_file_expected'], _line([17])) + thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20])) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py new file mode 100644 index 000000000..b8469cbf6 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py @@ -0,0 +1,146 @@ +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['evaluate_size'] = args['evaluation_data_size'] + result['development_size'] = args['development_data_size'] + return result + +def initialise(config): + + def _copy(size, inp, ofh1, ofh2): + try: + while size != 0: + (l1, l2) = inp.next() + print >>ofh1, l1, + print >>ofh2, l2, + size -= 1 + except StopIteration: + pass + + def _make_split_filename(filename, data_set): + bits = filename.split(".") + last = bits.pop() + lang_code = bits.pop() + + bits.append(last) + bits.append(data_set) + bits.append(lang_code) + + new_filename = ".".join(bits) + return new_filename + + def _splitter_main(value, config): + (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None) + try: + input_src_filename = value['src_filename'] + 
input_trg_filename = value['trg_filename'] + + ifh1 = open(input_src_filename, "r") + ifh2 = open(input_trg_filename, "r") + inp = iter(zip(ifh1, ifh2)) + + result = {} + for (data_set, size) in [ + ('devel', config['development_size']), + ('eval', config['evaluate_size']), + ('train', -1) + ]: + output_src_filename = _make_split_filename(input_src_filename, data_set) + output_trg_filename = _make_split_filename(input_trg_filename, data_set) + ofh1 = open(output_src_filename, "w") + ofh2 = open(output_trg_filename, "w") + + _copy(size, inp, ofh1, ofh2) + result[data_set + '_src_filename'] = output_src_filename + result[data_set + '_trg_filename'] = output_trg_filename + + return result + + finally: + def _safe_close(fh): + if fh is not None: + fh.close() + _safe_close(ifh1) + _safe_close(ifh2) + _safe_close(ofh1) + _safe_close(ofh2) + + return _splitter_main + + +if __name__ == '__main__': + import os + import tempfile + import test.test as thelp + + from pypeline.helpers.helpers import eval_pipeline + + + def _test_main(): + configuration = { + 'evaluation_data_size': 7, + 'development_data_size': 13, + } + + src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp") + trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp") + + box_eval = { + 'src_filename': src_filename[1], + 'trg_filename': trg_filename[1], + 'devel_src_expected': src_filename[1] + ".devel.expected", + 'devel_trg_expected': trg_filename[1] + ".devel.expected", + 'eval_src_expected': src_filename[1] + ".eval.expected", + 'eval_trg_expected': trg_filename[1] + ".eval.expected", + 'train_src_expected': src_filename[1] + ".train.expected", + 'train_trg_expected': trg_filename[1] + ".train.expected", + } + + try: + _prep_files(box_eval) + _run_test(configuration, box_eval) + finally: + _cleanup_files(box_eval) + + + def _run_test(configuration, box_eval): + box_config = configure(configuration) + box = initialise(box_config) + + output = eval_pipeline(box, box_eval, box_config) + for data_set in ['devel', 'eval', 'train']: + for lang in ['src', 'trg']: + filename = output[data_set + '_' + lang + '_filename'] + filename_expected = box_eval[data_set + '_' + lang + '_expected'] + thelp.diff(filename_expected, filename) + + + def _line(line_lengths): + def _gen_line(tokens): + return " ".join(map(lambda n: "tok" + str(n), range(tokens))) + return map(_gen_line, line_lengths) + + + def _prep_files(box_eval): + thelp.cat(box_eval['src_filename'], _line(range(50))) + thelp.cat(box_eval['trg_filename'], _line(range(50))) + #expected output: + thelp.cat(box_eval['devel_src_expected'], _line(range(0,13))) + thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13))) + thelp.cat(box_eval['eval_src_expected'], _line(range(13,20))) + thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20))) + thelp.cat(box_eval['train_src_expected'], _line(range(20,50))) + thelp.cat(box_eval['train_trg_expected'], _line(range(20,50))) + + + def _cleanup_files(box_eval): + try: + for key, filename in box_eval.items(): + os.unlink(filename) + except: + pass + + + _test_main() + diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py 
b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py new file mode 100644 index 000000000..f65d61973 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py @@ -0,0 +1,106 @@ +import os +import shutil +import subprocess +import tempfile + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + config = dict() + config['irstlm_install_directory'] = args['irstlm_installation_dir'] + config['smoothing_method'] = args['irstlm_smoothing_method'] + config['lm_directory'] = args['language_model_directory'] + return config + +def initialise(config): + def process(a, s): + # Create the LM directory if we need to + if os.path.exists(s['lm_directory']) is False: + os.makedirs(s['lm_directory']) + + # The filename of the file to chew through + start_end_input_filename = a['input_filename'] + if os.path.exists(start_end_input_filename) is False: + raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename) + + # Derive the output file name for the add start-end marker processor + filename_bits = os.path.basename(start_end_input_filename).split(".") + filename_bits[2] = "sb"; + start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the output file name of the LM build + filename_bits[2] = "lm" + lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # Derive the compiled LM file name + filename_bits[2] = "arpa" + compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits)) + + # First thing to do is add start and end markers + start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")] + infile = open(start_end_input_filename, 'r') + outfile = open(start_end_output_filename, 'w') + print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline) + return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile) + if return_code: + raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \ + start_end_input_filename, start_end_output_filename, return_code) + + # Next build the language model + tmp_dir = tempfile.mkdtemp(dir = "/tmp") + try: + build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"), + "-i", start_end_output_filename, + "-t", tmp_dir, + "-p", + "-s", s['smoothing_method'], + "-o", lm_filename] + print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline) + return_code = subprocess.check_call(build_lm_cmdline) + if return_code: + raise Exception("IRST language model failed to build: return code = [%d]" % return_code) + finally: + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + + # Compile the LM + lm_filename = lm_filename + ".gz" + compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"), + "--text", "yes", + lm_filename, + compiled_lm_filename] + print "IRSTLM Build: Invoking [%s]..." 
% " ".join(compile_lm_cmdline) + return_code = subprocess.check_call(compile_lm_cmdline) + if return_code: + raise Exception("IRST language model compilation failed: return code = [%d]" % return_code) + + output = {'add_start_end_filename': start_end_output_filename, + 'lm_filename': lm_filename, + 'compiled_lm_filename': compiled_lm_filename} + + print "IRSTLM Build: Output = %s" % output + + return output + + return process + + +if __name__ == '__main__': + from pypeline.helpers.helpers import eval_pipeline + + lm_dir = os.environ["PWD"] + configuration = {'irstlm_root': os.environ["IRSTLM"], + 'irstlm_smoothing_method': 'improved-kneser-ney', + 'language_model_directory': lm_dir} + component_config = configure(configuration) + component = initialise(component_config) + + value = eval_pipeline(component, + {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'}, + component_config) + target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'), + 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'), + 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')} + print "Target: %s" % target + if value != target: + raise Exception("Massive fail!") diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py new file mode 100755 index 000000000..2b60b1720 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['mert_working_dir'] = args['mert_working_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['development_data_filename']) + lm_file = os.path.abspath(a['trg_language_model_filename']) + lm_order = int(a['trg_language_model_order']) + lm_type = int(a['trg_language_model_type']) + orig_moses_ini = os.path.abspath(a['moses_ini_file']) + + if not os.path.exists(orig_moses_ini): + raise Exception, "Error: Input moses.ini does not exist" + + workdir = os.path.abspath(config['mert_working_dir']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + moses_install_dir = os.path.abspath(config['moses_installation_dir']) + mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl') + bin_dir = os.path.join(moses_install_dir, 'bin') + moses_bin = os.path.join(moses_install_dir, 'bin', 'moses') + src_file = infilename + '.' + config['src_lang'] + ref_file = infilename + '.' 
+ config['trg_lang'] + logfile = os.path.join(workdir, 'log') + #change lm configuration in moses ini + moses_ini = os.path.join(workdir, 'trained-moses.ini') + cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s" + cmd = cmd % locals() + os.system(cmd) + + #the command + cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s' + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + new_mosesini = os.path.join(workdir, 'moses.ini') + if not os.path.exists(new_mosesini): + raise Exception, 'Failed MERT' + + return {'moses_ini_file':new_mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.path.abspath('../../../../'), + 'mert_working_dir':'../../../../../tuning'} + values = {'development_data_filename':'../../../../../corpus/tune', + 'moses_ini_file':'../../../../../model/model/moses.ini', + 'trg_language_model_filename':'../../../../../corpus/train.lt.lm', + 'trg_language_model_type':9, + 'trg_language_model_order':4} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py new file mode 100755 index 000000000..e990307d2 --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import os, shutil, subprocess + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['trg_lang'] = args['trg_lang'] + result['moses_installation_dir'] = args['moses_installation_dir'] + result['external_bin_dir'] = args['giza_installation_dir'] + result['model_directory'] = args['translation_model_directory'] + return result + +def initialise(config): + + def process(a, s): + infilename = os.path.abspath(a['training_data_filename']) + workdir = os.path.abspath(config['model_directory']) + #simply call the training perl script + #remove the workdir if it is already there + if os.path.exists(workdir): + shutil.rmtree(workdir) + os.makedirs(workdir) + + #local vars + train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl' + src_lang = config['src_lang'].lower() + trg_lang = config['trg_lang'].lower() + external_bin = os.path.abspath(config['external_bin_dir']) + #create a dummy lm file + dummy_lmfile = workdir + os.sep + 'dummy.lm' + f = open(dummy_lmfile, 'w') + print >> f, "dummy lm file" + f.close() + logfile = workdir + os.sep + 'log' + + #the command + cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f 
%(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s' + + cmd = cmd % locals() + + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + + #check the moses ini + mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini' + if not os.path.exists(mosesini): + raise Exception, 'Failed training model' + + return {'moses_ini_file':mosesini} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'en', + 'trg_lang':'lt', + 'moses_installation_dir':os.environ['MOSES_HOME'], + 'giza_installation_dir':os.environ['GIZA_HOME'], + 'translation_model_directory':'model-dir'} + values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(box_config) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py new file mode 100755 index 000000000..57f8771df --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['src_lang'] = args['src_lang'] + result['src_tokenisation_dir'] = args['src_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['src_filename'] + outfilename = Tokenizer.batch_tokenise( + config['src_lang'], + config['moses_installation_dir'], + infilename, + config['src_tokenisation_dir']) + return {'tokenised_src_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'src_lang':'de', + 'src_tokenisation_dir':'tmptok', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'src_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de new file mode 100644 index 000000000..c6b41edbe --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de @@ -0,0 +1,3 @@ +asdfweoih +awfwoeijf awefo +what's this diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py new file mode 100644 index 000000000..354ec1abc --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +import sys, os, subprocess + +class Tokenizer: + + @staticmethod + def 
batch_tokenise(lang, mosesdir, infilename, workdir): + print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir) + if not os.path.exists(workdir): + os.makedirs(workdir) + tok = Tokenizer(lang, mosesdir) + basefilename = os.path.basename(infilename) + outfilename = workdir + os.sep + basefilename + '.tok' + tok.file_tokenise(infilename, outfilename) + return outfilename + + def __init__(self, lang, mosesdir): + self.arrows = None + self.lang = lang + #check the perl tokenizer is here + #path = os.path.dirname(os.path.abspath(__file__)) + path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer' + self.perltok = path + os.sep + 'tokenizer.perl' + if not os.path.exists(path): + raise Exception, "Perl tokenizer does not exists" + + def file_tokenise(self, infilename, outfilename): + cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename) + pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True) + pipe.wait() + +if __name__ == '__main__': + #do some test + pass + diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py new file mode 100755 index 000000000..3852e296f --- /dev/null +++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python + +import os + +from tokenizer import Tokenizer + +from pypeline.helpers.helpers import cons_function_component + +def configure(args): + result = {} + result['trg_lang'] = args['trg_lang'] + result['trg_tokenisation_dir'] = args['trg_tokenisation_dir'] + result['moses_installation_dir'] = args['moses_installation_dir'] + return result + +def initialise(config): + + def process(a, s): + infilename = a['trg_filename'] + outfilename = Tokenizer.batch_tokenise( + config['trg_lang'], + config['moses_installation_dir'], + infilename, + config['trg_tokenisation_dir']) + return {'tokenised_trg_filename':outfilename} + + return process + +if __name__ == '__main__': + + def __test(): + configuration = {'trg_lang':'de', + 'trg_tokenisation_dir':'tmptoktrg', + 'moses_installation_dir':os.path.abspath('../../../../')} + values = {'trg_filename':'tmp.de'} + from pypeline.helpers.helpers import run_pipeline + box_config = configure(configuration) + box = initialise(configuration) + print run_pipeline(box, values, None) + + #do some test + __test() + diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject index e135b8886..f551380fd 100644 --- a/contrib/other-builds/OnDiskPt/.cproject +++ b/contrib/other-builds/OnDiskPt/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/> - <builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> + <builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug"> <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input"> @@ -133,8 +133,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject index 7529a7799..fc08b4c3d 100644 --- a/contrib/other-builds/extractor/.cproject +++ b/contrib/other-builds/extractor/.cproject @@ -18,11 +18,14 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> <option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/> + <option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath"> + <listOptionValue builtIn="false" value=""${workspace_loc}/../../boost/include""/> + </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> <tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug"> @@ -119,5 +122,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/extractor"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/extractor"/> + </configuration> + </storageModule> + <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject index 2036e6b18..e3e47fd7e 100644 --- a/contrib/other-builds/lm/.cproject +++ b/contrib/other-builds/lm/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/> - <builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> + <builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug"> <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input"> @@ -131,7 +131,14 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/lm"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/lm"/> + </configuration> + </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project index e75388ac1..a1bde37c2 100644 --- a/contrib/other-builds/lm/.project +++ b/contrib/other-builds/lm/.project @@ -142,11 +142,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI> </link> <link> - <name>build_binary.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI> - </link> - <link> <name>clean.sh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/clean.sh</locationURI> @@ -177,11 +172,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI> </link> <link> - <name>fragment.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI> - </link> - <link> <name>left.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/left.hh</locationURI> @@ -212,11 +202,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI> </link> <link> - <name>max_order.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI> - </link> - <link> <name>max_order.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/max_order.hh</locationURI> @@ -242,11 +227,6 @@ <locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI> </link> <link> - <name>ngram_query.cc</name> - 
<type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI> - </link> - <link> <name>ngram_query.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.hh</locationURI> diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject index 41a471cd1..e1c19b822 100644 --- a/contrib/other-builds/mert_lib/.cproject +++ b/contrib/other-builds/mert_lib/.cproject @@ -7,7 +7,7 @@ <externalSetting> <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/> <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/> - <entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/> + <entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/> </externalSetting> </externalSettings> <extensions> @@ -23,13 +23,14 @@ <folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug"> <targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/> - <builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/> + <builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug"> <option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> <option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/> <option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath"> <listOptionValue builtIn="false" value=""${workspace_loc}/../../""/> + <listOptionValue builtIn="false" value=""${workspace_loc}/../../boost/include""/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/> </tool> @@ -45,11 +46,8 @@ </tool> </toolChain> </folderInfo> - <fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460"> - <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/> - </fileInfo> 
<sourceEntries> - <entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/> + <entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/> </sourceEntries> </configuration> </storageModule> @@ -61,7 +59,7 @@ <externalSetting> <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/> <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/> - <entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/> + <entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/> </externalSetting> </externalSettings> <extensions> @@ -119,5 +117,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/mert_lib"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/mert_lib"/> + </configuration> + </storageModule> + <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject index fedda926b..71462b5df 100644 --- a/contrib/other-builds/moses-chart-cmd/.cproject +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -19,7 +19,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -46,6 +46,7 @@ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug"> <option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths"> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib""/> + <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../irstlm/lib""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/macosx""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686-m64""/> @@ -70,9 +71,11 @@ <listOptionValue builtIn="false" value="lm"/> <listOptionValue builtIn="false" value="util"/> <listOptionValue builtIn="false" value="z"/> + <listOptionValue builtIn="false" value="boost_filesystem-mt"/> + <listOptionValue builtIn="false" value="boost_iostreams-mt"/> <listOptionValue builtIn="false" value="boost_system-mt"/> <listOptionValue builtIn="false" value="boost_thread-mt"/> - <listOptionValue builtIn="false" value="boost_filesystem-mt"/> + <listOptionValue builtIn="false" value="bz2"/> <listOptionValue builtIn="false" value="rt"/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input"> @@ -154,8 +157,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/> + </configuration> + <configuration configurationName="Debug"> + <resource 
resourceType="PROJECT" workspacePath="/moses-chart-cmd"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject index 10b6784d4..42d2100d8 100644 --- a/contrib/other-builds/moses-cmd/.cproject +++ b/contrib/other-builds/moses-cmd/.cproject @@ -19,7 +19,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -46,6 +46,8 @@ <tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug"> <option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths"> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../irstlm/lib""/> + <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib""/> + <listOptionValue builtIn="false" value=""${workspace_loc:}/../../boost/lib64""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/macosx""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686-m64""/> <listOptionValue builtIn="false" value=""${workspace_loc:}/../../srilm/lib/i686""/> @@ -69,8 +71,11 @@ <listOptionValue builtIn="false" value="z"/> <listOptionValue builtIn="false" value="boost_system-mt"/> <listOptionValue builtIn="false" value="boost_thread-mt"/> + <listOptionValue builtIn="false" value="boost_iostreams-mt"/> + <listOptionValue builtIn="false" value="boost_filesystem-mt"/> <listOptionValue builtIn="false" value="lm"/> <listOptionValue builtIn="false" value="util"/> + <listOptionValue builtIn="false" value="bz2"/> <listOptionValue builtIn="false" value="rt"/> </option> <inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.983725033" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input"> @@ -155,8 +160,13 @@ <autodiscovery enabled="true" 
problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/moses-cmd"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/moses-cmd"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/moses-cmd"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index e54a1385b..787024533 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -1,7 +1,5 @@ <?xml version="1.0" encoding="UTF-8" standalone="no"?> -<?fileVersion 4.0.0?> - -<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage"> +<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage"> <storageModule moduleId="org.eclipse.cdt.core.settings"> <cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512"> <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug"> @@ -9,7 +7,7 @@ <externalSetting> <entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/> <entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/> - <entry flags="RESOLVED" kind="libraryFile" name="moses"/> + <entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/> </externalSetting> </externalSettings> <extensions> @@ -26,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -152,8 +150,14 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/moses"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/moses"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/moses"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> + <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/search/.cproject b/contrib/other-builds/search/.cproject index 9ccb8f8e9..2de36fecd 100644 --- a/contrib/other-builds/search/.cproject +++ b/contrib/other-builds/search/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/> - <builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> + <builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/> <tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/> <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug"> <option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/> @@ -127,6 +127,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/search"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/search"/> + </configuration> + </storageModule> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> </cproject> diff --git a/contrib/other-builds/search/.project b/contrib/other-builds/search/.project index efad842ea..95f074aae 100644 --- a/contrib/other-builds/search/.project +++ b/contrib/other-builds/search/.project @@ -157,11 +157,6 @@ <locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI> </link> <link> - <name>vertex_generator.cc</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI> - </link> - <link> <name>vertex_generator.hh</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.hh</locationURI> diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject index ab37362a4..2fd4d2dfb 100644 --- a/contrib/other-builds/util/.cproject +++ b/contrib/other-builds/util/.cproject @@ -24,7 +24,7 @@ <folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." 
name="/" resourcePath=""> <toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug"> <targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/> - <builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> + <builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/> <tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug"> <inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input"> @@ -136,8 +136,13 @@ <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/> </scannerConfigBuildInfo> </storageModule> - <storageModule moduleId="refreshScope" versionNumber="1"> - <resource resourceType="PROJECT" workspacePath="/util"/> + <storageModule moduleId="refreshScope" versionNumber="2"> + <configuration configurationName="Release"> + <resource resourceType="PROJECT" workspacePath="/util"/> + </configuration> + <configuration configurationName="Debug"> + <resource resourceType="PROJECT" workspacePath="/util"/> + </configuration> </storageModule> <storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/> <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/> diff --git a/contrib/rpm/README b/contrib/rpm/README new file mode 100644 index 000000000..8ba7ef4da --- /dev/null +++ b/contrib/rpm/README @@ -0,0 +1,42 @@ +Building Moses RPM +================== + +*** WARNING *** +Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer. +*** WARNING *** + + +Building the RPM SPEC file +-------------------------- + +The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build. This script needs the following information: + + - The Git repository from which an installer will be built, + - The branch in the Git repository to build, and + - The version of the installed Moses distribution. + +For example, to build the RELEASE-1.0 branch in the mosesdecode repository (git://github.com/moses-smt/mosesdecoder.git): + +$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELASE-1.0 -v 1.0 + +This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS. 
+ + +Building the RPM +---------------- + +Change directory to $HOME/rpmbuild and build the binary RPM with: + +$ rpmbuild -bb SPECS/moses.spec + +This will download IRSTLM v5.70.04 and GIZA++ v2, then build them along with Moses and write the RPM to $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm. + +For example, building v1.0 on a 64-bit Intel architecture, the RPM would be called moses-1.0-1.x86_64.rpm. + + +Building a Debian package +------------------------- + +The Alien tool converts RPM packages to Debian packages. If a Debian package is required, follow the instructions on this web page: + +https://help.ubuntu.com/community/RPM/AlienHowto diff --git a/contrib/rpm/build_source.sh b/contrib/rpm/build_source.sh new file mode 100755 index 000000000..d0fac6a33 --- /dev/null +++ b/contrib/rpm/build_source.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +BRANCH="master" +declare -i NO_RPM_BUILD=0 +declare -r RPM_VERSION_TAG="___RPM_VERSION__" + +function usage() { + echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]" + exit 1 +} + +if [ $# -lt 4 ]; then + usage +fi + +while getopts r:b:v:nh OPTION +do + case "$OPTION" in + r) REPO="${OPTARG}";; + b) BRANCH="${OPTARG}";; + v) VERSION="${OPTARG}";; + n) NO_RPM_BUILD=1;; + [h\?]) usage;; + esac +done + +if [ ! -d ./rpmbuild ]; then + echo "RPM build directory not in current working directory" + exit 1 +fi + +declare -r MOSES_DIR="moses-${VERSION}" +git clone ${REPO} ${MOSES_DIR} +if [ $? -ne 0 ]; then + echo "Failed to clone Git repository ${REPO}" + exit 3 +fi + +cd ${MOSES_DIR} + +git checkout ${BRANCH} +if [ $? -ne 0 ]; then + echo "Failed to checkout branch ${BRANCH}" + exit 3 +fi + +cd .. + +tar -cf moses-${VERSION}.tar ${MOSES_DIR} +gzip -f9 moses-${VERSION}.tar + +if [ ${NO_RPM_BUILD} -eq 0 ]; then + if [ ! -d ${HOME}/rpmbuild/SPECS ]; then + mkdir -p ${HOME}/rpmbuild/SPECS + fi + eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec + if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then + mkdir -p ${HOME}/rpmbuild/SOURCES + fi + mv moses-${VERSION}.tar.gz ${HOME}/rpmbuild/SOURCES +fi + +rm -Rf ${MOSES_DIR} diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec new file mode 100644 index 000000000..0f4a6c6ec --- /dev/null +++ b/contrib/rpm/rpmbuild/SPECS/moses.spec @@ -0,0 +1,65 @@ +Name: moses +Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. +Version: ___RPM_VERSION__ +Release: 1 +URL: http://www.statmt.org/moses/ +Source0: %{name}-%{version}.tar.gz +License: LGPL +Group: Development/Tools +Vendor: Capita Translation and Interpreting +Packager: Ian Johnson <ian.johnson@capita-ti.com> +Requires: boost >= 1.48, python >= 2.6, perl >= 5 +BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release} +%description +Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm quickly finds the highest-probability translation among the exponential number of choices. 
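The %prep stage that follows fetches IRSTLM 5.70.04 and GIZA++ 1.0.7 from the moses-suite Google Code downloads and builds them under the RPM build root before Moses itself is compiled in %build. A minimal Python sketch of just the download-and-unpack step, using the same URLs as the spec; the fetch_and_unpack helper is an assumption for illustration:

import os
import tarfile
import urllib

SOURCES = {
    'irstlm-5.70.04.tgz': 'http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz',
    'giza-pp-v1.0.7.tgz': 'http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz',
}

def fetch_and_unpack(build_dir):
    # Mirror the wget and tar calls in %prep: download each tool tarball
    # into the build directory and extract it in place.
    for filename, url in SOURCES.items():
        path = os.path.join(build_dir, filename)
        urllib.urlretrieve(url, path)
        tarfile.open(path, 'r:gz').extractall(build_dir)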
+%prep +%setup -q + +mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 + +wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz +wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz + +cd $RPM_BUILD_DIR + +tar -zxf irstlm-5.70.04.tgz +tar -zxf giza-pp-v1.0.7.tgz + +cd irstlm-5.70.04 +bash regenerate-makefiles.sh --force +./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 +make +make install + +cd ../giza-pp +make +cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 +%build +./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2 +%install +mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts +cp -R bin $RPM_BUILD_ROOT/opt/moses +cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts +cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts +%clean +%files +%defattr(-,root,root) +/opt/moses/bin/* +/opt/moses/scripts/analysis/* +/opt/moses/scripts/ems/* +/opt/moses/scripts/generic/* +/opt/moses/scripts/other/* +/opt/moses/scripts/recaser/* +/opt/moses/scripts/regression-testing/* +/opt/moses/scripts/share/* +/opt/moses/scripts/tokenizer/* +/opt/moses/scripts/training/* +/opt/moses/irstlm-5.70.04/* +/opt/moses/giza++-v1.0.7/* diff --git a/contrib/server/client.py b/contrib/server/client.py new file mode 100755 index 000000000..43e77555a --- /dev/null +++ b/contrib/server/client.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# python port of client.perl + +import xmlrpclib +import datetime + +url = "http://localhost:8080/RPC2" +proxy = xmlrpclib.ServerProxy(url) + +text = u"il a souhaité que la présidence trace à nice le chemin pour l' avenir ." 
+params = {"text":text, "align":"true", "report-all-factors":"true"} + +result = proxy.translate(params) +print result['text'] +if 'align' in result: + print "Phrase alignments:" + aligns = result['align'] + for align in aligns: + print "%s,%s,%s" %(align['tgt-start'], align['src-start'], align['src-end']) diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 98024c891..5d9c40a9b 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -1,6 +1,8 @@ #include "util/check.hh" #include <stdexcept> #include <iostream> +#include <vector> +#include <algorithm> #include "moses/ChartManager.h" @@ -54,7 +56,7 @@ public: PhraseDictionaryDynSuffixArray* pdsa = (PhraseDictionaryDynSuffixArray*) pdf->GetDictionary(); cerr << "Inserting into address " << pdsa << endl; pdsa->insertSnt(source_, target_, alignment_); - if(add2ORLM_) { + if(add2ORLM_) { updateORLM(); } cerr << "Done inserting\n"; @@ -83,8 +85,8 @@ public: const std::string sBOS = orlm->GetSentenceStart()->GetString(); const std::string sEOS = orlm->GetSentenceEnd()->GetString(); Utils::splitToStr(target_, vl, " "); - // insert BOS and EOS - vl.insert(vl.begin(), sBOS); + // insert BOS and EOS + vl.insert(vl.begin(), sBOS); vl.insert(vl.end(), sEOS); for(int j=0; j < vl.size(); ++j) { int i = (j<ngOrder) ? 0 : j-ngOrder+1; @@ -177,7 +179,7 @@ public: map<string, xmlrpc_c::value> retData; if (staticData.IsChart()) { - TreeInput tinput; + TreeInput tinput; const vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder(); stringstream in(source + "\n"); @@ -260,10 +262,16 @@ public: } + + bool compareSearchGraphNode(const SearchGraphNode& a, const SearchGraphNode b) { + return a.hypo->GetId() < b.hypo->GetId(); + } + void insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData) { vector<xmlrpc_c::value> searchGraphXml; vector<SearchGraphNode> searchGraph; manager.GetSearchGraph(searchGraph); + std::sort(searchGraph.begin(), searchGraph.end()); for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin(); i != searchGraph.end(); ++i) { map<string, xmlrpc_c::value> searchGraphXmlNode; searchGraphXmlNode["forward"] = xmlrpc_c::value_double(i->forward); diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp index f06d2b430..6ab1a5657 100644 --- a/contrib/sigtest-filter/filter-pt.cpp +++ b/contrib/sigtest-filter/filter-pt.cpp @@ -287,24 +287,24 @@ SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicati if (hierarchical) { // std::cerr << "splitting up phrase: " << phrase << "\n"; int pos = 0; - int endPos = 0; + int NTStartPos, NTEndPos; vector<std::string> phrases; - - while (rule.find("[X][X] ", pos) < rule.size()) { - endPos = rule.find("[X][X] ",pos) - 1; // -1 to cut space before NT - if (endPos < pos) { // no space: NT at start of rule (or two consecutive NTs) - pos += 7; + while (rule.find("] ", pos) < rule.size()) { + NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT + NTEndPos = rule.find("] ",pos); + if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs) + pos = NTEndPos + 2; continue; } - phrases.push_back(rule.substr(pos,endPos-pos)); - pos = endPos + 8; + phrases.push_back(rule.substr(pos,NTStartPos-pos)); + pos = NTEndPos + 2; } - // cut LHS of rule - endPos = rule.size()-4; - if (endPos > pos) { - phrases.push_back(rule.substr(pos,endPos-pos)); + NTStartPos = rule.find("[",pos) - 1; // LHS of rule + if (NTStartPos > pos) { + 
phrases.push_back(rule.substr(pos,NTStartPos-pos)); } + sa_set = lookup_multiple_phrases(phrases, my_sa, rule, cache); + } else { diff --git a/contrib/tmcombine/README.md b/contrib/tmcombine/README.md index 2cbc83299..7b8ebd45e 100644 --- a/contrib/tmcombine/README.md +++ b/contrib/tmcombine/README.md @@ -58,7 +58,7 @@ Regression tests (check if the output files (`test/phrase-table_testN`) differ f FURTHER NOTES ------------- - - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models. + - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models. - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). Sort the tables with `LC_ALL=C`. Phrase tables produced by Moses are sorted correctly. diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py index 0bbcf7c78..5b65cc590 100755 --- a/contrib/tmcombine/tmcombine.py +++ b/contrib/tmcombine/tmcombine.py @@ -15,7 +15,7 @@ # Some general things to note: -# - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models. +# - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models. # - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). sort with LC_ALL=C. # - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007). # - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files. @@ -306,7 +306,7 @@ class Moses(): # assuming that alignment is empty elif len(line) == 4: if self.require_alignment: - sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment\n') + sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. 
Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n') exit() self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')] diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp index e610cbdd0..af3f26bf2 100644 --- a/mert/InterpolatedScorer.cpp +++ b/mert/InterpolatedScorer.cpp @@ -164,7 +164,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats { stringstream buff; string align = text; - string sentence = ""; + string sentence = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp index 09e06fcf6..b65873881 100644 --- a/moses-chart-cmd/IOWrapper.cpp +++ b/moses-chart-cmd/IOWrapper.cpp @@ -620,10 +620,27 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size) template <class T> void ShiftOffsets(vector<T> &offsets, T shift) { + T currPos = shift; for (size_t i = 0; i < offsets.size(); ++i) { - shift += offsets[i]; - offsets[i] += shift; + if (offsets[i] == 0) { + offsets[i] = currPos; + ++currPos; + } + else { + currPos += offsets[i]; + } + } +} + +size_t CalcSourceSize(const Moses::ChartHypothesis *hypo) +{ + size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered(); + const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos(); + for (size_t i = 0; i < prevHypos.size(); ++i) { + size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered(); + ret -= (childSize - 1); } + return ret; } size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget) @@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT const TargetPhrase &tp = hypo->GetCurrTargetPhrase(); - vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0); + size_t thisSourceSize = CalcSourceSize(hypo); + + // position of each terminal word in translation rule, irrespective of alignment + // if non-term, number is undefined + vector<size_t> sourceOffsets(thisSourceSize, 0); vector<size_t> targetOffsets(tp.GetSize(), 0); const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren(); @@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT const ChartTrellisNode &prevNode = *prevNodes[sourceInd]; - // 1st. calc source size + // calc source size size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered(); sourceOffsets[sourcePos] = sourceSize; - // 2nd. calc target size. Recursively look thru child hypos + // calc target size. + // Recursively look thru child hypos size_t currStartTarget = startTarget + totalTargetSize; size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget); targetOffsets[targetPos] = targetSize; @@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT } } - // 3rd. 
shift offsets + // convert position within translation rule to absolute position within + // source sentence / output sentence ShiftOffsets(sourceOffsets, startSource); ShiftOffsets(targetOffsets, startTarget); // get alignments from this hypo - vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered()); const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm(); - OutputAlignment(retAlignmentsS2T, aiTerm); // add to output arg, offsetting by source & target - for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) { - const set<size_t> &targets = retAlignmentsS2T[source]; - set<size_t>::const_iterator iter; - for (iter = targets.begin(); iter != targets.end(); ++iter) { - size_t target = *iter; - pair<size_t, size_t> alignPoint(source + sourceOffsets[source] - ,target + targetOffsets[target]); - pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); - CHECK(ret.second); - - } + AlignmentInfo::const_iterator iter; + for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) { + const std::pair<size_t,size_t> &align = *iter; + size_t relSource = align.first; + size_t relTarget = align.second; + size_t absSource = sourceOffsets[relSource]; + size_t absTarget = targetOffsets[relTarget]; + + pair<size_t, size_t> alignPoint(absSource, absTarget); + pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); + CHECK(ret.second); } return totalTargetSize; @@ -702,14 +723,16 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe { ostringstream out; - Alignments retAlign; - OutputAlignment(retAlign, hypo, 0); + if (hypo) { + Alignments retAlign; + OutputAlignment(retAlign, hypo, 0); - // output alignments - Alignments::const_iterator iter; - for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { - const pair<size_t, size_t> &alignPoint = *iter; - out << alignPoint.first << "-" << alignPoint.second << " "; + // output alignments + Alignments::const_iterator iter; + for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) { + const pair<size_t, size_t> &alignPoint = *iter; + out << alignPoint.first << "-" << alignPoint.second << " "; + } } out << endl; @@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth const TargetPhrase &tp = hypo->GetCurrTargetPhrase(); - vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0); + size_t thisSourceSize = CalcSourceSize(hypo); + + // position of each terminal word in translation rule, irrespective of alignment + // if non-term, number is undefined + vector<size_t> sourceOffsets(thisSourceSize, 0); vector<size_t> targetOffsets(tp.GetSize(), 0); const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos(); @@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth const ChartHypothesis *prevHypo = prevHypos[sourceInd]; - // 1st. calc source size + // calc source size size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered(); sourceOffsets[sourcePos] = sourceSize; - // 2nd. calc target size. Recursively look thru child hypos + // calc target size. + // Recursively look thru child hypos size_t currStartTarget = startTarget + totalTargetSize; size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget); targetOffsets[targetPos] = targetSize; @@ -760,27 +788,27 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth } } - // 3rd. 
shift offsets + // convert position within translation rule to absolute position within + // source sentence / output sentence ShiftOffsets(sourceOffsets, startSource); ShiftOffsets(targetOffsets, startTarget); // get alignments from this hypo - vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered()); const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm(); - OutputAlignment(retAlignmentsS2T, aiTerm); // add to output arg, offsetting by source & target - for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) { - const set<size_t> &targets = retAlignmentsS2T[source]; - set<size_t>::const_iterator iter; - for (iter = targets.begin(); iter != targets.end(); ++iter) { - size_t target = *iter; - pair<size_t, size_t> alignPoint(source + sourceOffsets[source] - ,target + targetOffsets[target]); - pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); - CHECK(ret.second); + AlignmentInfo::const_iterator iter; + for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) { + const std::pair<size_t,size_t> &align = *iter; + size_t relSource = align.first; + size_t relTarget = align.second; + size_t absSource = sourceOffsets[relSource]; + size_t absTarget = targetOffsets[relTarget]; + + pair<size_t, size_t> alignPoint(absSource, absTarget); + pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint); + CHECK(ret.second); - } } return totalTargetSize; diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp index f11516839..335a570a6 100644 --- a/moses-cmd/IOWrapper.cpp +++ b/moses-cmd/IOWrapper.cpp @@ -262,6 +262,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges) out << std::endl; } +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo) +{ + std::vector<const Hypothesis *> edges; + const Hypothesis *currentHypo = hypo; + while (currentHypo) { + edges.push_back(currentHypo); + currentHypo = currentHypo->GetPrevHypo(); + } + + OutputAlignment(out, edges); + +} + void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges) { ostringstream out; diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h index 8f164dfb3..8dbdeda9c 100644 --- a/moses-cmd/IOWrapper.h +++ b/moses-cmd/IOWrapper.h @@ -137,7 +137,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo); void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path); - +void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo); } diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile index 04f395a81..bddc10911 100644 --- a/moses-cmd/Jamfile +++ b/moses-cmd/Jamfile @@ -1,4 +1,4 @@ -alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ../moses//moses ; +alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ; exe moses : Main.cpp deps ; exe lmbrgrid : LatticeMBRGrid.cpp deps ; diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp index ac4527aae..b08ba532a 100644 --- a/moses-cmd/Main.cpp +++ b/moses-cmd/Main.cpp @@ -23,6 +23,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Moses main, for single-threaded and multi-threaded. 
**/ +#include <boost/algorithm/string/predicate.hpp> +#include <boost/filesystem.hpp> +#include <boost/iostreams/device/file.hpp> +#include <boost/iostreams/filter/bzip2.hpp> +#include <boost/iostreams/filter/gzip.hpp> +#include <boost/iostreams/filtering_stream.hpp> + #include <exception> #include <fstream> #include <sstream> @@ -83,14 +90,18 @@ public: OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector, OutputCollector* detailedTranslationCollector, OutputCollector* alignmentInfoCollector, - OutputCollector* unknownsCollector) : + OutputCollector* unknownsCollector, + bool outputSearchGraphSLF, + bool outputSearchGraphHypergraph) : m_source(source), m_lineNumber(lineNumber), m_outputCollector(outputCollector), m_nbestCollector(nbestCollector), m_latticeSamplesCollector(latticeSamplesCollector), m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector), m_detailedTranslationCollector(detailedTranslationCollector), m_alignmentInfoCollector(alignmentInfoCollector), - m_unknownsCollector(unknownsCollector) {} + m_unknownsCollector(unknownsCollector), + m_outputSearchGraphSLF(outputSearchGraphSLF), + m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {} /** Translate one sentence * gets called by main function implemented at end of this source file */ @@ -143,6 +154,96 @@ public: #endif } + // Output search graph in HTK standard lattice format (SLF) + if (m_outputSearchGraphSLF) { + stringstream fileName; + fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf"; + std::ofstream *file = new std::ofstream; + file->open(fileName.str().c_str()); + if (file->is_open() && file->good()) { + ostringstream out; + fix(out,PRECISION); + manager.OutputSearchGraphAsSLF(m_lineNumber, out); + *file << out.str(); + file -> flush(); + } else { + TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl); + } + } + + // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder + if (m_outputSearchGraphHypergraph) { + + vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph"); + + bool appendSuffix; + if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") { + appendSuffix = true; + } else { + appendSuffix = false; + } + + string compression; + if (hypergraphParameters.size() > 1) { + compression = hypergraphParameters[1]; + } else { + compression = "txt"; + } + + string hypergraphDir; + if ( hypergraphParameters.size() > 2 ) { + hypergraphDir = hypergraphParameters[2]; + } else { + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { + boost::filesystem::path nbestPath(nbestFile); + hypergraphDir = nbestPath.parent_path().filename().native(); + } else { + stringstream hypergraphDirName; + hypergraphDirName << boost::filesystem::current_path() << "/hypergraph"; + hypergraphDir = hypergraphDirName.str(); + } + } + + if ( ! boost::filesystem::exists(hypergraphDir) ) { + boost::filesystem::create_directory(hypergraphDir); + } + + if ( ! boost::filesystem::exists(hypergraphDir) ) { + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl); + } else if ( ! 
boost::filesystem::is_directory(hypergraphDir) ) { + TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl); + } else { + stringstream fileName; + fileName << hypergraphDir << "/" << m_lineNumber; + if ( appendSuffix ) { + fileName << "." << compression; + } + boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream; + + if ( compression == "gz" ) { + file->push( boost::iostreams::gzip_compressor() ); + } else if ( compression == "bz2" ) { + file->push( boost::iostreams::bzip2_compressor() ); + } else if ( compression != "txt" ) { + TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl); + compression = "txt"; + } + + file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) ); + + if (file->is_complete() && file->good()) { + fix(*file,PRECISION); + manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file); + file -> flush(); + } else { + TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl); + } + file -> pop(); + delete file; + } + } + // apply decision rule and output best translation(s) if (m_outputCollector) { ostringstream out; @@ -157,7 +258,7 @@ public: // MAP decoding: best hypothesis const Hypothesis* bestHypo = NULL; if (!staticData.UseMBR()) - { + { bestHypo = manager.GetBestHypothesis(); if (bestHypo) { if (staticData.IsPathRecoveryEnabled()) { @@ -174,13 +275,18 @@ public: staticData.GetOutputFactorOrder(), staticData.GetReportSegmentation(), staticData.GetReportAllFactors()); + if (staticData.PrintAlignmentInfo()) { + out << "||| "; + OutputAlignment(out, bestHypo); + } + OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo); IFVERBOSE(1) { debug << "BEST TRANSLATION: " << *bestHypo << endl; } } out << endl; - } + } // MBR decoding (n-best MBR, lattice MBR, consensus) else @@ -311,6 +417,8 @@ private: OutputCollector* m_detailedTranslationCollector; OutputCollector* m_alignmentInfoCollector; OutputCollector* m_unknownsCollector; + bool m_outputSearchGraphSLF; + bool m_outputSearchGraphHypergraph; std::ofstream *m_alignmentStream; @@ -323,7 +431,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff) vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); for (size_t i = 0; i < numScoreComps; ++i) cout << ff->GetScoreProducerDescription() << " " - << ff->GetScoreProducerWeightShortName() << " " + << ff->GetScoreProducerWeightShortName(i) << " " << values[i] << endl; } else { @@ -367,6 +475,63 @@ static void ShowWeights() } +size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) +{ + size_t numScoreComps = ff->GetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << i + << "=" << values[i] << endl; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() + << "=" << values[0] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl; + assert(false); + return 0; + } +} + +void 
OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream); + } + +} + + } //namespace /** main function of the command line version of the decoder **/ @@ -391,20 +556,20 @@ int main(int argc, char** argv) // load all the settings into the Parameter class // (stores them as strings, or array of strings) - Parameter* params = new Parameter(); - if (!params->LoadParam(argc,argv)) { + Parameter params; + if (!params.LoadParam(argc,argv)) { exit(1); } // initialize all "global" variables, which are stored in StaticData // note: this also loads models such as the language model, etc. - if (!StaticData::LoadDataStatic(params, argv[0])) { + if (!StaticData::LoadDataStatic(&params, argv[0])) { exit(1); } // setting "-show-weights" -> just dump out weights and exit - if (params->isParamSpecified("show-weights")) { + if (params.isParamSpecified("show-weights")) { ShowWeights(); exit(0); } @@ -430,6 +595,32 @@ int main(int argc, char** argv) TRACE_ERR(weights); TRACE_ERR("\n"); } + if (staticData.GetOutputSearchGraphHypergraph()) { + ofstream* weightsOut = new std::ofstream; + stringstream weightsFilename; + if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) { + weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3]; + } else { + string nbestFile = staticData.GetNBestFilePath(); + if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) { + boost::filesystem::path nbestPath(nbestFile); + weightsFilename << nbestPath.parent_path().filename() << "/weights"; + } else { + weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights"; + } + } + boost::filesystem::path weightsFilePath(weightsFilename.str()); + if ( !
boost::filesystem::exists(weightsFilePath.parent_path()) ) { + boost::filesystem::create_directory(weightsFilePath.parent_path()); + } + TRACE_ERR("The weights file is " << weightsFilename.str() << "\n"); + weightsOut->open(weightsFilename.str().c_str()); + OutputFeatureWeightsForHypergraph(*weightsOut); + weightsOut->flush(); + weightsOut->close(); + delete weightsOut; + } + // initialize output streams // note: we can't just write to STDOUT or files @@ -533,7 +724,9 @@ int main(int argc, char** argv) searchGraphCollector.get(), detailedTranslationCollector.get(), alignmentInfoCollector.get(), - unknownsCollector.get() ); + unknownsCollector.get(), + staticData.GetOutputSearchGraphSLF(), + staticData.GetOutputSearchGraphHypergraph()); // execute task #ifdef WITH_THREADS pool.Submit(task); @@ -551,6 +744,8 @@ int main(int argc, char** argv) pool.Stop(true); //flush remaining jobs #endif + delete ioWrapper; + } catch (const std::exception &e) { std::cerr << "Exception: " << e.what() << std::endl; return EXIT_FAILURE; diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp index 5daba9ba1..53b83d8cd 100644 --- a/moses/AlignmentInfoCollection.cpp +++ b/moses/AlignmentInfoCollection.cpp @@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection() m_emptyAlignmentInfo = Add(pairs); } +AlignmentInfoCollection::~AlignmentInfoCollection() +{} + const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const { return *m_emptyAlignmentInfo; diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h index 9c7f75e13..de0949f8f 100644 --- a/moses/AlignmentInfoCollection.h +++ b/moses/AlignmentInfoCollection.h @@ -55,6 +55,7 @@ class AlignmentInfoCollection //! Only a single static variable should be created. 
AlignmentInfoCollection(); + ~AlignmentInfoCollection(); static AlignmentInfoCollection s_instance; diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp index 506193d5b..5bd3a4e2b 100644 --- a/moses/Hypothesis.cpp +++ b/moses/Hypothesis.cpp @@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList() */ const StaticData &staticData = StaticData::Instance(); size_t nBestSize = staticData.GetNBestSize(); - bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ; + bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ; if (!distinctNBest && m_arcList->size() > nBestSize * 5) { // prune arc list only if there too many arcs diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp index 3418aefe2..c061d0fed 100644 --- a/moses/LM/SingleFactor.cpp +++ b/moses/LM/SingleFactor.cpp @@ -36,8 +36,9 @@ using namespace std; namespace Moses { -LanguageModelSingleFactor::~LanguageModelSingleFactor() {} - +LanguageModelSingleFactor::~LanguageModelSingleFactor() +{ +} struct PointerState : public FFState { const void* lmstate; @@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState() m_beginSentenceState = new PointerState(NULL); } -LanguageModelPointerState::~LanguageModelPointerState() {} +LanguageModelPointerState::~LanguageModelPointerState() +{ + delete m_nullContextState; + delete m_beginSentenceState; +} const FFState *LanguageModelPointerState::GetNullContextState() const { diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 468db0de3..2ca689bb0 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #endif #include <algorithm> -#include <limits> #include <cmath> +#include <limits> +#include <map> +#include <set> #include "Manager.h" #include "TypeDef.h" #include "Util.h" @@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "rule.pb.h" #endif +#include "util/exception.hh" + using namespace std; namespace Moses { Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system) - :m_lineNumber(lineNumber) - ,m_system(system) + :m_system(system) ,m_transOptColl(source.CreateTranslationOptionCollection(system)) ,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl)) ,interrupted_flag(0) ,m_hypoId(0) + ,m_lineNumber(lineNumber) ,m_source(source) { m_system->InitializeBeforeSentenceProcessing(source); @@ -628,6 +632,435 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const } +void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i 
< slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream); + } + +} + +void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + // outputSearchGraphStream << endl; + // outputSearchGraphStream << (*hypo) << endl; + // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); + // outputSearchGraphStream << scoreCollection << endl; + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream); + } + +} + +void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const +{ + outputSearchGraphStream.setf(std::ios::fixed); + outputSearchGraphStream.precision(6); + + const StaticData& staticData = StaticData::Instance(); + const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT); + const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions(); + const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions(); + size_t featureIndex = 1; + for (size_t i = 0; i < sff.size(); ++i) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream); + } + for (size_t i = 0; i < slf.size(); ++i) { + if (slf[i]->GetScoreProducerWeightShortName() != "u" && + slf[i]->GetScoreProducerWeightShortName() != "tm" && + slf[i]->GetScoreProducerWeightShortName() != "I" && + 
slf[i]->GetScoreProducerWeightShortName() != "g") + { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream); + } + } + const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries(); + for( size_t i=0; i<pds.size(); i++ ) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream); + } + const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries(); + for( size_t i=0; i<gds.size(); i++ ) { + featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream); + } + +} + + +size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + size_t numScoreComps = ff->GetNumScoreComponents(); + if (numScoreComps != ScoreProducer::unlimited) { + vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "# " << ff->GetScoreProducerDescription() + << " " << ff->GetScoreProducerWeightShortName() + << " " << (i+1) << " of " << numScoreComps << endl + << "x" << (index+i) << "scale=" << values[i] << endl; + } + return index+numScoreComps; + } else { + cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + assert(false); + return 0; + } +} + +size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + + // { const FeatureFunction* sp = ff; + // const FVector& m_scores = scoreCollection.GetScoresVector(); + // FVector& scores = const_cast<FVector&>(m_scores); + // std::string prefix = sp->GetScoreProducerDescription() + FName::SEP; + // // std::cout << "prefix==" << prefix << endl; + // // cout << "m_scores==" << m_scores << endl; + // // cout << "m_scores.size()==" << m_scores.size() << endl; + // // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl; + // // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl; + + + // // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) { + // // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl; + // // } + // for(int i=0, n=v.size(); i<n; i+=1) { + // // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl; + + // } + // } + + // FVector featureValues = scoreCollection.GetVectorForProducer(ff); + // outputSearchGraphStream << featureValues << endl; + const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); + + vector<float> featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size();//featureValues.coreSize(); + // if (numScoreComps != ScoreProducer::unlimited) { + // vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff); + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 
0.0 : featureValues[i]) << " "; + } + return index+numScoreComps; + // } else { + // cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl; + // assert(false); + // return 0; + // } +} + +size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const +{ + ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); + const Hypothesis *prevHypo = hypo->GetPrevHypo(); + if (prevHypo) { + scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() ); + } + vector<float> featureValues = scoreCollection.GetScoresForProducer(ff); + size_t numScoreComps = featureValues.size(); + + if (numScoreComps > 1) { + for (size_t i = 0; i < numScoreComps; ++i) { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " "; + } + } else { + outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " "; + } + + return index+numScoreComps; +} + +/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */ +void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const +{ + + VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << translationId << std::endl) + + vector<SearchGraphNode> searchGraph; + GetSearchGraph(searchGraph); + + + map<int,int> mosesIDToHypergraphID; + // map<int,int> hypergraphIDToMosesID; + set<int> terminalNodes; + multimap<int,int> hypergraphIDToArcs; + + VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << translationId << std::endl) + + long numNodes = 0; + long endNode = 0; + { + long hypergraphHypothesisID = 0; + for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) { + + // Get an id number for the previous hypothesis + const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo(); + if (prevHypo!=NULL) { + int mosesPrevHypothesisID = prevHypo->GetId(); + if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) { + mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID; + hypergraphHypothesisID += 1; + } + } + + // Get an id number for this hypothesis + int mosesHypothesisID; + if (searchGraph[arcNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[arcNumber].hypo->GetId(); + } + + if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) { + + mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID; + // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + // Final arc to end node, representing the end of the sentence </s> + terminalNodes.insert(hypergraphHypothesisID); + } + + hypergraphHypothesisID += 1; + } + + // Record that this arc ends at this node + hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber)); + + } + + // Unique end node + endNode = hypergraphHypothesisID; + // mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID; + numNodes = endNode + 1; + + } + + + long numArcs = searchGraph.size() + terminalNodes.size(); + + // Print number of nodes and arcs + outputSearchGraphStream << numNodes << " " << numArcs << endl; + + 
VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId + << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl) + + VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl) + + + for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) { + if (hypergraphHypothesisID % 100000 == 0) { + VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << translationId << std::endl); + } + // int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID]; + size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID); + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has " << count << " incoming arcs" << std::endl) + if (count > 0) { + outputSearchGraphStream << count << "\n"; + + pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range = + hypergraphIDToArcs.equal_range(hypergraphHypothesisID); + for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) { + int lineNumber = (*it).second; + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + int mosesHypothesisID;// = thisHypo->GetId(); + if (searchGraph[lineNumber].recombinationHypo) { + mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId(); + } else { + mosesHypothesisID = searchGraph[lineNumber].hypo->GetId(); + } + // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID]; + UTIL_THROW_IF( + (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]), + util::Exception, + "Error while writing search lattice as hypergraph for sentence " << translationId << ". " << + "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID << + ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] << + ". There are " << numNodes << " nodes in the search lattice." + ); + + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo==NULL) { + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl) + outputSearchGraphStream << "<s> ||| \n"; + } else { + int startNode = mosesIDToHypergraphID[prevHypo->GetId()]; + // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl) + UTIL_THROW_IF( + (startNode >= hypergraphHypothesisID), + util::Exception, + "Error while writing search lattice as hypergraph for sentence" << translationId << ". " << + "The nodes must be output in topological order. The code attempted to violate this restriction." + ); + + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + outputSearchGraphStream << "[" << startNode << "]"; + for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) { + outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex); + } + outputSearchGraphStream << " ||| "; + OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream); + outputSearchGraphStream << "\n"; + } + } + } + } + + // Print node and arc(s) for end of sentence </s> + outputSearchGraphStream << terminalNodes.size() << "\n"; + for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) { + outputSearchGraphStream << "[" << (*it) << "] </s> ||| \n"; + } + +} + + +/**! 
Output search graph in HTK standard lattice format (SLF) */ +void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const +{ + + vector<SearchGraphNode> searchGraph; + GetSearchGraph(searchGraph); + + long numArcs = 0; + long numNodes = 0; + + map<int,int> nodes; + set<int> terminalNodes; + + // Unique start node + nodes[0] = 0; + + for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) { + + int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize(); + numArcs += targetWordCount; + + int hypothesisID = searchGraph[arcNumber].hypo->GetId(); + if (nodes.count(hypothesisID) == 0) { + + numNodes += targetWordCount; + nodes[hypothesisID] = numNodes; + //numNodes += 1; + + bool terminalNode = (searchGraph[arcNumber].forward == -1); + if (terminalNode) { + numArcs += 1; + } + } + + } + numNodes += 1; + + // Unique end node + nodes[numNodes] = numNodes; + + outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl; + outputSearchGraphStream << "VERSION=1.1" << endl; + outputSearchGraphStream << "base=2.71828182845905" << endl; + outputSearchGraphStream << "NODES=" << (numNodes+1) << endl; + outputSearchGraphStream << "LINKS=" << numArcs << endl; + + OutputFeatureWeightsForSLF(outputSearchGraphStream); + + for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) { + const Hypothesis *thisHypo = searchGraph[lineNumber].hypo; + const Hypothesis *prevHypo = thisHypo->GetPrevHypo(); + if (prevHypo) { + + int startNode = nodes[prevHypo->GetId()]; + int endNode = nodes[thisHypo->GetId()]; + bool terminalNode = (searchGraph[lineNumber].forward == -1); + const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase(); + int targetWordCount = targetPhrase.GetSize(); + + for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) { + int x = (targetWordCount-targetWordIndex); + + outputSearchGraphStream << "J=" << arcNumber; + + if (targetWordIndex==0) { + outputSearchGraphStream << " S=" << startNode; + } else { + outputSearchGraphStream << " S=" << endNode - x; + } + + outputSearchGraphStream << " E=" << endNode - (x-1) + << " W=" << targetPhrase.GetWord(targetWordIndex); + + OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream); + + outputSearchGraphStream << endl; + + arcNumber += 1; + } + + if (terminalNode && terminalNodes.count(endNode) == 0) { + terminalNodes.insert(endNode); + outputSearchGraphStream << "J=" << arcNumber + << " S=" << endNode + << " E=" << numNodes + << endl; + arcNumber += 1; + } + } + } + +} + void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream, const SearchGraphNode& searchNode) { diff --git a/moses/Manager.h b/moses/Manager.h index dd011bc84..11762ec37 100644 --- a/moses/Manager.h +++ b/moses/Manager.h @@ -56,6 +56,10 @@ struct SearchGraphNode { hypo(theHypo), recombinationHypo(theRecombinationHypo), forward(theForward), fscore(theFscore) {} + bool operator<(const SearchGraphNode& sgn) const { + return this->hypo->GetId() < sgn.hypo->GetId(); + } + }; /** The Manager class implements a stack decoding algorithm for phrase-based decoding @@ -93,6 +97,19 @@ class Manager Manager(Manager const&); void operator=(Manager const&); const TranslationSystem* m_system; +private: + + // Helper functions to output search graph in HTK standard lattice format + void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureWeightsForSLF(size_t 
index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + + // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder + void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const; + size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const; + + protected: // data // InputType const& m_source; /**< source sentence to be translated */ @@ -103,6 +120,7 @@ protected: size_t interrupted_flag; std::auto_ptr<SentenceStats> m_sentenceStats; int m_hypoId; //used to number the hypos as they are created. + size_t m_lineNumber; void GetConnectedGraph( std::map< int, bool >* pConnected, @@ -113,7 +131,6 @@ protected: public: - size_t m_lineNumber; InputType const& m_source; /**< source sentence to be translated */ Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system); ~Manager(); @@ -137,6 +154,8 @@ public: #endif void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const; + void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const; + void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const; void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const; const InputType& GetSource() const { return m_source; diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h index 25131b98a..5680b8ecb 100644 --- a/moses/PDTAimp.h +++ b/moses/PDTAimp.h @@ -11,6 +11,7 @@ #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" #include "SparsePhraseDictionaryFeature.h" #include "Util.h" +#include "util/tokenize_piece.hh" namespace Moses { @@ -284,11 +285,10 @@ protected: FactorCollection &factorCollection = FactorCollection::Instance(); for(size_t k=0; k<factorStrings.size(); ++k) { - std::vector<std::string> factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter()); - CHECK(factors.size()==m_output.size()); + util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter()); Word& w=targetPhrase.AddWord(); - for(size_t l=0; l<m_output.size(); ++l) { - w[m_output[l]]= factorCollection.AddFactor(Output, m_output[l], factors[l]); + for(size_t l=0; l<m_output.size(); ++l, ++word) { + w[m_output[l]]= factorCollection.AddFactor(*word); } } diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index 103277d34..6a9745ade 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -107,6 +107,7 @@ Parameter::Parameter() AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation"); AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables"); AddParam("distortion", "configurations for each factorized/lexicalized reordering model."); + AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. 
Default is no"); AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'"); AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" ); AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation"); @@ -130,6 +131,8 @@ Parameter::Parameter() AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename"); AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format"); AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses"); + AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)"); + AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)"); AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)"); #ifdef HAVE_PROTOBUF AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path."); @@ -177,6 +180,7 @@ Parameter::Parameter() AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory"); AddParam("minphr-memory", "Load phrase table in minphr format into memory"); + AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false"); AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false"); AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. 
Default is false"); AddParam("alignment-output-file", "print output word alignments into given file"); diff --git a/moses/SourceWordDeletionFeature.cpp b/moses/SourceWordDeletionFeature.cpp index c5a61111f..c312a3b03 100644 --- a/moses/SourceWordDeletionFeature.cpp +++ b/moses/SourceWordDeletionFeature.cpp @@ -55,12 +55,7 @@ void SourceWordDeletionFeature::ComputeFeatures(const TargetPhrase& targetPhrase // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize(); - if (targetLength == 1 && sourceLength == 1) { - const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1); - if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) { - return; - } - } + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index df05b64d3..449187da7 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter) } } - if(m_parameter->GetParam("sort-word-alignment").size()) { - m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]); - } - // factor delimiter if (m_parameter->GetParam("factor-delimiter").size() > 0) { m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0]; @@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter) SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false ); //word-to-word alignment + // alignments + SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false ); + if (m_PrintAlignmentInfo) { + m_needAlignmentInfo = true; + } + + if(m_parameter->GetParam("sort-word-alignment").size()) { + m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]); + } + SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false ); if (m_PrintAlignmentInfoNbest) { m_needAlignmentInfo = true; @@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter) } m_outputSearchGraph = true; m_outputSearchGraphExtended = true; - } else + } else { m_outputSearchGraph = false; + } + if (m_parameter->GetParam("output-search-graph-slf").size() > 0) { + m_outputSearchGraphSLF = true; + } else { + m_outputSearchGraphSLF = false; + } + if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) { + m_outputSearchGraphHypergraph = true; + } else { + m_outputSearchGraphHypergraph = false; + } #ifdef HAVE_PROTOBUF if (m_parameter->GetParam("output-search-graph-pb").size() > 0) { if (m_parameter->GetParam("output-search-graph-pb").size() != 1) { diff --git a/moses/StaticData.h b/moses/StaticData.h index 448f1a4e7..20d36e4b8 100644 --- a/moses/StaticData.h +++ b/moses/StaticData.h @@ -171,6 +171,7 @@ protected: bool m_reportAllFactorsNBest; std::string m_detailedTranslationReportingFilePath; bool m_onlyDistinctNBest; + bool m_PrintAlignmentInfo; bool m_needAlignmentInfo; bool m_PrintAlignmentInfoNbest; @@ -216,6 +217,8 @@ protected: bool m_outputWordGraph; //! whether to output word graph bool m_outputSearchGraph; //! whether to output search graph bool m_outputSearchGraphExtended; //! ... in extended format + bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF) + bool m_outputSearchGraphHypergraph; //! 
whether to output search graph in hypergraph #ifdef HAVE_PROTOBUF bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf #endif @@ -458,7 +461,7 @@ public: return m_nBestFilePath; } bool IsNBestEnabled() const { - return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() + return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty() #ifdef HAVE_PROTOBUF || m_outputSearchGraphPB #endif @@ -631,6 +634,12 @@ public: bool GetOutputSearchGraphExtended() const { return m_outputSearchGraphExtended; } + bool GetOutputSearchGraphSLF() const { + return m_outputSearchGraphSLF; + } + bool GetOutputSearchGraphHypergraph() const { + return m_outputSearchGraphHypergraph; + } #ifdef HAVE_PROTOBUF bool GetOutputSearchGraphPB() const { return m_outputSearchGraphPB; @@ -722,6 +731,9 @@ public: const std::string &GetAlignmentOutputFile() const { return m_alignmentOutputFile; } + bool PrintAlignmentInfo() const { + return m_PrintAlignmentInfo; + } bool PrintAlignmentInfoInNbest() const { return m_PrintAlignmentInfoNbest; } diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp index b1d99ab50..6f14657a3 100644 --- a/moses/TargetPhrase.cpp +++ b/moses/TargetPhrase.cpp @@ -326,8 +326,10 @@ TO_STRING_BODY(TargetPhrase); std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp) { - os << static_cast<const Phrase&>(tp) << ":" << tp.GetAlignNonTerm(); - os << ": c=" << tp.m_fullScore; + os << static_cast<const Phrase&>(tp) << ":" << flush; + os << tp.GetAlignNonTerm() << flush; + os << ": c=" << tp.m_fullScore << flush; + os << " " << tp.m_scoreBreakdown << flush; return os; } diff --git a/moses/TargetWordInsertionFeature.cpp b/moses/TargetWordInsertionFeature.cpp index 537c5c9cb..3b9bf36ba 100644 --- a/moses/TargetWordInsertionFeature.cpp +++ b/moses/TargetWordInsertionFeature.cpp @@ -56,12 +56,7 @@ void TargetWordInsertionFeature::ComputeFeatures(const TargetPhrase& targetPhras // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize(); - if (targetLength == 1 && sourceLength == 1) { - const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1); - if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) { - return; - } - } + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp index 515d2f649..675656112 100644 --- a/moses/TranslationModel/PhraseDictionaryTree.cpp +++ b/moses/TranslationModel/PhraseDictionaryTree.cpp @@ -156,22 +156,6 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const typedef LVoc<std::string> WordVoc; -static WordVoc* ReadVoc(const std::string& filename) -{ - static std::map<std::string,WordVoc*> vocs; -#ifdef WITH_THREADS - boost::mutex mutex; - boost::mutex::scoped_lock lock(mutex); -#endif - std::map<std::string,WordVoc*>::iterator vi = vocs.find(filename); - if (vi == vocs.end()) { - WordVoc* voc = new WordVoc(); - voc->Read(filename); - vocs[filename] = voc; - } - return vocs[filename]; -} - class PDTimp { public: @@ -184,8 +168,8 @@ public: std::vector<OFF_T> srcOffsets; FILE *os,*ot; - WordVoc* sv; - WordVoc* 
tv; + WordVoc sv; + WordVoc tv; ObjectPool<PPimp> pPool; // a comparison with the Boost MemPools might be useful @@ -269,12 +253,12 @@ public: rv.back().tokens.reserve(iphrase.size()); for(size_t j=0; j<iphrase.size(); ++j) { - rv.back().tokens.push_back(&tv->symbol(iphrase[j])); + rv.back().tokens.push_back(&tv.symbol(iphrase[j])); } rv.back().scores = i->GetScores(); const IPhrase& fnames = i->GetFeatureNames(); for (size_t j = 0; j < fnames.size(); ++j) { - rv.back().fnames.push_back(&tv->symbol(fnames[j])); + rv.back().fnames.push_back(&tv.symbol(fnames[j])); } rv.back().fvalues = i->GetFeatureValues(); if (wa) wa->push_back(i->GetAlignment()); @@ -289,7 +273,7 @@ public: CHECK(p); if(w.empty() || w==EPSILON) return p; - LabelId wi=sv->index(w); + LabelId wi=sv.index(w); if(wi==InvalidLabelId) return PPtr(); // unknown word else if(p.imp->isRoot()) { @@ -304,6 +288,8 @@ public: return PPtr(); } + + WordVoc* ReadVoc(const std::string& filename); }; @@ -350,10 +336,8 @@ int PDTimp::Read(const std::string& fn) for(size_t i=0; i<data.size(); ++i) data[i]=CPT(os,srcOffsets[i]); - sv = ReadVoc(ifsv); - tv = ReadVoc(iftv); - //sv.Read(ifsv); - //tv.Read(iftv); + sv.Read(ifsv); + tv.Read(iftv); TRACE_ERR("binary phrasefile loaded, default OFF_T: "<<PTF::getDefault() <<"\n"); @@ -370,7 +354,7 @@ void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const const IPhrase& iphr=tcand[i].GetPhrase(); out << i << " -- " << sc << " -- "; - for(size_t j=0; j<iphr.size(); ++j) out << tv->symbol(iphr[j])<<" "; + for(size_t j=0; j<iphr.size(); ++j) out << tv.symbol(iphr[j])<<" "; out<< " -- " << trgAlign; out << std::endl; } @@ -423,7 +407,7 @@ GetTargetCandidates(const std::vector<std::string>& src, { IPhrase f(src.size()); for(size_t i=0; i<src.size(); ++i) { - f[i]=imp->sv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) return; } @@ -439,7 +423,7 @@ GetTargetCandidates(const std::vector<std::string>& src, { IPhrase f(src.size()); for(size_t i=0; i<src.size(); ++i) { - f[i]=imp->sv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) return; } @@ -455,7 +439,7 @@ PrintTargetCandidates(const std::vector<std::string>& src, { IPhrase f(src.size()); for(size_t i=0; i<src.size(); ++i) { - f[i]=imp->sv->index(src[i]); + f[i]=imp->sv.index(src[i]); if(f[i]==InvalidLabelId) { TRACE_ERR("the source phrase '"<<src<<"' contains an unknown word '" <<src[i]<<"'\n"); @@ -497,8 +481,6 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) std::vector<OFF_T> vo; size_t lnc=0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info - imp->sv = new WordVoc(); - imp->tv = new WordVoc(); size_t missingAlignmentCount = 0; while(getline(inFile, line)) { @@ -532,11 +514,11 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) std::vector<std::string> wordVec = Tokenize(sourcePhraseString); for (size_t i = 0 ; i < wordVec.size() ; ++i) - f.push_back(imp->sv->add(wordVec[i])); + f.push_back(imp->sv.add(wordVec[i])); wordVec = Tokenize(targetPhraseString); for (size_t i = 0 ; i < wordVec.size() ; ++i) - e.push_back(imp->tv->add(wordVec[i])); + e.push_back(imp->tv.add(wordVec[i])); // while(is>>w && w!="|||") sc.push_back(atof(w.c_str())); // Mauro: to handle 0 probs in phrase tables @@ -576,7 +558,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) abort(); } for (size_t i = 0; i < sparseTokens.size(); i+=2) { - 
fnames.push_back(imp->tv->add(sparseTokens[i])); + fnames.push_back(imp->tv.add(sparseTokens[i])); fvalues.push_back(Scan<FValue>(sparseTokens[i+1])); } } @@ -663,8 +645,8 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) fWriteVector(oi,vo); fClose(oi); - imp->sv->Write(ofsv); - imp->tv->Write(oftv); + imp->sv.Write(ofsv); + imp->tv.Write(oftv); return 1; } diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index c680d7245..065368ca7 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -552,7 +552,9 @@ namespace tmmt bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const { +#ifdef WITH_THREADS boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock); +#endif map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key ); if (lookup != m_lsed.end()) { value = lookup->second; @@ -564,7 +566,9 @@ namespace tmmt void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value) { +#ifdef WITH_THREADS boost::unique_lock<boost::shared_mutex> lock(m_accessLock); +#endif m_lsed[ key ] = value; } diff --git a/moses/Util.cpp b/moses/Util.cpp index 98de1241e..495e05124 100644 --- a/moses/Util.cpp +++ b/moses/Util.cpp @@ -35,6 +35,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "TypeDef.h" #include "Util.h" #include "Timer.h" +#include "util/exception.hh" #include "util/file.hh" using namespace std; @@ -65,6 +66,8 @@ const std::string ToLower(const std::string& str) return lc; } +class BoolValueException : public util::Exception {}; + template<> bool Scan<bool>(const std::string &input) { @@ -73,8 +76,7 @@ bool Scan<bool>(const std::string &input) return true; if (lc == "no" || lc == "n" || lc =="false" || lc == "0") return false; - TRACE_ERR( "Scan<bool>: didn't understand '" << lc << "', returning false" << std::endl); - return false; + UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. 
After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0."); } bool FileExists(const std::string& filePath) diff --git a/moses/Word.cpp b/moses/Word.cpp index c23e8de8c..2c1ac09ea 100644 --- a/moses/Word.cpp +++ b/moses/Word.cpp @@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "Word.h" #include "TypeDef.h" #include "StaticData.h" // needed to determine the FactorDelimiter +#include "util/exception.hh" #include "util/tokenize_piece.hh" using namespace std; @@ -95,6 +96,8 @@ std::string Word::GetString(FactorType factorType) const return NULL; } +class StrayFactorException : public util::Exception {}; + void Word::CreateFromString(FactorDirection direction , const std::vector<FactorType> &factorOrder , const StringPiece &str @@ -106,7 +109,7 @@ void Word::CreateFromString(FactorDirection direction for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) { m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit); } - CHECK(!fit); + UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times."); // assume term/non-term same for all factors m_isNonTerminal = isNonTerminal; diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp index 70de9678b..fd33907de 100644 --- a/phrase-extract/consolidate-main.cpp +++ b/phrase-extract/consolidate-main.cpp @@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC if (kneserNeyFlag) { float D = kneserNey_D3; if (countEF < 2) D = kneserNey_D1; - if (countEF < 3) D = kneserNey_D2; + else if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index 92c8a470e..cab91e92d 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) { if (m_options.isOrientationFlag()) outextractstrOrientation << orientationInfo; + if (m_options.isIncludeSentenceIdFlag()) { + outextractstr << " ||| " << sentence.sentenceID; + } + if (m_options.getInstanceWeightsFile().length()) { if (m_options.isTranslationFlag()) { outextractstr << " ||| " << sentence.weightString; @@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) { } } - if (m_options.isIncludeSentenceIdFlag()) { - outextractstr << " ||| " << sentence.sentenceID; - } if (m_options.isTranslationFlag()) outextractstr << "\n"; if (m_options.isTranslationFlag()) outextractstrInv << "\n"; diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 214569206..769fc0ebf 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -1000,6 +1000,7 @@ lowercase-reference out: reference default-name: evaluation/reference pass-unless: output-lowercaser + pass-if: recaser multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl template: $output-lowercaser < IN > OUT nist-bleu diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl index 29962ca71..a2f9580a9 100755 --- a/scripts/ems/support/analysis.perl +++ b/scripts/ems/support/analysis.perl @@ -745,7 +745,8 @@ sub hierarchical_segmentation { open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!"; 
open(NODE,">$dir/node") or die "Cannot open: $!"; while(<TRACE>) { - /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_"); + /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || + /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_"); my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7); if ($last_sentence >= 0 && $sentence != $last_sentence) { &hs_process($last_sentence,\@DERIVATION,\%STATS); @@ -1137,9 +1138,17 @@ sub process_search_graph { `mkdir -p $dir/search-graph`; my $last_sentence = -1; while(<OSG>) { - /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</ || die("ERROR: buggy search graph line: $_"); - my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) - = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); + my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score); + if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</) { + ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); + } + elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) { + ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12); + $heuristic_rule_score = $rule_score; # hmmmm.... 
+ } + else { + die("ERROR: buggy search graph line: $_"); + } chop($alignment) if $alignment; chop($children) if $children; $recomb = 0 unless $recomb; diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index e941aa95b..4ef6a1de6 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -13,10 +13,10 @@ chomp(@OUT); while(<SRC>) { chomp; if (/^<srcset/) { - s/<srcset/<tstset trglang="$language"/; + s/<srcset/<tstset trglang="$language"/i; } elsif (/^<\/srcset/) { - s/<\/srcset/<\/tstset/; + s/<\/srcset/<\/tstset/i; } elsif (/^<doc/i) { s/ *sysid="[^\"]+"//; @@ -26,10 +26,10 @@ while(<SRC>) { my $line = shift(@OUT); $line = "" if $line =~ /NO BEST TRANSLATION/; if (/<\/seg>/) { - s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/; + s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i; } else { - s/(<seg[^>]+> *)[^<]*/$1$line/; + s/(<seg[^>]+> *)[^<]*/$1$line/i; } } print $_."\n"; diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl index 8f82ab8d9..beca70eb0 100755 --- a/scripts/generic/compound-splitter.perl +++ b/scripts/generic/compound-splitter.perl @@ -16,15 +16,15 @@ $HELP = 1 unless &GetOptions('corpus=s' => \$CORPUS, 'model=s' => \$MODEL, 'filler=s' => \$FILLER, - 'factored' => \$FACTORED, + 'factored' => \$FACTORED, 'min-size=i' => \$MIN_SIZE, 'min-count=i' => \$MIN_COUNT, 'max-count=i' => \$MAX_COUNT, 'help' => \$HELP, 'verbose' => \$VERBOSE, - 'syntax' => \$SYNTAX, - 'binarize' => \$BINARIZE, - 'mark-split' => \$MARK_SPLIT, + 'syntax' => \$SYNTAX, + 'binarize' => \$BINARIZE, + 'mark-split' => \$MARK_SPLIT, 'train' => \$TRAIN); if ($HELP || @@ -155,34 +155,37 @@ sub apply { next if defined($COUNT{$lc}) && $COUNT{$lc} > $count; $COUNT{$lc} = $count; $TRUECASE{$lc} = $factored_word; - $LABEL{$lc} = $label if $SYNTAX; + $LABEL{$lc} = $label if $SYNTAX; } close(MODEL); while(<STDIN>) { my $first = 1; chop; s/\s+/ /g; s/^ //; s/ $//; - my @BUFFER; # for xml tags + my @BUFFER; # for xml tags foreach my $factored_word (split) { print " " unless $first; $first = 0; - # syntax: don't split xml - if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) { - push @BUFFER,$factored_word; - $first = 1; - next; - } - - # get case class - my $word = $factored_word; - $word =~ s/\|.+//g; # just first factor - my $lc = lc($word); - + # syntax: don't split xml + if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) { + push @BUFFER,$factored_word; + $first = 1; + next; + } + + # get case class + my $word = $factored_word; + $word =~ s/\|.+//g; # just first factor + my $lc = lc($word); + + print STDERR "considering $word ($lc)...\n" if $VERBOSE; # don't split frequent words - if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) { - print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer + if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) || + $lc !~ /[a-zA-Z]/) {; # has to have at least one letter + print join(" ",@BUFFER)." 
" if scalar(@BUFFER); @BUFFER = (); # clear buffer print $factored_word; + print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE; next; } diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 7533b39e0..192169c86 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -153,9 +153,9 @@ if (defined($baselineExtract)) { $catOCmd .= "$baselineExtract.o$sorted.gz "; } -$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.sorted.gz \n"; -$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.inv.sorted.gz \n"; -$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n"; +$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.sorted.gz 2>> /dev/stderr \n"; +$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.inv.sorted.gz 2>> /dev/stderr \n"; +$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.o.sorted.gz 2>> /dev/stderr \n"; @children = (); diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index d1840fc55..b8d393e71 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -64,6 +64,7 @@ my $wordgraphfile=undef; my $wordgraphflag=0; my $robust=5; # resubmit crashed jobs robust-times my $alifile=undef; +my $detailsfile=undef; my $logfile=""; my $logflag=""; my $searchgraphlist=""; @@ -93,6 +94,7 @@ sub init(){ 'output-search-graph|osg=s'=> \$searchgraphlist, 'output-word-graph|owg=s'=> \$wordgraphlist, 'alignment-output-file=s'=> \$alifile, + 'translation-details|T=s'=> \$detailsfile, 'qsub-prefix=s'=> \$qsubname, 'queue-parameters=s'=> \$queueparameters, 'inputtype=i'=> \$inputtype, @@ -539,6 +541,7 @@ while ($robust && scalar @idx_todo) { concatenate_1best(); concatenate_logs() if $logflag; concatenate_ali() if defined $alifile; +concatenate_details() if defined $detailsfile; concatenate_nbest() if $nbestflag; safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-'; @@ -580,6 +583,11 @@ sub preparing_script(){ $tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx"; } + my $tmpdetailsoutfile = ""; + if (defined $detailsfile){ + $tmpdetailsoutfile="-translation-details $tmpdir/$detailsfile.$splitpfx$idx"; + } + my $tmpsearchgraphlist=""; if ($searchgraphflag){ $tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx"; @@ -592,13 +600,17 @@ sub preparing_script(){ my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId"; - print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; + print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; print OUT "echo exit status \$\?\n\n"; if (defined $alifile){ print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } + if (defined $detailsfile){ + print OUT "\\mv -f $tmpdir/${detailsfile}.$splitpfx$idx .\n\n"; + print OUT "echo exit status \$\?\n\n"; + } if ($nbestflag){ print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; @@ -827,6 +839,18 @@ sub 
concatenate_ali(){ close(OUT); } +sub concatenate_details(){ + open (OUT, "> ${detailsfile}"); + foreach my $idx (@idxlist){ + my @in=(); + open (IN, "$detailsfile.$splitpfx$idx"); + @in=<IN>; + print OUT "@in"; + close(IN); + } + close(OUT); +} + sub check_exit_status(){ print STDERR "check_exit_status\n"; @@ -925,6 +949,7 @@ sub remove_temporary_files(){ unlink("${inputfile}.${splitpfx}${idx}.trans"); unlink("${inputfile}.${splitpfx}${idx}"); if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); } + if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); } if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); } if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); } if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); } diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl index 879212e6e..f1f8f9ef6 100755 --- a/scripts/generic/mteval-v13a.pl +++ b/scripts/generic/mteval-v13a.pl @@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span sub extract_sgml_tag_attribute { my ($name, $data) = @_; - ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); + ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : (); } ################################# diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 520fbddbe..3f763e5d9 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -163,7 +163,7 @@ else $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR "; } - $cmd .= " | gzip -c > $ptHalf"; + $cmd .= " | gzip -c > $ptHalf 2>> /dev/stderr "; } print STDERR $cmd; systemCheck($cmd); diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl index 49c89c299..012c143ac 100755 --- a/scripts/recaser/detruecase.perl +++ b/scripts/recaser/detruecase.perl @@ -6,11 +6,12 @@ use Getopt::Long "GetOptions"; binmode(STDIN, ":utf8"); binmode(STDOUT, ":utf8"); - -my ($SRC,$INFILE); +my ($SRC,$INFILE,$UNBUFFERED); die("detruecase.perl < in > out") unless &GetOptions('headline=s' => \$SRC, - 'in=s' => \$INFILE); + 'in=s' => \$INFILE, + 'b|unbuffered' => \$UNBUFFERED); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1); my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"""=>1,"'"=>1,"["=>1,"]"=>1); diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl index c83c30daa..2858cda61 100755 --- a/scripts/recaser/recase.perl +++ b/scripts/recaser/recase.perl @@ -4,7 +4,7 @@ use strict; use Getopt::Long "GetOptions"; -my ($SRC,$INFILE,$RECASE_MODEL); +my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED); my $MOSES = "moses"; my $LANGUAGE = "en"; # English by default; die("recase.perl --in file --model ini-file > out") @@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out") 'headline=s' => \$SRC, 'lang=s' => \$LANGUAGE, 'moses=s' => \$MOSES, - 'model=s' => \$RECASE_MODEL) + 'model=s' => \$RECASE_MODEL, + 'b|unbuffered' => \$UNBUFFERED) && defined($INFILE) && defined($RECASE_MODEL); +if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } my %treated_languages = map { ($_,1) } qw/en cs/; die "I don't know any rules for $LANGUAGE. Use 'en' as the default." 
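A note between the script hunks: detruecase.perl and recase.perl above, and truecase.perl in the next file, all gain a -b (unbuffered) option that sets Perl's $| flag so each printed line is flushed immediately, which matters when the recaser runs inside a pipeline behind the decoder. A minimal sketch of the equivalent idiom in the project's own C++, purely illustrative and not code from this commit:

#include <iostream>

int main() {
  // std::unitbuf flushes the stream after every insertion, the C++
  // counterpart of setting $| = 1 in the Perl scripts.
  std::cout << std::unitbuf;
  std::cout << "flushed line by line" << std::endl;
  return 0;
}
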
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index 0e2df27a2..517f5c7a1 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 
 # apply switches
-my $MODEL;
-die("truecase.perl --model truecaser < in > out")
-  unless &GetOptions('model=s' => \$MODEL);
+my ($MODEL, $UNBUFFERED);
+die("truecase.perl --model MODEL [-b] < in > out")
+  unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
+  && defined($MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 my (%BEST,%KNOWN);
 open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
new file mode 100644
index 000000000..c6b9af8ca
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
@@ -0,0 +1,103 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Á
+É
+Í
+Ó
+Ö
+Ő
+Ú
+Ü
+Ű
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Dr
+dr
+kb
+Kb
+vö
+Vö
+pl
+Pl
+ca
+Ca
+min
+Min
+max
+Max
+ún
+Ún
+prof
+Prof
+de
+De
+du
+Du
+Szt
+St
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+
+# Month name abbreviations
+jan #NUMERIC_ONLY#
+Jan #NUMERIC_ONLY#
+Feb #NUMERIC_ONLY#
+feb #NUMERIC_ONLY#
+márc #NUMERIC_ONLY#
+Márc #NUMERIC_ONLY#
+ápr #NUMERIC_ONLY#
+Ápr #NUMERIC_ONLY#
+máj #NUMERIC_ONLY#
+Máj #NUMERIC_ONLY#
+jún #NUMERIC_ONLY#
+Jún #NUMERIC_ONLY#
+Júl #NUMERIC_ONLY#
+júl #NUMERIC_ONLY#
+aug #NUMERIC_ONLY#
+Aug #NUMERIC_ONLY#
+Szept #NUMERIC_ONLY#
+szept #NUMERIC_ONLY#
+okt #NUMERIC_ONLY#
+Okt #NUMERIC_ONLY#
+nov #NUMERIC_ONLY#
+Nov #NUMERIC_ONLY#
+dec #NUMERIC_ONLY#
+Dec #NUMERIC_ONLY#
+
+# Other abbreviations
+tel #NUMERIC_ONLY#
+Tel #NUMERIC_ONLY#
+Fax #NUMERIC_ONLY#
+fax #NUMERIC_ONLY#
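
Note: in these prefix files, a bare entry suppresses a sentence break after "entry.", while an entry tagged #NUMERIC_ONLY# suppresses the break only when a digit follows. A rough Perl sketch of how a splitter might load and consult such a file (not the actual split-sentences.perl code; names are illustrative):

  use strict;

  # Load a nonbreaking-prefix file: plain entries never end a sentence;
  # entries tagged #NUMERIC_ONLY# only block a break when a digit follows.
  my %NONBREAKING;
  open(my $fh, '<:encoding(UTF-8)', 'nonbreaking_prefix.hu') or die $!;
  while (<$fh>) {
      chomp;
      next if /^#/ || /^\s*$/;            # skip comments and blank lines
      if (/^(.+?)\s+\#NUMERIC_ONLY\#/) {
          $NONBREAKING{$1} = 2;           # numeric-only prefix
      } else {
          $NONBREAKING{$_} = 1;           # unconditional prefix
      }
  }
  close($fh);

  # Decide whether "$word." followed by $next should suppress a sentence break.
  sub is_nonbreaking {
      my ($word, $next) = @_;
      my $type = $NONBREAKING{$word} or return 0;
      return 1 if $type == 1;
      return ($next =~ /^[0-9]/) ? 1 : 0; # NUMERIC_ONLY: needs a digit after
  }
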
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
new file mode 100644
index 000000000..81754a17a
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
@@ -0,0 +1,100 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+Ā
+B
+C
+Č
+D
+E
+Ē
+F
+G
+Ģ
+H
+I
+Ī
+J
+K
+Ķ
+L
+Ļ
+M
+N
+Ņ
+O
+P
+Q
+R
+S
+Š
+T
+U
+Ū
+V
+W
+X
+Y
+Z
+Ž
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+dr
+Dr
+med
+prof
+Prof
+inž
+Inž
+ist.loc
+Ist.loc
+kor.loc
+Kor.loc
+v.i
+vietn
+Vietn
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+a.l
+t.p
+pārb
+Pārb
+vec
+Vec
+inv
+Inv
+sk
+Sk
+spec
+Spec
+vienk
+Vienk
+virz
+Virz
+māksl
+Māksl
+mūz
+Mūz
+akad
+Akad
+soc
+Soc
+galv
+Galv
+vad
+Vad
+sertif
+Sertif
+folkl
+Folkl
+hum
+Hum
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index f59cd5f86..986a2dfb5 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -171,7 +171,7 @@ if ($TIMING)
 
 # tokenize a batch of texts saved in an array
 # input: an array containing a batch of texts
-# return: another array cotaining a batch of tokenized texts for the input array
+# return: another array containing a batch of tokenized texts for the input array
 sub tokenize_batch
 {
   my(@text_list) = @_;
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index bea32052a..2865fe391 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
 if (-e $l1input) {
   $opn = $l1input;
 } elsif (-e $l1input.".gz") {
-  $opn = "zcat $l1input.gz |";
+  $opn = "gunzip -c $l1input.gz |";
 } else {
   die "Error: $l1input does not exist";
 }
@@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
 if (-e $l2input) {
   $opn = $l2input;
 } elsif (-e $l2input.".gz") {
-  $opn = "zcat $l2input.gz |";
+  $opn = "gunzip -c $l2input.gz |";
 } else {
   die "Error: $l2input does not exist";
 }
@@ -160,3 +160,4 @@ sub word_count {
   my @w = split(/ /,$line);
   return scalar @w;
 }
+
diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py
index 8bef034de..86c8b300e 100755
--- a/scripts/training/filter-rule-table.py
+++ b/scripts/training/filter-rule-table.py
@@ -40,7 +40,8 @@ def printUsage():
 def main():
     parser = optparse.OptionParser()
     parser.add_option("-c", "--min-non-initial-rule-count",
-                      action="store", dest="minCount", type="int", default="1",
+                      action="store", dest="minCount",
+                      type="float", default="0.0",
                       help="prune non-initial rules where count is below N",
                       metavar="N")
     (options, args) = parser.parse_args()
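
Note: the zcat → gunzip -c switch in clean-corpus-n.perl avoids a portability trap: on some systems (notably OS X) zcat expects legacy .Z input and refuses .gz files. A hedged sketch of the open-maybe-gzipped idiom, using a hypothetical helper name open_maybe_gz:

  use strict;

  # Open a corpus file that may or may not be gzip-compressed.
  # "gunzip -c" is preferred over "zcat", which on some platforms
  # only accepts legacy .Z files and fails on .gz input.
  sub open_maybe_gz {
      my ($path) = @_;
      my $spec = -e $path      ? $path
               : -e "$path.gz" ? "gunzip -c $path.gz |"
               : die "Error: $path does not exist\n";
      open(my $fh, $spec) or die "Cannot open $spec: $!\n";
      return $fh;
  }

  my $fh = open_maybe_gz("corpus.en");   # illustrative file name
  my $lines = 0;
  $lines++ while <$fh>;
  print "$lines lines\n";
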
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 688e8ce55..9f5f25f15 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w 
+#!/usr/bin/perl -w
 # $Id$
 # Usage:
 # mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
@@ -371,7 +371,7 @@ my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt");  # or set t
 if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
   print "Could not find $pro_optimizer, installing it in $mertdir\n";
-  my $megam_url = "http://www.umiacs.umd.edu/~hal/megam/";
+  my $megam_url = "http://hal3.name/megam";
   if (&is_mac_osx()) {
     die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
   }
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 5b0553581..e4292007e 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -38,8 +38,9 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
    @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
-   $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_CORPUS, $_BASELINE_ALIGNMENT,
+   $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
    $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $IGNORE);
+my $_BASELINE_CORPUS = "";
 my $_CORES = 1;
 
 my $debug = 0; # debug this script, do not delete any files in debug mode
diff --git a/util/file.cc b/util/file.cc
index 86d9b12de..c7d8e23b2 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -111,15 +111,26 @@ void ResizeOrThrow(int fd, uint64_t to) {
   UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes");
 }
 
+namespace {
+std::size_t GuardLarge(std::size_t size) {
+  // The following operating systems have broken read/write/pread/pwrite that
+  // only supports up to 2^31.
+#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID)
+  return std::min(static_cast<std::size_t>(INT_MAX), size);
+#else
+  return size;
+#endif
+}
+}
+
 std::size_t PartialRead(int fd, void *to, std::size_t amount) {
 #if defined(_WIN32) || defined(_WIN64)
-  amount = min(static_cast<std::size_t>(INT_MAX), amount);
-  int ret = _read(fd, to, amount);
+  int ret = _read(fd, to, GuardLarge(amount));
 #else
   errno = 0;
   ssize_t ret;
   do {
-    ret = read(fd, to, amount);
+    ret = read(fd, to, GuardLarge(amount));
   } while (ret == -1 && errno == EINTR);
 #endif
   UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
@@ -169,11 +180,13 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
     ssize_t ret;
     errno = 0;
     do {
+      ret =
 #ifdef OS_ANDROID
-      ret = pread64(fd, to, size, off);
+        pread64
 #else
-      ret = pread(fd, to, size, off);
+        pread
 #endif
+        (fd, to, GuardLarge(size), off);
     } while (ret == -1 && errno == EINTR);
     if (ret <= 0) {
       UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
@@ -190,14 +203,20 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
   const uint8_t *data = static_cast<const uint8_t*>(data_void);
   while (size) {
 #if defined(_WIN32) || defined(_WIN64)
-    int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
+    int ret;
 #else
-    errno = 0;
     ssize_t ret;
+#endif
+    errno = 0;
     do {
-      ret = write(fd, data, size);
-    } while (ret == -1 && errno == EINTR);
+      ret =
+#if defined(_WIN32) || defined(_WIN64)
+        _write
+#else
+        write
 #endif
+        (fd, data, GuardLarge(size));
+    } while (ret == -1 && errno == EINTR);
     UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes");
     data += ret;
     size -= ret;
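
Note: the util/file.cc refactor centralizes two defensive I/O patterns: GuardLarge clamps each request to INT_MAX on platforms whose read/write/pread/pwrite misbehave above 2^31 bytes, and the do/while loops retry calls interrupted by EINTR, with WriteOrThrow also looping over short writes. For illustration only, the same clamp-and-retry pattern transposed to Perl ($MAX_CHUNK and write_or_die are my names, not from the patch):

  use strict;
  use POSIX qw(EINTR);

  my $MAX_CHUNK = 2**31 - 1;   # analogue of the INT_MAX clamp in GuardLarge()

  # Write a buffer fully, retrying short writes and EINTR interruptions.
  sub write_or_die {
      my ($fh, $data) = @_;
      my $off = 0;
      while ($off < length $data) {
          my $want = length($data) - $off;
          $want = $MAX_CHUNK if $want > $MAX_CHUNK;  # clamp oversized requests
          my $ret = syswrite($fh, $data, $want, $off);
          if (!defined $ret) {
              next if $! == EINTR;   # interrupted by a signal: just retry
              die "write failed: $!";
          }
          $off += $ret;              # short write: loop for the remainder
      }
  }
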
diff --git a/util/read_compressed.cc b/util/read_compressed.cc
index b81549e42..b62a6e833 100644
--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@@ -180,12 +180,73 @@ class GZip : public ReadBase {
 };
 #endif // HAVE_ZLIB
 
+const uint8_t kBZMagic[3] = {'B', 'Z', 'h'};
+
 #ifdef HAVE_BZLIB
 class BZip : public ReadBase {
   public:
-    explicit BZip(int fd, void *already_data, std::size_t already_size) {
+    BZip(int fd, void *already_data, std::size_t already_size) {
       scoped_fd hold(fd);
       closer_.reset(FDOpenReadOrThrow(hold));
+      file_ = NULL;
+      Open(already_data, already_size);
+    }
+
+    BZip(FILE *file, void *already_data, std::size_t already_size) {
+      closer_.reset(file);
+      file_ = NULL;
+      Open(already_data, already_size);
+    }
+
+    ~BZip() {
+      Close(file_);
+    }
+
+    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+      assert(file_);
+      int bzerror = BZ_OK;
+      int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
+      long pos = ftell(closer_.get());
+      if (pos != -1) ReadCount(thunk) = pos;
+      switch (bzerror) {
+        case BZ_STREAM_END:
+          /* bzip2 files can be concatenated by e.g. pbzip2.  Annoyingly, the
+           * library doesn't handle this internally.  This gets the trailing
+           * data, grows it up to magic as needed, validates the magic, and
+           * reopens.
+           */
+          {
+            bzerror = BZ_OK;
+            void *trailing_data;
+            int trailing_size;
+            BZ2_bzReadGetUnused(&bzerror, file_, &trailing_data, &trailing_size);
+            UTIL_THROW_IF(bzerror != BZ_OK, BZException, "bzip2 error in BZ2_bzReadGetUnused " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+            std::string trailing(static_cast<const char*>(trailing_data), trailing_size);
+            Close(file_);
+
+            if (trailing_size < (int)sizeof(kBZMagic)) {
+              trailing.resize(sizeof(kBZMagic));
+              if (1 != fread(&trailing[trailing_size], sizeof(kBZMagic) - trailing_size, 1, closer_.get())) {
+                UTIL_THROW_IF(trailing_size, BZException, "File has trailing cruft");
+                // Legitimate end of file.
+                ReplaceThis(new Complete(), thunk);
+                return ret;
+              }
+            }
+            UTIL_THROW_IF(memcmp(trailing.data(), kBZMagic, sizeof(kBZMagic)), BZException, "Trailing cruft is not another bzip2 stream");
+            Open(&trailing[0], trailing.size());
+          }
+          return ret;
+        case BZ_OK:
+          return ret;
+        default:
+          UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+      }
+    }
+
+  private:
+    void Open(void *already_data, std::size_t already_size) {
+      assert(!file_);
       int bzerror = BZ_OK;
       file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size);
       switch (bzerror) {
@@ -199,38 +260,23 @@ class BZip : public ReadBase {
           UTIL_THROW(BZException, "IO error reading file");
         case BZ_MEM_ERROR:
           throw std::bad_alloc();
+        default:
+          UTIL_THROW(BZException, "Unknown bzip2 error code " << bzerror);
       }
+      assert(file_);
     }
 
-    ~BZip() {
+    static void Close(BZFILE *&file) {
+      if (file == NULL) return;
       int bzerror = BZ_OK;
-      BZ2_bzReadClose(&bzerror, file_);
+      BZ2_bzReadClose(&bzerror, file);
       if (bzerror != BZ_OK) {
-        std::cerr << "bz2 readclose error" << std::endl;
+        std::cerr << "bz2 readclose error number " << bzerror << std::endl;
         abort();
       }
+      file = NULL;
     }
 
-    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
-      int bzerror = BZ_OK;
-      int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
-      long pos;
-      switch (bzerror) {
-        case BZ_STREAM_END:
-          pos = ftell(closer_.get());
-          if (pos != -1) ReadCount(thunk) = pos;
-          ReplaceThis(new Complete(), thunk);
-          return ret;
-        case BZ_OK:
-          pos = ftell(closer_.get());
-          if (pos != -1) ReadCount(thunk) = pos;
-          return ret;
-        default:
-          UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
-      }
-    }
-
-  private:
     scoped_FILE closer_;
     BZFILE *file_;
 };
@@ -346,11 +392,11 @@ MagicResult DetectMagic(const void *from_void) {
   if (header[0] == 0x1f && header[1] == 0x8b) {
     return GZIP;
   }
-  if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') {
+  if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) {
    return BZIP;
   }
-  const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
-  if (!memcmp(header, xzmagic, 6)) {
+  const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
+  if (!memcmp(header, kXZMagic, sizeof(kXZMagic))) {
     return XZIP;
   }
   return UNKNOWN;
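
Note: the rewritten BZip reader handles streams concatenated by tools like pbzip2: on BZ_STREAM_END it collects the unused trailing bytes, tops them up to sizeof(kBZMagic), and reopens if they begin with another "BZh" header — the same magic that DetectMagic checks up front. Purely as illustration, the equivalent magic-byte sniffing in Perl (the file handling is mine, not from util/):

  use strict;

  # Identify a compressed file by its leading magic bytes, mirroring the
  # gzip/bzip2/xz checks in DetectMagic above.
  sub detect_magic {
      my ($path) = @_;
      open(my $fh, '<:raw', $path) or die "open $path: $!";
      my $header = '';
      read($fh, $header, 6);   # short files simply fail all comparisons
      close($fh);
      return 'gzip'  if substr($header, 0, 2) eq "\x1f\x8b";
      return 'bzip2' if substr($header, 0, 3) eq 'BZh';
      return 'xz'    if substr($header, 0, 6) eq "\xFD" . "7zXZ" . "\x00";
      return 'unknown';
  }

  print detect_magic($ARGV[0]), "\n";
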