github.com/moses-smt/mosesdecoder.git

author    Barry Haddow <barry.haddow@gmail.com>    2013-04-12 19:07:26 +0400
committer Barry Haddow <barry.haddow@gmail.com>    2013-04-12 19:07:26 +0400
commit    9d42c7f6f74bbb0079768a762fc4546d20d6b634 (patch)
tree      ab1a2a2884a3b3b809a969ea0eb36fb98416347e
parent    c5965b8587b37986ebab786905a8ef9f218403de (diff)
parent    517d6c7bb834e40bcf25e8cbc79985180cb7f29f (diff)
Merge branch 'master' of github.com:moses-smt/mosesdecoder
-rw-r--r-- .gitmodules | 3
-rw-r--r-- BUILD-INSTRUCTIONS.txt | 2
-rw-r--r-- NOTICE | 2
-rw-r--r-- OnDiskPt/Main.cpp | 1
-rw-r--r-- biconcor/PhrasePair.cpp | 37
-rw-r--r-- biconcor/PhrasePair.h | 3
-rw-r--r-- biconcor/PhrasePairCollection.cpp | 87
-rw-r--r-- biconcor/PhrasePairCollection.h | 10
-rw-r--r-- biconcor/biconcor.cpp | 65
-rwxr-xr-x bjam | 4
-rw-r--r-- contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia | bin 0 -> 3532 bytes
-rw-r--r-- contrib/arrow-pipelines/python/README | 32
m--------- contrib/arrow-pipelines/python/libs/pypeline | 0
-rw-r--r-- contrib/arrow-pipelines/python/manager.py | 192
-rw-r--r-- contrib/arrow-pipelines/python/test/__init__.py | 0
-rw-r--r-- contrib/arrow-pipelines/python/test/test.py | 11
-rw-r--r-- contrib/arrow-pipelines/python/training/__init__.py | 0
-rw-r--r-- contrib/arrow-pipelines/python/training/components/__init__.py | 0
-rw-r--r-- contrib/arrow-pipelines/python/training/components/cleanup/__init__.py | 0
-rw-r--r-- contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py | 125
-rw-r--r-- contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py | 109
-rw-r--r-- contrib/arrow-pipelines/python/training/components/data_split/__init__.py | 0
-rw-r--r-- contrib/arrow-pipelines/python/training/components/data_split/data_split.py | 146
-rw-r--r-- contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py | 0
-rw-r--r-- contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py | 106
-rw-r--r-- contrib/arrow-pipelines/python/training/components/mert/__init__.py | 0
-rwxr-xr-x contrib/arrow-pipelines/python/training/components/mert/mert.py | 83
-rw-r--r-- contrib/arrow-pipelines/python/training/components/model_training/__init__.py | 0
-rwxr-xr-x contrib/arrow-pipelines/python/training/components/model_training/model_training.py | 72
-rw-r--r-- contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py | 0
-rwxr-xr-x contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py | 43
-rw-r--r-- contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de | 3
-rw-r--r-- contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py | 36
-rwxr-xr-x contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py | 43
-rw-r--r-- contrib/other-builds/OnDiskPt/.cproject | 11
-rw-r--r-- contrib/other-builds/extractor/.cproject | 15
-rw-r--r-- contrib/other-builds/lm/.cproject | 11
-rw-r--r-- contrib/other-builds/lm/.project | 20
-rw-r--r-- contrib/other-builds/mert_lib/.cproject | 22
-rw-r--r-- contrib/other-builds/moses-chart-cmd/.cproject | 16
-rw-r--r-- contrib/other-builds/moses-cmd/.cproject | 16
-rw-r--r-- contrib/other-builds/moses/.cproject | 18
-rw-r--r-- contrib/other-builds/search/.cproject | 11
-rw-r--r-- contrib/other-builds/search/.project | 5
-rw-r--r-- contrib/other-builds/util/.cproject | 11
-rw-r--r-- contrib/rpm/README | 42
-rwxr-xr-x contrib/rpm/build_source.sh | 63
-rw-r--r-- contrib/rpm/rpmbuild/SPECS/moses.spec | 65
-rwxr-xr-x contrib/server/client.py | 21
-rw-r--r-- contrib/server/mosesserver.cpp | 16
-rw-r--r-- contrib/sigtest-filter/filter-pt.cpp | 24
-rw-r--r-- contrib/tmcombine/README.md | 2
-rwxr-xr-x contrib/tmcombine/tmcombine.py | 4
-rw-r--r-- mert/InterpolatedScorer.cpp | 2
-rw-r--r-- moses-chart-cmd/IOWrapper.cpp | 112
-rw-r--r-- moses-cmd/IOWrapper.cpp | 13
-rw-r--r-- moses-cmd/IOWrapper.h | 2
-rw-r--r-- moses-cmd/Jamfile | 2
-rw-r--r-- moses-cmd/Main.cpp | 215
-rw-r--r-- moses/AlignmentInfoCollection.cpp | 3
-rw-r--r-- moses/AlignmentInfoCollection.h | 1
-rw-r--r-- moses/Hypothesis.cpp | 2
-rw-r--r-- moses/LM/SingleFactor.cpp | 11
-rw-r--r-- moses/Manager.cpp | 439
-rw-r--r-- moses/Manager.h | 21
-rw-r--r-- moses/PDTAimp.h | 8
-rw-r--r-- moses/Parameter.cpp | 4
-rw-r--r-- moses/SourceWordDeletionFeature.cpp | 7
-rw-r--r-- moses/StaticData.cpp | 27
-rw-r--r-- moses/StaticData.h | 14
-rw-r--r-- moses/TargetPhrase.cpp | 6
-rw-r--r-- moses/TargetWordInsertionFeature.cpp | 7
-rw-r--r-- moses/TranslationModel/PhraseDictionaryTree.cpp | 54
-rw-r--r-- moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp | 4
-rw-r--r-- moses/Util.cpp | 6
-rw-r--r-- moses/Word.cpp | 5
-rw-r--r-- phrase-extract/consolidate-main.cpp | 2
-rw-r--r-- phrase-extract/extract-main.cpp | 7
-rw-r--r-- scripts/ems/experiment.meta | 1
-rwxr-xr-x scripts/ems/support/analysis.perl | 17
-rwxr-xr-x scripts/ems/support/wrap-xml.perl | 8
-rwxr-xr-x scripts/generic/compound-splitter.perl | 43
-rwxr-xr-x scripts/generic/extract-parallel.perl | 6
-rwxr-xr-x scripts/generic/moses-parallel.pl | 27
-rwxr-xr-x scripts/generic/mteval-v13a.pl | 2
-rwxr-xr-x scripts/generic/score-parallel.perl | 2
-rwxr-xr-x scripts/recaser/detruecase.perl | 7
-rwxr-xr-x scripts/recaser/recase.perl | 6
-rwxr-xr-x scripts/recaser/truecase.perl | 8
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu | 103
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv | 100
-rwxr-xr-x scripts/tokenizer/tokenizer.perl | 2
-rwxr-xr-x scripts/training/clean-corpus-n.perl | 5
-rwxr-xr-x scripts/training/filter-rule-table.py | 3
-rwxr-xr-x scripts/training/mert-moses.pl | 4
-rwxr-xr-x scripts/training/train-model.perl | 3
-rw-r--r-- util/file.cc | 37
-rw-r--r-- util/read_compressed.cc | 100
98 files changed, 2718 insertions, 348 deletions
diff --git a/.gitmodules b/.gitmodules
index e69de29bb..d3a8cb4da 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
+ path = contrib/arrow-pipelines/python/libs/pypeline
+ url = git://github.com/ianj-als/pypeline.git
diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt
index 318956ccd..3dac64f60 100644
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
Generally, for trouble installing external libraries, you should get support
directly from the library maker:
-Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
+Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user
diff --git a/NOTICE b/NOTICE
index 7d631cd88..23d8b2ad1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,3 +1,5 @@
This code includes data from Daniel Naber's Language Tools (czech abbreviations).
This code includes data from czech wiktionary (also czech abbreviations).
+
+
diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp
index 5f6da5a33..5d4e0be8d 100644
--- a/OnDiskPt/Main.cpp
+++ b/OnDiskPt/Main.cpp
@@ -174,6 +174,7 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
break;
}
default:
+ cerr << "ERROR in line " << line << endl;
assert(false);
break;
}
diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp
index 9c16be77c..038fa3a31 100644
--- a/biconcor/PhrasePair.cpp
+++ b/biconcor/PhrasePair.cpp
@@ -8,7 +8,42 @@
using namespace std;
-void PhrasePair::Print( ostream* out, int width ) const
+void PhrasePair::Print( ostream* out ) const
+{
+ // source
+ int sentence_start = m_source_position - m_source_start;
+ char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
+
+ for( char i=0; i<source_length; i++ ) {
+ if (i>0) *out << " ";
+ *out << m_suffixArray->GetWord( sentence_start + i );
+ }
+
+ // target
+ *out << " |||";
+ for( char i=0; i<m_target_length; i++ ) {
+ *out << " " << m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+
+ // source span
+ *out << " ||| " << (int)m_source_start << " " << (int)m_source_end;
+
+ // target span
+ *out << " ||| " << (int)m_target_start << " " << (int)m_target_end;
+
+ // word alignment
+ *out << " |||";
+
+ INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
+ for( INDEX i=0; i<ap_points; i++) {
+ *out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
+ << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
+ }
+
+ *out << endl;
+}
+
+void PhrasePair::PrintPretty( ostream* out, int width ) const
{
vector< WORD_ID >::const_iterator t;
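
[Editor's note] The rewritten Print emits one machine-readable record per phrase pair: the full source sentence, the full target sentence, the source span, the target span, and the word alignment points, separated by "|||". A minimal sketch of a consumer for that format, assuming only the five-field layout visible above (the function and variable names are illustrative):

    def parse_phrase_pair(line):
        # Fields are "|||"-separated; the alignment field may be empty,
        # so split on the bare delimiter and strip the padding spaces.
        source, target, src_span, trg_span, alignment = \
            [f.strip() for f in line.rstrip("\n").split("|||")]
        src_start, src_end = map(int, src_span.split())
        trg_start, trg_end = map(int, trg_span.split())
        # Alignment points look like "0-0 1-2".
        points = [tuple(map(int, p.split("-"))) for p in alignment.split()]
        return (source.split(), target.split(),
                (src_start, src_end), (trg_start, trg_end), points)
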
diff --git a/biconcor/PhrasePair.h b/biconcor/PhrasePair.h
index f8a7881a0..f1dadb637 100644
--- a/biconcor/PhrasePair.h
+++ b/biconcor/PhrasePair.h
@@ -43,7 +43,8 @@ public:
~PhrasePair () {}
void PrintTarget( std::ostream* out ) const;
- void Print( std::ostream* out, int width ) const;
+ void Print( std::ostream* out ) const;
+ void PrintPretty( std::ostream* out, int width ) const;
void PrintHTML( std::ostream* out ) const;
void PrintClippedHTML( std::ostream* out, int width ) const;
};
diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp
index 17c95d24a..7497b2af8 100644
--- a/biconcor/PhrasePairCollection.cpp
+++ b/biconcor/PhrasePairCollection.cpp
@@ -13,31 +13,32 @@
using namespace std;
-PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a )
+PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example )
:m_suffixArray(sa)
,m_targetCorpus(tc)
,m_alignment(a)
,m_size(0)
- ,m_max_lookup(10000)
- ,m_max_pp_target(50)
- ,m_max_pp(50)
+ ,m_max_lookup(10000) // maximum number of source occurrences sampled
+ ,m_max_translation(max_translation) // max number of different distinct translations returned
+ ,m_max_example(max_example) // max number of examples returned for each distinct translation
{}
PhrasePairCollection::~PhrasePairCollection()
{}
-bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
+int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
{
INDEX first_match, last_match;
if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
- return false;
+ return 0;
}
- cerr << "\tfirst match " << first_match << endl;
- cerr << "\tlast match " << last_match << endl;
+ //cerr << "\tfirst match " << first_match << endl;
+ //cerr << "\tlast match " << last_match << endl;
INDEX found = last_match - first_match +1;
map< vector< WORD_ID >, INDEX > index;
+ int real_count = 0;
for( INDEX i=first_match; i<=last_match; i++ ) {
int position = m_suffixArray->GetPosition( i );
int source_start = m_suffixArray->GetWordInSentence( position );
@@ -45,23 +46,23 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
INDEX sentence_id = m_suffixArray->GetSentence( position );
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
- cerr << "match " << (i-first_match)
- << " in sentence " << sentence_id
- << ", starting at word " << source_start
- << " of " << sentence_length
- << ". target sentence has " << target_length << " words.";
+ //cerr << "match " << (i-first_match)
+ //<< " in sentence " << sentence_id
+ //<< ", starting at word " << source_start
+ //<< " of " << sentence_length
+ //<< ". target sentence has " << target_length << " words.";
int target_start, target_end, pre_null, post_null;
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
- cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
- cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
+ //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
+ //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
bool null_boundary_words = false;
for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
vector< WORD_ID > targetString;
- cerr << "; ";
+ //cerr << "; ";
for (int target = target_start - pre; target <= target_end + post; target++) {
targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
- cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
+ //cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
}
PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);
// matchCollection.Add( sentence_id, )
@@ -76,37 +77,47 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
}
}
else {
- cerr << "mismatch " << (i-first_match)
- << " in sentence " << sentence_id
- << ", starting at word " << source_start
- << " of " << sentence_length
- << ". target sentence has " << target_length << " words.";
+ //cerr << "mismatch " << (i-first_match)
+ // << " in sentence " << sentence_id
+ // << ", starting at word " << source_start
+ // << " of " << sentence_length
+ // << ". target sentence has " << target_length << " words.";
Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
if (mismatch->Unaligned())
m_unaligned.push_back( mismatch );
else
m_mismatch.push_back( mismatch );
}
- cerr << endl;
+ //cerr << endl;
if (found > (INDEX)m_max_lookup) {
i += found/m_max_lookup-1;
}
+ real_count++;
}
sort(m_collection.begin(), m_collection.end(), CompareBySize());
- return true;
+ return real_count;
}
-void PhrasePairCollection::Print() const
+void PhrasePairCollection::Print(bool pretty) const
{
vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
- for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) {
+ int i=0;
+ for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && i<m_max_translation; i++, ppWithSameTarget++ ) {
(*(ppWithSameTarget->begin()))->PrintTarget( &cout );
int count = ppWithSameTarget->size();
cout << "(" << count << ")" << endl;
- vector< PhrasePair* >::const_iterator p;
- for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) {
- (*p)->Print( &cout, 100 );
+ vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin();
+ for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
+ if (pretty) {
+ (*p)->PrintPretty( &cout, 100 );
+ }
+ else {
+ (*p)->Print( &cout );
+ }
+ if (ppWithSameTarget->size() > m_max_example) {
+ p += ppWithSameTarget->size()/m_max_example-1;
+ }
}
}
}
@@ -117,7 +128,7 @@ void PhrasePairCollection::PrintHTML() const
bool singleton = false;
// loop over all translations
vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
- for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
+ for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
int count = ppWithSameTarget->size();
if (!singleton) {
@@ -143,9 +154,9 @@ void PhrasePairCollection::PrintHTML() const
int i=0;
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
- if (count > m_max_pp) {
- p += count/m_max_pp-1;
- pp += count/m_max_pp-1;
+ if (count > m_max_example) {
+ p += count/m_max_example-1;
+ pp += count/m_max_example-1;
}
}
if (i == 10 && pp < count) {
@@ -153,11 +164,11 @@ void PhrasePairCollection::PrintHTML() const
cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
cout << "<table align=\"center\">";
- for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
+ for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
- if (count > m_max_pp) {
- p += count/m_max_pp-1;
- pp += count/m_max_pp-1;
+ if (count > m_max_example) {
+ p += count/m_max_example-1;
+ pp += count/m_max_example-1;
}
}
}
@@ -172,7 +183,7 @@ void PhrasePairCollection::PrintHTML() const
if (singleton) cout << "</table></div>\n";
else if (pp_target > 9) cout << "</div>";
- size_t max_mismatch = m_max_pp/3;
+ size_t max_mismatch = m_max_example/3;
// unaligned phrases
if (m_unaligned.size() > 0) {
cout << "<p class=\"pp_singleton_header\">unaligned"
diff --git a/biconcor/PhrasePairCollection.h b/biconcor/PhrasePairCollection.h
index f88bfc10f..e076eba9b 100644
--- a/biconcor/PhrasePairCollection.h
+++ b/biconcor/PhrasePairCollection.h
@@ -22,19 +22,19 @@ private:
std::vector< Mismatch* > m_mismatch, m_unaligned;
int m_size;
int m_max_lookup;
- int m_max_pp_target;
- int m_max_pp;
+ int m_max_translation;
+ int m_max_example;
// No copying allowed.
PhrasePairCollection(const PhrasePairCollection&);
void operator=(const PhrasePairCollection&);
public:
- PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * );
+ PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment *, int, int );
~PhrasePairCollection ();
- bool GetCollection( const std::vector<std::string >& sourceString );
- void Print() const;
+ int GetCollection( const std::vector<std::string >& sourceString );
+ void Print(bool pretty) const;
void PrintHTML() const;
};
diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp
index a25e63cb7..f4e7c03fb 100644
--- a/biconcor/biconcor.cpp
+++ b/biconcor/biconcor.cpp
@@ -19,8 +19,12 @@ int main(int argc, char* argv[])
int saveFlag = false;
int createFlag = false;
int queryFlag = false;
- int htmlFlag = false;
- string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n";
+ int htmlFlag = false; // output as HTML
+ int prettyFlag = false; // output readable on screen
+ int stdioFlag = false; // receive requests from STDIN, respond to STDOUT
+ int max_translation = 20;
+ int max_example = 50;
+ string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n";
while(1) {
static struct option long_options[] = {
{"load", required_argument, 0, 'l'},
@@ -29,11 +33,15 @@ int main(int argc, char* argv[])
{"query", required_argument, 0, 'q'},
{"target", required_argument, 0, 't'},
{"alignment", required_argument, 0, 'a'},
- {"html", no_argument, &htmlFlag, 0},
+ {"html", no_argument, 0, 'h'},
+ {"pretty", no_argument, 0, 'p'},
+ {"stdio", no_argument, 0, 'i'},
+ {"translations", required_argument, 0, 'o'},
+ {"examples", required_argument, 0, 'e'},
{0, 0, 0, 0}
};
int option_index = 0;
- int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index);
+ int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index);
if (c == -1) break;
switch (c) {
case 'l':
@@ -62,11 +70,29 @@ int main(int argc, char* argv[])
query = string(optarg);
queryFlag = true;
break;
+ case 'o':
+ max_translation = atoi(optarg);
+ break;
+ case 'e':
+ max_example = atoi(optarg);
+ break;
+ case 'p':
+ prettyFlag = true;
+ break;
+ case 'h':
+ htmlFlag = true;
+ break;
+ case 'i':
+ stdioFlag = true;
+ break;
default:
cerr << info;
exit(1);
}
}
+ if (stdioFlag) {
+ queryFlag = true;
+ }
// check if parameter settings are legal
if (saveFlag && !createFlag) {
@@ -111,12 +137,37 @@ int main(int argc, char* argv[])
targetCorpus.Load( fileNameSuffix );
alignment.Load( fileNameSuffix );
}
- if (queryFlag) {
+ if (stdioFlag) {
+ cout << "-|||- BICONCOR START -|||-" << endl << flush;
+ while(true) {
+ string query;
+ if (getline(cin, query, '\n').eof()) {
+ return 0;
+ }
+ vector< string > queryString = alignment.Tokenize( query.c_str() );
+ PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
+ int total = ppCollection.GetCollection( queryString );
+ cout << "TOTAL: " << total << endl;
+ if (htmlFlag) {
+ ppCollection.PrintHTML();
+ }
+ else {
+ ppCollection.Print(prettyFlag);
+ }
+ cout << "-|||- BICONCOR END -|||-" << endl << flush;
+ }
+ }
+ else if (queryFlag) {
cerr << "query is " << query << endl;
vector< string > queryString = alignment.Tokenize( query.c_str() );
- PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment );
+ PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
ppCollection.GetCollection( queryString );
- ppCollection.PrintHTML();
+ if (htmlFlag) {
+ ppCollection.PrintHTML();
+ }
+ else {
+ ppCollection.Print(prettyFlag);
+ }
}
return 0;
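
[Editor's note] The new --stdio mode turns biconcor into a long-running query server: after loading its model it prints the start banner, then answers each line read from stdin with a "TOTAL: n" count, the matching phrase pairs, and an end banner. A hedged client sketch for this protocol, assuming a built biconcor binary and a saved model file (both paths are placeholders):

    import subprocess

    proc = subprocess.Popen(
        ["./biconcor", "--load", "corpus.biconcor", "--stdio"],  # placeholder paths
        stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True)

    # Wait for the banner that signals the model has finished loading.
    while True:
        banner = proc.stdout.readline()
        if not banner or banner.startswith("-|||- BICONCOR START -|||-"):
            break

    def query(phrase):
        proc.stdin.write(phrase + "\n")
        proc.stdin.flush()
        lines = []
        while True:
            line = proc.stdout.readline()
            if not line or line.startswith("-|||- BICONCOR END -|||-"):
                return lines  # includes the "TOTAL: n" line first
            lines.append(line.rstrip("\n"))
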
diff --git a/bjam b/bjam
index d0d94dedb..0ebf105c3 100755
--- a/bjam
+++ b/bjam
@@ -1,17 +1,17 @@
#!/bin/bash
set -e
+top="$(dirname "$0")"
if
bjam="$(which bjam 2>/dev/null)" && #exists
[ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true
! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script
"${bjam}" --sanity-test 2>/dev/null |grep Sane >/dev/null && #The test in jam-files/sanity.jam passes
- (cd jam-files/fail && ! "${bjam}") >/dev/null #Returns non-zero on failure
+ (cd "${top}/jam-files/fail" && ! "${bjam}") >/dev/null #Returns non-zero on failure
then
#Delegate to system bjam
exec "${bjam}" "$@"
fi
-top="$(dirname "$0")"
if [ ! -x "$top"/jam-files/bjam ] || "$top"/jam-files/bjam -v |grep 2011.4 >/dev/null; then
pushd "$top/jam-files/engine"
./build.sh
diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia
new file mode 100644
index 000000000..1d35a1dea
--- /dev/null
+++ b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia
Binary files differ
diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README
new file mode 100644
index 000000000..e1e12975c
--- /dev/null
+++ b/contrib/arrow-pipelines/python/README
@@ -0,0 +1,32 @@
+Arrow Based Moses Training Pipeline
+===================================
+
+To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command:
+
+$ git submodule init
+
+This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
+
+$ cd libs/pypeline
+$ python setup.py install
+
+Alternatively, you can set an appropriate PYTHONPATH environment variable that points to the Pypeline library.
+
+This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.
+
+Three environment variables need to be set before the manager.py script can be run:
+
+ - MOSES_HOME : The directory where Moses has been cloned, or installed,
+ - IRSTLM : The installation directory of your IRSTLM, and
+ - GIZA_HOME : The installation directory of GIZA++.
+
+The manager.py script takes four positional command-line arguments:
+
+ - The source language code,
+ - The target language code,
+ - The source corpus file. This file *must* be cleaned prior to use, and
+ - The target corpus file. This file *must* be cleaned prior to use.
+
+For example, run the manager.py script with:
+
+$ python manager.py en lt cleantrain.en cleantrain.lt
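
[Editor's note] A hedged sketch of a wrapper around the invocation above, checking the three documented environment variables first (the wrapper itself is illustrative, not part of the commit):

    import os, subprocess, sys

    # The variables the README requires before manager.py can run.
    for var in ("MOSES_HOME", "IRSTLM", "GIZA_HOME"):
        if var not in os.environ:
            sys.exit("%s must be set before running the pipeline" % var)

    # Positional arguments: source lang, target lang, cleaned corpora.
    subprocess.check_call(
        ["python", "manager.py", "en", "lt", "cleantrain.en", "cleantrain.lt"])
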
diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline
new file mode 160000
+Subproject a7084b686f5196f1bbac5d389b4a6cd7f15c83f
diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py
new file mode 100644
index 000000000..1c3ece111
--- /dev/null
+++ b/contrib/arrow-pipelines/python/manager.py
@@ -0,0 +1,192 @@
+import logging
+import os
+
+from concurrent.futures import Future, ThreadPoolExecutor
+from functools import partial
+from pypeline.helpers.parallel_helpers import eval_pipeline, \
+ cons_function_component, \
+ cons_wire, \
+ cons_split_wire, \
+ cons_unsplit_wire, \
+ cons_dictionary_wire
+
+
+#
+# Some logging please
+#
+FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
+logging.basicConfig(format = FORMAT, level = logging.DEBUG)
+logger = logging.getLogger("manager")
+
+
+# Build the pipeline components
+def build_components(components, configuration, executor):
+ pipeline_components = dict()
+ pipeline_configuration = dict()
+
+ for component_id, module_name in components.items():
+ logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))
+
+ module = __import__(module_name, fromlist = ['configure', 'initialise'])
+
+ # Component builds its own configuration object
+ config_func = getattr(module, 'configure')
+ component_config = config_func(configuration)
+ pipeline_configuration.update(component_config)
+
+ # Now build the component
+ init_func = getattr(module, 'initialise')
+ component_function = init_func(component_config)
+
+ # A wrapper for the component's function that submits to the executor
+ def get_component_function_wrapper(inner_function, comp_id, mod_name):
+ def component_function_wrapper(a, s):
+ logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
+ (comp_id, mod_name, a, s))
+ return inner_function(a, s)
+
+ return component_function_wrapper
+
+ # Arrowize the component
+ component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))
+
+ # And store
+ pipeline_components[component_id] = component
+
+ return pipeline_components, pipeline_configuration
+
+
+# Go!
+def main(src_lang, trg_lang, src_filename, trg_filename):
+ # Global configuration
+ # One day, this configuration shall be constructed from
+ # command line options, or a properties file.
+ configuration = {
+ 'moses_installation_dir': os.environ['MOSES_HOME'],
+ 'irstlm_installation_dir': os.environ['IRSTLM'],
+ 'giza_installation_dir': os.environ['GIZA_HOME'],
+ 'src_lang': src_lang,
+ 'src_tokenisation_dir': './tokenisation',
+ 'trg_lang': trg_lang,
+ 'trg_tokenisation_dir': './tokenisation',
+ 'segment_length_limit': 60,
+ 'irstlm_smoothing_method': 'improved-kneser-ney',
+ 'language_model_directory': './language-model',
+ 'translation_model_directory': './translation-model',
+ 'mert_working_directory': './mert',
+ 'evaluation_data_size': 100,
+ 'development_data_size': 100
+ }
+
+ # The modules to load
+ # In the future, the components shall be specified in some kind
+ # of pipeline description file.
+ component_modules = {
+ 'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
+ 'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
+ 'cleanup': 'training.components.cleanup.cleanup',
+ 'data_split': 'training.components.data_split.data_split',
+ 'irstlm_build': 'training.components.irstlm_build.irstlm_build',
+ 'model_training': 'training.components.model_training.model_training',
+ 'mert': 'training.components.mert.mert'
+ }
+
+ # The thread pool
+ executor = ThreadPoolExecutor(max_workers = 3)
+
+ # Phew, build the required components
+ components, component_config = build_components(component_modules, configuration, executor)
+
+ #
+ # Wire up components
+ # Description of wiring should be, in the future, alongside the component
+ # specification in some kind of configuration file. Components shall be
+ # declared then used, i.e., bind a component instance to a unique component
+ # identifier, then wire component instances together by identifier.
+ #
+
+ #
+ # Tokenisation of source and target...
+ #
+ # IRSTLM Build components
+ irstlm_build_component = cons_split_wire() >> \
+ (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >> \
+ components['irstlm_build']).second() >> \
+ cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
+ 'trg_language_model_filename': b['compiled_lm_filename']})
+
+ # The complete tokenisation component
+ tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
+ irstlm_build_component.second() >> \
+ cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
+ 'trg_filename': b['tokenised_trg_filename'],
+ 'trg_language_model_filename': b['trg_language_model_filename']})
+
+ #
+ # Cleanup and Data Splitting...
+ #
+
+ #
+ # A function that clips off the last '.' delimited string
+ #
+ def clip_last_bit(filename):
+ bn = os.path.basename(filename)
+ directory = os.path.dirname(filename)
+ bits = bn.split(".")
+ bits.pop()
+ return os.path.join(directory, ".".join(bits))
+
+ cleanup_datasplit_component = components['cleanup'] >> \
+ cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
+ 'trg_filename': a['cleaned_trg_filename']}) >> \
+ components['data_split'] >> \
+ cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
+ 'eval_src_filename': a['eval_src_filename'],
+ 'eval_trg_filename': a['eval_trg_filename']})
+
+ #
+ # Translation model training
+ #
+ translation_model_component = cons_split_wire() >> \
+ components['model_training'].first() >> \
+ cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
+ 'development_data_filename': b['eval_src_filename']})
+
+ #
+ # The whole pipeline
+ #
+ pipeline = tokenisation_component >> \
+ cons_split_wire() >> \
+ (cleanup_datasplit_component >> translation_model_component).first() >> \
+ cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
+ 'development_data_filename': clip_last_bit(t['development_data_filename']),
+ 'trg_language_model_filename': b['trg_language_model_filename'],
+ 'trg_language_model_order': 3,
+ 'trg_language_model_type': 9}) >> \
+ components['mert']
+
+
+ #
+ # The input to the pipeline
+ #
+ value = {'src_filename': src_filename,
+ 'trg_filename': trg_filename}
+
+ #
+ # Evaluate the pipeline
+ #
+ logger.info("Evaluating pipeline with input [%s]..." % value)
+ new_value = eval_pipeline(executor, pipeline, value, component_config)
+
+ #
+ # Wait for all components to finish
+ #
+ executor.shutdown(True)
+
+ logger.info("Pipeline evaluated to %s" % new_value)
+
+
+if __name__ == '__main__':
+ import sys
+
+ main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
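
[Editor's note] Every component wired above follows the same module contract: configure(args) selects the slice of global configuration the component needs, and initialise(config) returns a function from an input dictionary (plus a state dictionary) to an output dictionary, which cons_function_component then lifts into a pypeline arrow. A minimal hedged sketch of a conforming component (cons_function_component is the real pypeline import from this commit; everything else is illustrative):

    from pypeline.helpers.helpers import cons_function_component

    def configure(args):
        # Keep only the keys this component needs.
        return {'suffix': args['illustrative_suffix']}

    def initialise(config):
        def process(value, state):
            # Map the input dictionary to the outputs downstream wires expect.
            return {'out_filename': value['in_filename'] + config['suffix']}
        return process

    # Lift the plain function into an arrow, as build_components does.
    component = cons_function_component(initialise({'suffix': '.tok'}))
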
diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/test/__init__.py
diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py
new file mode 100644
index 000000000..628796f7d
--- /dev/null
+++ b/contrib/arrow-pipelines/python/test/test.py
@@ -0,0 +1,11 @@
+import subprocess
+
+def cat(filename, content):
+ fh = open(filename, "w")
+ for line in content:
+ #print(line, file=fh)
+ print >> fh, line
+ fh.close()
+
+def diff(filename1, filename2):
+ subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)
diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py
new file mode 100644
index 000000000..cb2e057ce
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py
@@ -0,0 +1,125 @@
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ result = {}
+ result['segment_length'] = args['segment_length_limit']
+ return result
+
+def initialise(config):
+ def _filter(limit, ifh1, ofh1, ifh2, ofh2):
+ def _short(line):
+ n = 0
+ for c in line:
+ if c == " ":
+ n += 1
+ #print(line, ":", n)
+ return n < limit
+
+ for (l1, l2) in zip(ifh1, ifh2):
+ if _short(l1) and _short(l2):
+ print >>ofh1, l1,
+ print >>ofh2, l2,
+
+ def _make_cleaned_filename(filename):
+ bits = filename.split(".")
+ bits[-1] = "clean"
+ return ".".join(bits)
+
+ def _filter_main(value, config):
+ limit = config['segment_length']
+ (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
+ try:
+ input_src_filename = value['src_filename']
+ input_trg_filename = value['trg_filename']
+
+ print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
+
+ ifh1 = open(input_src_filename, "r")
+ ifh2 = open(input_trg_filename, "r")
+
+ cleaned_src_filename = _make_cleaned_filename(input_src_filename)
+ cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
+ ofh1 = open(cleaned_src_filename, "w")
+ ofh2 = open(cleaned_trg_filename, "w")
+
+ _filter(limit, ifh1, ofh1, ifh2, ofh2)
+
+ return {'cleaned_src_filename': cleaned_src_filename,
+ 'cleaned_trg_filename': cleaned_trg_filename}
+ finally:
+ def _safe_close(fh):
+ if fh is not None:
+ fh.close()
+ _safe_close(ifh1)
+ _safe_close(ifh2)
+ _safe_close(ofh1)
+ _safe_close(ofh2)
+
+ return _filter_main
+
+
+if __name__ == '__main__':
+ import os
+ import tempfile
+ import test.test as thelp
+
+ from pypeline.helpers.helpers import eval_pipeline
+
+
+ def _test_main():
+ configuration = {'segment_length_limit': 20}
+
+ src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
+ trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
+
+ box_eval = {
+ 'src_filename': src_filename[1],
+ 'trg_filename': trg_filename[1],
+ 'cleaned_src_file_expected': src_filename[1] + ".expected",
+ 'cleaned_trg_file_expected': trg_filename[1] + ".expected"
+ }
+
+ try:
+ _prep_files(box_eval)
+ _run_test(configuration, box_eval)
+ finally:
+ _cleanup_files(box_eval)
+
+
+ def _run_test(configuration, box_eval):
+ box_config = configure(configuration)
+ box = initialise(box_config)
+
+ output = eval_pipeline(box, box_eval, box_config)
+ try:
+ thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
+ thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
+ finally:
+ os.unlink(output['cleaned_src_filename'])
+ os.unlink(output['cleaned_trg_filename'])
+
+
+ def _line(line_lengths):
+ def _gen_line(tokens):
+ return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
+ return map(_gen_line, line_lengths)
+
+
+ def _prep_files(box_eval):
+ thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
+ thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
+ #expected output:
+ thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
+ thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
+
+
+ def _cleanup_files(box_eval):
+ try:
+ for key, filename in box_eval.items():
+ os.unlink(filename)
+ except:
+ pass
+
+
+ _test_main()
+
diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py
new file mode 100644
index 000000000..27625c612
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py
@@ -0,0 +1,109 @@
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ result = {}
+ result['segment_length'] = args['segment_length_limit']
+ return result
+
+def initialise(config):
+ def _filter(limit, ifh1, ofh1, ifh2, ofh2):
+ def _short(line):
+ n = 0
+ for c in line:
+ if c == " ":
+ n += 1
+ #print(line, ":", n)
+ return n < limit
+
+ for (l1, l2) in zip(ifh1, ifh2):
+ if _short(l1) and _short(l2):
+ print(l1, end='', file=ofh1)
+ print(l2, end='', file=ofh2)
+
+ def _filter_main(config, value):
+ limit = config['segment_length']
+ (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
+ try:
+ ifh1 = open(value['src_filename'], "r")
+ ifh2 = open(value['trg_filename'], "r")
+ ofh1 = open(value['cleaned_src_filename'], "w")
+ ofh2 = open(value['cleaned_trg_filename'], "w")
+
+ _filter(limit, ifh1, ofh1, ifh2, ofh2)
+
+ return {'cleaned_src_filename': value['cleaned_src_filename'],
+ 'cleaned_trg_filename': value['cleaned_trg_filename']}
+ finally:
+ def _safe_close(fh):
+ if fh is not None:
+ fh.close()
+ _safe_close(ifh1)
+ _safe_close(ifh2)
+ _safe_close(ofh1)
+ _safe_close(ofh2)
+
+ return cons_function_component(_filter_main)
+
+
+if __name__ == '__main__':
+ import os
+ import tempfile
+ import training.components.shared.test as thelp
+
+
+ def _test_main():
+ configuration = {'segment_length_limit': 20}
+
+ src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
+ trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")
+
+ box_eval = {
+ 'src_filename': src_filename[1],
+ 'trg_filename': trg_filename[1],
+ 'cleaned_src_filename': src_filename[1] + ".clean",
+ 'cleaned_trg_filename': trg_filename[1] + ".clean",
+ 'cleaned_src_file_expected': src_filename[1] + ".expected",
+ 'cleaned_trg_file_expected': trg_filename[1] + ".expected"
+ }
+
+ try:
+ _prep_files(box_eval)
+ _run_test(configuration, box_eval)
+ finally:
+ _cleanup_files(box_eval)
+
+
+ def _run_test(configuration, box_eval):
+ from pypeline.helpers.helpers import run_pipeline
+ box_config = configure(configuration)
+ box = initialise(box_config)
+
+ run_pipeline(box, box_config, box_eval)
+ thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
+ thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])
+
+
+ def _line(line_lengths):
+ def _gen_line(tokens):
+ return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
+ return map(_gen_line, line_lengths)
+
+
+ def _prep_files(box_eval):
+ thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
+ thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
+ #expected output:
+ thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
+ thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
+
+
+ def _cleanup_files(box_eval):
+ try:
+ for key, filename in box_eval.items():
+ os.unlink(filename)
+ except:
+ pass
+
+
+ _test_main()
+
diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py
new file mode 100644
index 000000000..b8469cbf6
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py
@@ -0,0 +1,146 @@
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ result = {}
+ result['evaluate_size'] = args['evaluation_data_size']
+ result['development_size'] = args['development_data_size']
+ return result
+
+def initialise(config):
+
+ def _copy(size, inp, ofh1, ofh2):
+ try:
+ while size != 0:
+ (l1, l2) = inp.next()
+ print >>ofh1, l1,
+ print >>ofh2, l2,
+ size -= 1
+ except StopIteration:
+ pass
+
+ def _make_split_filename(filename, data_set):
+ bits = filename.split(".")
+ last = bits.pop()
+ lang_code = bits.pop()
+
+ bits.append(last)
+ bits.append(data_set)
+ bits.append(lang_code)
+
+ new_filename = ".".join(bits)
+ return new_filename
+
+ def _splitter_main(value, config):
+ (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
+ try:
+ input_src_filename = value['src_filename']
+ input_trg_filename = value['trg_filename']
+
+ ifh1 = open(input_src_filename, "r")
+ ifh2 = open(input_trg_filename, "r")
+ inp = iter(zip(ifh1, ifh2))
+
+ result = {}
+ for (data_set, size) in [
+ ('devel', config['development_size']),
+ ('eval', config['evaluate_size']),
+ ('train', -1)
+ ]:
+ output_src_filename = _make_split_filename(input_src_filename, data_set)
+ output_trg_filename = _make_split_filename(input_trg_filename, data_set)
+ ofh1 = open(output_src_filename, "w")
+ ofh2 = open(output_trg_filename, "w")
+
+ _copy(size, inp, ofh1, ofh2)
+ result[data_set + '_src_filename'] = output_src_filename
+ result[data_set + '_trg_filename'] = output_trg_filename
+
+ return result
+
+ finally:
+ def _safe_close(fh):
+ if fh is not None:
+ fh.close()
+ _safe_close(ifh1)
+ _safe_close(ifh2)
+ _safe_close(ofh1)
+ _safe_close(ofh2)
+
+ return _splitter_main
+
+
+if __name__ == '__main__':
+ import os
+ import tempfile
+ import test.test as thelp
+
+ from pypeline.helpers.helpers import eval_pipeline
+
+
+ def _test_main():
+ configuration = {
+ 'evaluation_data_size': 7,
+ 'development_data_size': 13,
+ }
+
+ src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
+ trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
+
+ box_eval = {
+ 'src_filename': src_filename[1],
+ 'trg_filename': trg_filename[1],
+ 'devel_src_expected': src_filename[1] + ".devel.expected",
+ 'devel_trg_expected': trg_filename[1] + ".devel.expected",
+ 'eval_src_expected': src_filename[1] + ".eval.expected",
+ 'eval_trg_expected': trg_filename[1] + ".eval.expected",
+ 'train_src_expected': src_filename[1] + ".train.expected",
+ 'train_trg_expected': trg_filename[1] + ".train.expected",
+ }
+
+ try:
+ _prep_files(box_eval)
+ _run_test(configuration, box_eval)
+ finally:
+ _cleanup_files(box_eval)
+
+
+ def _run_test(configuration, box_eval):
+ box_config = configure(configuration)
+ box = initialise(box_config)
+
+ output = eval_pipeline(box, box_eval, box_config)
+ for data_set in ['devel', 'eval', 'train']:
+ for lang in ['src', 'trg']:
+ filename = output[data_set + '_' + lang + '_filename']
+ filename_expected = box_eval[data_set + '_' + lang + '_expected']
+ thelp.diff(filename_expected, filename)
+
+
+ def _line(line_lengths):
+ def _gen_line(tokens):
+ return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
+ return map(_gen_line, line_lengths)
+
+
+ def _prep_files(box_eval):
+ thelp.cat(box_eval['src_filename'], _line(range(50)))
+ thelp.cat(box_eval['trg_filename'], _line(range(50)))
+ #expected output:
+ thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
+ thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
+ thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
+ thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
+ thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
+ thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))
+
+
+ def _cleanup_files(box_eval):
+ try:
+ for key, filename in box_eval.items():
+ os.unlink(filename)
+ except:
+ pass
+
+
+ _test_main()
+
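
[Editor's note] The _make_split_filename helper assumes corpus files are named <base>.<lang>.<ext> and produces <base>.<ext>.<data_set>.<lang>, so each split file keeps the language code as its final suffix. A hedged pure-Python replica with a worked example (the filename is illustrative):

    def make_split_filename(filename, data_set):
        bits = filename.split(".")
        last = bits.pop()       # extension, e.g. "tok"
        lang_code = bits.pop()  # language code, e.g. "en"
        return ".".join(bits + [last, data_set, lang_code])

    print(make_split_filename("cleantrain.en.tok", "devel"))
    # -> cleantrain.tok.devel.en
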
diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py
new file mode 100644
index 000000000..f65d61973
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py
@@ -0,0 +1,106 @@
+import os
+import shutil
+import subprocess
+import tempfile
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ config = dict()
+ config['irstlm_install_directory'] = args['irstlm_installation_dir']
+ config['smoothing_method'] = args['irstlm_smoothing_method']
+ config['lm_directory'] = args['language_model_directory']
+ return config
+
+def initialise(config):
+ def process(a, s):
+ # Create the LM directory if we need to
+ if os.path.exists(s['lm_directory']) is False:
+ os.makedirs(s['lm_directory'])
+
+ # The filename of the file to chew through
+ start_end_input_filename = a['input_filename']
+ if os.path.exists(start_end_input_filename) is False:
+ raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)
+
+ # Derive the output file name for the add start-end marker processor
+ filename_bits = os.path.basename(start_end_input_filename).split(".")
+ filename_bits[2] = "sb";
+ start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+
+ # Derive the output file name of the LM build
+ filename_bits[2] = "lm"
+ lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+
+ # Derive the compiled LM file name
+ filename_bits[2] = "arpa"
+ compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+
+ # First thing to do is add start and end markers
+ start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
+ infile = open(start_end_input_filename, 'r')
+ outfile = open(start_end_output_filename, 'w')
+ print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
+ return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
+ if return_code:
+ raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
+ start_end_input_filename, start_end_output_filename, return_code)
+
+ # Next build the language model
+ tmp_dir = tempfile.mkdtemp(dir = "/tmp")
+ try:
+ build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
+ "-i", start_end_output_filename,
+ "-t", tmp_dir,
+ "-p",
+ "-s", s['smoothing_method'],
+ "-o", lm_filename]
+ print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
+ return_code = subprocess.check_call(build_lm_cmdline)
+ if return_code:
+ raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
+ finally:
+ if os.path.exists(tmp_dir):
+ shutil.rmtree(tmp_dir)
+
+ # Compile the LM
+ lm_filename = lm_filename + ".gz"
+ compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
+ "--text", "yes",
+ lm_filename,
+ compiled_lm_filename]
+ print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
+ return_code = subprocess.check_call(compile_lm_cmdline)
+ if return_code:
+ raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)
+
+ output = {'add_start_end_filename': start_end_output_filename,
+ 'lm_filename': lm_filename,
+ 'compiled_lm_filename': compiled_lm_filename}
+
+ print "IRSTLM Build: Output = %s" % output
+
+ return output
+
+ return process
+
+
+if __name__ == '__main__':
+ from pypeline.helpers.helpers import eval_pipeline
+
+ lm_dir = os.environ["PWD"]
+ configuration = {'irstlm_root': os.environ["IRSTLM"],
+ 'irstlm_smoothing_method': 'improved-kneser-ney',
+ 'language_model_directory': lm_dir}
+ component_config = configure(configuration)
+ component = initialise(component_config)
+
+ value = eval_pipeline(component,
+ {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
+ component_config)
+ target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
+ 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
+ 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
+ print "Target: %s" % target
+ if value != target:
+ raise Exception("Massive fail!")
diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/mert/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py
new file mode 100755
index 000000000..2b60b1720
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+
+import os, shutil, subprocess
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ result = {}
+ result['src_lang'] = args['src_lang']
+ result['trg_lang'] = args['trg_lang']
+ result['moses_installation_dir'] = args['moses_installation_dir']
+ result['mert_working_dir'] = args['mert_working_directory']
+ return result
+
+def initialise(config):
+
+ def process(a, s):
+ infilename = os.path.abspath(a['development_data_filename'])
+ lm_file = os.path.abspath(a['trg_language_model_filename'])
+ lm_order = int(a['trg_language_model_order'])
+ lm_type = int(a['trg_language_model_type'])
+ orig_moses_ini = os.path.abspath(a['moses_ini_file'])
+
+ if not os.path.exists(orig_moses_ini):
+ raise Exception, "Error: Input moses.ini does not exist"
+
+ workdir = os.path.abspath(config['mert_working_dir'])
+ #simply call the training perl script
+ #remove the workdir if it is already there
+ if os.path.exists(workdir):
+ shutil.rmtree(workdir)
+ os.makedirs(workdir)
+
+ #local vars
+ moses_install_dir = os.path.abspath(config['moses_installation_dir'])
+ mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
+ bin_dir = os.path.join(moses_install_dir, 'bin')
+ moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
+ src_file = infilename + '.' + config['src_lang']
+ ref_file = infilename + '.' + config['trg_lang']
+ logfile = os.path.join(workdir, 'log')
+ #change lm configuration in moses ini
+ moses_ini = os.path.join(workdir, 'trained-moses.ini')
+ cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
+ cmd = cmd % locals()
+ os.system(cmd)
+
+ #the command
+ cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
+ cmd = cmd % locals()
+
+ pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+ pipe.wait()
+
+ #check the moses ini
+ new_mosesini = os.path.join(workdir, 'moses.ini')
+ if not os.path.exists(new_mosesini):
+ raise Exception, 'Failed MERT'
+
+ return {'moses_ini_file':new_mosesini}
+
+ return process
+
+if __name__ == '__main__':
+
+ def __test():
+ configuration = {'src_lang':'en',
+ 'trg_lang':'lt',
+ 'moses_installation_dir':os.path.abspath('../../../../'),
+ 'mert_working_dir':'../../../../../tuning'}
+ values = {'development_data_filename':'../../../../../corpus/tune',
+ 'moses_ini_file':'../../../../../model/model/moses.ini',
+ 'trg_language_model_filename':'../../../../../corpus/train.lt.lm',
+ 'trg_language_model_type':9,
+ 'trg_language_model_order':4}
+ from pypeline.helpers.helpers import run_pipeline
+ box_config = configure(configuration)
+ box = initialise(configuration)
+ print run_pipeline(box, values, None)
+
+ #do some test
+ __test()
+
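
[Editor's note] The sed one-liner in process() replaces everything from the [lmodel-file] header up to the next blank line in moses.ini with a freshly formatted stanza. For readers not fluent in sed address ranges, a hedged Python equivalent of that edit (the helper is illustrative):

    def rewrite_lmodel(ini_lines, lm_type, lm_order, lm_file):
        out, in_block = [], False
        for line in ini_lines:
            if not in_block and line.strip() == "[lmodel-file]":
                in_block = True  # drop the old stanza from here on
                out.append("[lmodel-file]\n")
                out.append("%s 0 %s %s\n" % (lm_type, lm_order, lm_file))
            elif in_block:
                if line.strip() == "":
                    in_block = False
                    out.append("\n")  # keep the blank line ending the stanza
            else:
                out.append(line)
        return out
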
diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py
new file mode 100755
index 000000000..e990307d2
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+import os, shutil, subprocess
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ result = {}
+ result['src_lang'] = args['src_lang']
+ result['trg_lang'] = args['trg_lang']
+ result['moses_installation_dir'] = args['moses_installation_dir']
+ result['external_bin_dir'] = args['giza_installation_dir']
+ result['model_directory'] = args['translation_model_directory']
+ return result
+
+def initialise(config):
+
+ def process(a, s):
+ infilename = os.path.abspath(a['training_data_filename'])
+ workdir = os.path.abspath(config['model_directory'])
+ #simply call the training perl script
+ #remove the workdir if it is already there
+ if os.path.exists(workdir):
+ shutil.rmtree(workdir)
+ os.makedirs(workdir)
+
+ #local vars
+ train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
+ src_lang = config['src_lang'].lower()
+ trg_lang = config['trg_lang'].lower()
+ external_bin = os.path.abspath(config['external_bin_dir'])
+ #create a dummy lm file
+ dummy_lmfile = workdir + os.sep + 'dummy.lm'
+ f = open(dummy_lmfile, 'w')
+ print >> f, "dummy lm file"
+ f.close()
+ logfile = workdir + os.sep + 'log'
+
+ #the command
+ cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
+
+ cmd = cmd % locals()
+
+ pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+ pipe.wait()
+
+ #check the moses ini
+ mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
+ if not os.path.exists(mosesini):
+ raise Exception, 'Failed training model'
+
+ return {'moses_ini_file':mosesini}
+
+ return process
+
+if __name__ == '__main__':
+
+ def __test():
+ configuration = {'src_lang':'en',
+ 'trg_lang':'lt',
+ 'moses_installation_dir':os.environ['MOSES_HOME'],
+ 'giza_installation_dir':os.environ['GIZA_HOME'],
+ 'translation_model_directory':'model-dir'}
+ values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
+ from pypeline.helpers.helpers import run_pipeline
+ box_config = configure(configuration)
+ box = initialise(box_config)
+ print run_pipeline(box, values, None)
+
+ #do some test
+ __test()
+
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py
new file mode 100755
index 000000000..57f8771df
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import os
+
+from tokenizer import Tokenizer
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ result = {}
+ result['src_lang'] = args['src_lang']
+ result['src_tokenisation_dir'] = args['src_tokenisation_dir']
+ result['moses_installation_dir'] = args['moses_installation_dir']
+ return result
+
+def initialise(config):
+
+ def process(a, s):
+ infilename = a['src_filename']
+ outfilename = Tokenizer.batch_tokenise(
+ config['src_lang'],
+ config['moses_installation_dir'],
+ infilename,
+ config['src_tokenisation_dir'])
+ return {'tokenised_src_filename':outfilename}
+
+ return process
+
+if __name__ == '__main__':
+
+ def __test():
+ configuration = {'src_lang':'de',
+ 'src_tokenisation_dir':'tmptok',
+ 'moses_installation_dir':os.path.abspath('../../../../')}
+ values = {'src_filename':'tmp.de'}
+ from pypeline.helpers.helpers import run_pipeline
+ box_config = configure(configuration)
+ box = initialise(configuration)
+ print run_pipeline(box, values, None)
+
+ #do some test
+ __test()
+
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de
new file mode 100644
index 000000000..c6b41edbe
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de
@@ -0,0 +1,3 @@
+asdfweoih
+awfwoeijf awefo
+what's this
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py
new file mode 100644
index 000000000..354ec1abc
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+import sys, os, subprocess
+
+class Tokenizer:
+
+ @staticmethod
+ def batch_tokenise(lang, mosesdir, infilename, workdir):
+ print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
+ if not os.path.exists(workdir):
+ os.makedirs(workdir)
+ tok = Tokenizer(lang, mosesdir)
+ basefilename = os.path.basename(infilename)
+ outfilename = workdir + os.sep + basefilename + '.tok'
+ tok.file_tokenise(infilename, outfilename)
+ return outfilename
+
+ def __init__(self, lang, mosesdir):
+ self.arrows = None
+ self.lang = lang
+ #check the perl tokenizer is here
+ #path = os.path.dirname(os.path.abspath(__file__))
+ path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
+ self.perltok = path + os.sep + 'tokenizer.perl'
+ if not os.path.exists(self.perltok):
+ raise Exception, "Perl tokenizer does not exist at [%s]" % self.perltok
+
+ def file_tokenise(self, infilename, outfilename):
+ cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
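+ # for reference, this builds a shell command of the shape (paths illustrative):
+ #   <mosesdir>/scripts/tokenizer/tokenizer.perl -q -l de < corpus.de > corpus.de.tok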
+ pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+ pipe.wait()
+
+if __name__ == '__main__':
+ #do some test
+ pass
+
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py
new file mode 100755
index 000000000..3852e296f
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import os
+
+from tokenizer import Tokenizer
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+ result = {}
+ result['trg_lang'] = args['trg_lang']
+ result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
+ result['moses_installation_dir'] = args['moses_installation_dir']
+ return result
+
+def initialise(config):
+
+ def process(a, s):
+ infilename = a['trg_filename']
+ outfilename = Tokenizer.batch_tokenise(
+ config['trg_lang'],
+ config['moses_installation_dir'],
+ infilename,
+ config['trg_tokenisation_dir'])
+ return {'tokenised_trg_filename':outfilename}
+
+ return process
+
+if __name__ == '__main__':
+
+ def __test():
+ configuration = {'trg_lang':'de',
+ 'trg_tokenisation_dir':'tmptoktrg',
+ 'moses_installation_dir':os.path.abspath('../../../../')}
+ values = {'trg_filename':'tmp.de'}
+ from pypeline.helpers.helpers import run_pipeline
+ box_config = configure(configuration)
+ box = initialise(box_config)
+ print run_pipeline(box, values, None)
+
+ #do some test
+ __test()
+
diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject
index e135b8886..f551380fd 100644
--- a/contrib/other-builds/OnDiskPt/.cproject
+++ b/contrib/other-builds/OnDiskPt/.cproject
@@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
- <builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+ <builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -133,8 +133,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope" versionNumber="1">
- <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+ </configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject
index 7529a7799..fc08b4c3d 100644
--- a/contrib/other-builds/extractor/.cproject
+++ b/contrib/other-builds/extractor/.cproject
@@ -18,11 +18,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+ <option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+ </option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
@@ -119,5 +122,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/extractor"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/extractor"/>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject
index 2036e6b18..e3e47fd7e 100644
--- a/contrib/other-builds/lm/.cproject
+++ b/contrib/other-builds/lm/.cproject
@@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
- <builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+ <builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -131,7 +131,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/lm"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/lm"/>
+ </configuration>
+ </storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project
index e75388ac1..a1bde37c2 100644
--- a/contrib/other-builds/lm/.project
+++ b/contrib/other-builds/lm/.project
@@ -142,11 +142,6 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI>
</link>
<link>
- <name>build_binary.cc</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI>
- </link>
- <link>
<name>clean.sh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/clean.sh</locationURI>
@@ -177,11 +172,6 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI>
</link>
<link>
- <name>fragment.cc</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI>
- </link>
- <link>
<name>left.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/left.hh</locationURI>
@@ -212,11 +202,6 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI>
</link>
<link>
- <name>max_order.cc</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI>
- </link>
- <link>
<name>max_order.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/max_order.hh</locationURI>
@@ -242,11 +227,6 @@
<locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI>
</link>
<link>
- <name>ngram_query.cc</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI>
- </link>
- <link>
<name>ngram_query.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.hh</locationURI>
diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject
index 41a471cd1..e1c19b822 100644
--- a/contrib/other-builds/mert_lib/.cproject
+++ b/contrib/other-builds/mert_lib/.cproject
@@ -7,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/>
- <entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+ <entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@@ -23,13 +23,14 @@
<folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
- <builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
+ <builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug">
<option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
</tool>
@@ -45,11 +46,8 @@
</tool>
</toolChain>
</folderInfo>
- <fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460">
- <tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
- </fileInfo>
<sourceEntries>
- <entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+ <entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
@@ -61,7 +59,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/>
- <entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+ <entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@@ -119,5 +117,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/mert_lib"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/mert_lib"/>
+ </configuration>
+ </storageModule>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject
index fedda926b..71462b5df 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/moses-chart-cmd/.cproject
@@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -46,6 +46,7 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
@@ -70,9 +71,11 @@
<listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
<listOptionValue builtIn="false" value="z"/>
+ <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+ <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
<listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="boost_thread-mt"/>
- <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+ <listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="rt"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
@@ -154,8 +157,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope" versionNumber="1">
- <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+ </configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 10b6784d4..42d2100d8 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -19,7 +19,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -46,6 +46,8 @@
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
<option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
+ <listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
@@ -69,8 +71,11 @@
<listOptionValue builtIn="false" value="z"/>
<listOptionValue builtIn="false" value="boost_system-mt"/>
<listOptionValue builtIn="false" value="boost_thread-mt"/>
+ <listOptionValue builtIn="false" value="boost_iostreams-mt"/>
+ <listOptionValue builtIn="false" value="boost_filesystem-mt"/>
<listOptionValue builtIn="false" value="lm"/>
<listOptionValue builtIn="false" value="util"/>
+ <listOptionValue builtIn="false" value="bz2"/>
<listOptionValue builtIn="false" value="rt"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.983725033" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
@@ -155,8 +160,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope" versionNumber="1">
- <resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
+ </configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject
index e54a1385b..787024533 100644
--- a/contrib/other-builds/moses/.cproject
+++ b/contrib/other-builds/moses/.cproject
@@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?>
-
-<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@@ -9,7 +7,7 @@
<externalSetting>
<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/>
- <entry flags="RESOLVED" kind="libraryFile" name="moses"/>
+ <entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/>
</externalSetting>
</externalSettings>
<extensions>
@@ -26,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -152,8 +150,14 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope" versionNumber="1">
- <resource resourceType="PROJECT" workspacePath="/moses"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/moses"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/moses"/>
+ </configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
+ <storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
diff --git a/contrib/other-builds/search/.cproject b/contrib/other-builds/search/.cproject
index 9ccb8f8e9..2de36fecd 100644
--- a/contrib/other-builds/search/.cproject
+++ b/contrib/other-builds/search/.cproject
@@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
- <builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+ <builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
<tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -127,6 +127,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/search"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/search"/>
+ </configuration>
+ </storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
</cproject>
diff --git a/contrib/other-builds/search/.project b/contrib/other-builds/search/.project
index efad842ea..95f074aae 100644
--- a/contrib/other-builds/search/.project
+++ b/contrib/other-builds/search/.project
@@ -157,11 +157,6 @@
<locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI>
</link>
<link>
- <name>vertex_generator.cc</name>
- <type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI>
- </link>
- <link>
<name>vertex_generator.hh</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.hh</locationURI>
diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject
index ab37362a4..2fd4d2dfb 100644
--- a/contrib/other-builds/util/.cproject
+++ b/contrib/other-builds/util/.cproject
@@ -24,7 +24,7 @@
<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
- <builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+ <builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -136,8 +136,13 @@
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
</scannerConfigBuildInfo>
</storageModule>
- <storageModule moduleId="refreshScope" versionNumber="1">
- <resource resourceType="PROJECT" workspacePath="/util"/>
+ <storageModule moduleId="refreshScope" versionNumber="2">
+ <configuration configurationName="Release">
+ <resource resourceType="PROJECT" workspacePath="/util"/>
+ </configuration>
+ <configuration configurationName="Debug">
+ <resource resourceType="PROJECT" workspacePath="/util"/>
+ </configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/rpm/README b/contrib/rpm/README
new file mode 100644
index 000000000..8ba7ef4da
--- /dev/null
+++ b/contrib/rpm/README
@@ -0,0 +1,42 @@
+Building Moses RPM
+==================
+
+*** WARNING ***
+Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer.
+*** WARNING ***
+
+
+Building the RPM SPEC file
+--------------------------
+
+The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed for the RPM build. It needs the following information:
+
+ - The Git repository from which an installer will be built,
+ - The branch in the Git repository to build, and
+ - The version of the installed Moses distribution.
+
+For example, to build the RELEASE-1.0 branch of the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
+
+$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
+
+This builds the source tarball in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.
+
+
+Building the RPM
+----------------
+
+Change directory to $HOME/rpmbuild, and build the binary RPM with:
+
+$ rpmbuild -bb SPECS/moses.spec
+
+This will download IRSTLM v5.70.04 and GIZA++ v2, build them along with Moses, and write the RPM to $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm.
+
+For example, building v1.0 on a 64-bit Intel architecture produces an RPM called moses-1.0-1.x86_64.rpm.
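+
+To check the contents of the built RPM (file name as in the example above), a query such as the following can be used:
+
+$ rpm -qlp $HOME/rpmbuild/RPMS/x86_64/moses-1.0-1.x86_64.rpm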
+
+
+Building a Debian package
+-------------------------
+
+The Alien tool converts RPM packages to Debian packages. If a Debian package is required, follow the instructions on the following web page:
+
+https://help.ubuntu.com/community/RPM/AlienHowto
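+
+For example, assuming Alien is installed and using the v1.0 RPM built above:
+
+$ sudo alien --to-deb moses-1.0-1.x86_64.rpm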
diff --git a/contrib/rpm/build_source.sh b/contrib/rpm/build_source.sh
new file mode 100755
index 000000000..d0fac6a33
--- /dev/null
+++ b/contrib/rpm/build_source.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+BRANCH="master"
+declare -i NO_RPM_BUILD=0
+declare -r RPM_VERSION_TAG="___RPM_VERSION__"
+
+function usage() {
+ echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
+ exit 1
+}
+
+if [ $# -lt 4 ]; then
+ usage
+fi
+
+while getopts r:b:v:nh OPTION
+do
+ case "$OPTION" in
+ r) REPO="${OPTARG}";;
+ b) BRANCH="${OPTARG}";;
+ v) VERSION="${OPTARG}";;
+ n) NO_RPM_BUILD=1;;
+ [h\?]) usage;;
+ esac
+done
+
+if [ ! -d ./rpmbuild ]; then
+ echo "RPM build directory not in current working direcotry"
+ exit 1
+fi
+
+declare -r MOSES_DIR="moses-${VERSION}"
+git clone ${REPO} ${MOSES_DIR}
+if [ $? -ne 0 ]; then
+ echo "Failed to clone Git repository ${REPO}"
+ exit 3
+fi
+
+cd ${MOSES_DIR}
+
+git checkout ${BRANCH}
+if [ $? -ne 0 ]; then
+ echo "Failed to checkout branch ${BRANCH}"
+ exit 3
+fi
+
+cd ..
+
+tar -cf moses-${VERSION}.tar ${MOSES_DIR}
+gzip -f9 moses-${VERSION}.tar
+
+if [ ${NO_RPM_BUILD} -eq 0 ]; then
+ if [ ! -d ${HOME}/rpmbuild/SPECS ]; then
+ mkdir -p ${HOME}/rpmbuild/SPECS
+ fi
+ eval sed s/${RPM_VERSION_TAG}/${VERSION}/ ./rpmbuild/SPECS/moses.spec > ${HOME}/rpmbuild/SPECS/moses.spec
+ if [ ! -d ${HOME}/rpmbuild/SOURCES ]; then
+ mkdir -p ${HOME}/rpmbuild/SOURCES
+ fi
+ mv moses-${VERSION}.tar.gz ${HOME}/rpmbuild/SOURCES
+fi
+
+rm -Rf ${MOSES_DIR}
diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec
new file mode 100644
index 000000000..0f4a6c6ec
--- /dev/null
+++ b/contrib/rpm/rpmbuild/SPECS/moses.spec
@@ -0,0 +1,65 @@
+Name: moses
+Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
+Version: ___RPM_VERSION__
+Release: 1
+URL: http://www.statmt.org/moses/
+Source0: %{name}-%{version}.tar.gz
+License: LGPL
+Group: Development/Tools
+Vendor: Capita Translation and Interpreting
+Packager: Ian Johnson <ian.johnson@capita-ti.com>
+Requires: boost >= 1.48, python >= 2.6, perl >= 5
+BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
+%description
+Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm quickly finds the highest probability translation among the exponential number of choices.
+%prep
+%setup -q
+
+mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
+
+wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz
+wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
+
+cd $RPM_BUILD_DIR
+
+tar -zxf irstlm-5.70.04.tgz
+tar -zxf giza-pp-v1.0.7.tgz
+
+cd irstlm-5.70.04
+bash regenerate-makefiles.sh --force
+./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04
+make
+make install
+
+cd ../giza-pp
+make
+cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
+%build
+./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
+%install
+mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R bin $RPM_BUILD_ROOT/opt/moses
+cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
+%clean
+%files
+%defattr(-,root,root)
+/opt/moses/bin/*
+/opt/moses/scripts/analysis/*
+/opt/moses/scripts/ems/*
+/opt/moses/scripts/generic/*
+/opt/moses/scripts/other/*
+/opt/moses/scripts/recaser/*
+/opt/moses/scripts/regression-testing/*
+/opt/moses/scripts/share/*
+/opt/moses/scripts/tokenizer/*
+/opt/moses/scripts/training/*
+/opt/moses/irstlm-5.70.04/*
+/opt/moses/giza++-v1.0.7/*
diff --git a/contrib/server/client.py b/contrib/server/client.py
new file mode 100755
index 000000000..43e77555a
--- /dev/null
+++ b/contrib/server/client.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# python port of client.perl
+
+import xmlrpclib
+import datetime
+
+url = "http://localhost:8080/RPC2"
+proxy = xmlrpclib.ServerProxy(url)
+
+text = u"il a souhaité que la présidence trace à nice le chemin pour l' avenir ."
+params = {"text":text, "align":"true", "report-all-factors":"true"}
+
+result = proxy.translate(params)
+print result['text']
+if 'align' in result:
+ print "Phrase alignments:"
+ aligns = result['align']
+ for align in aligns:
+ print "%s,%s,%s" %(align['tgt-start'], align['src-start'], align['src-end'])
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 98024c891..5d9c40a9b 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -1,6 +1,8 @@
#include "util/check.hh"
#include <stdexcept>
#include <iostream>
+#include <vector>
+#include <algorithm>
#include "moses/ChartManager.h"
@@ -54,7 +56,7 @@ public:
PhraseDictionaryDynSuffixArray* pdsa = (PhraseDictionaryDynSuffixArray*) pdf->GetDictionary();
cerr << "Inserting into address " << pdsa << endl;
pdsa->insertSnt(source_, target_, alignment_);
- if(add2ORLM_) {
+ if(add2ORLM_) {
updateORLM();
}
cerr << "Done inserting\n";
@@ -83,8 +85,8 @@ public:
const std::string sBOS = orlm->GetSentenceStart()->GetString();
const std::string sEOS = orlm->GetSentenceEnd()->GetString();
Utils::splitToStr(target_, vl, " ");
- // insert BOS and EOS
- vl.insert(vl.begin(), sBOS);
+ // insert BOS and EOS
+ vl.insert(vl.begin(), sBOS);
vl.insert(vl.end(), sEOS);
for(int j=0; j < vl.size(); ++j) {
int i = (j<ngOrder) ? 0 : j-ngOrder+1;
@@ -177,7 +179,7 @@ public:
map<string, xmlrpc_c::value> retData;
if (staticData.IsChart()) {
- TreeInput tinput;
+ TreeInput tinput;
const vector<FactorType> &inputFactorOrder =
staticData.GetInputFactorOrder();
stringstream in(source + "\n");
@@ -260,10 +262,16 @@ public:
}
+
+ bool compareSearchGraphNode(const SearchGraphNode& a, const SearchGraphNode& b) {
+ return a.hypo->GetId() < b.hypo->GetId();
+ }
+
void insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData) {
vector<xmlrpc_c::value> searchGraphXml;
vector<SearchGraphNode> searchGraph;
manager.GetSearchGraph(searchGraph);
+ std::sort(searchGraph.begin(), searchGraph.end(), compareSearchGraphNode);
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin(); i != searchGraph.end(); ++i) {
map<string, xmlrpc_c::value> searchGraphXmlNode;
searchGraphXmlNode["forward"] = xmlrpc_c::value_double(i->forward);
diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp
index f06d2b430..6ab1a5657 100644
--- a/contrib/sigtest-filter/filter-pt.cpp
+++ b/contrib/sigtest-filter/filter-pt.cpp
@@ -287,24 +287,24 @@ SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicati
if (hierarchical) {
// std::cerr << "splitting up phrase: " << phrase << "\n";
int pos = 0;
- int endPos = 0;
+ int NTStartPos, NTEndPos;
vector<std::string> phrases;
-
- while (rule.find("[X][X] ", pos) < rule.size()) {
- endPos = rule.find("[X][X] ",pos) - 1; // -1 to cut space before NT
- if (endPos < pos) { // no space: NT at start of rule (or two consecutive NTs)
- pos += 7;
+ while (rule.find("] ", pos) < rule.size()) {
+ NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT
+ NTEndPos = rule.find("] ",pos);
+ if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs)
+ pos = NTEndPos + 2;
continue;
}
- phrases.push_back(rule.substr(pos,endPos-pos));
- pos = endPos + 8;
+ phrases.push_back(rule.substr(pos,NTStartPos-pos));
+ pos = NTEndPos + 2;
}
- // cut LHS of rule
- endPos = rule.size()-4;
- if (endPos > pos) {
- phrases.push_back(rule.substr(pos,endPos-pos));
+ NTStartPos = rule.find("[",pos) - 1; // LHS of rule
+ if (NTStartPos > pos) {
+ phrases.push_back(rule.substr(pos,NTStartPos-pos));
}
+
sa_set = lookup_multiple_phrases(phrases, my_sa, rule, cache);
}
else {
diff --git a/contrib/tmcombine/README.md b/contrib/tmcombine/README.md
index 2cbc83299..7b8ebd45e 100644
--- a/contrib/tmcombine/README.md
+++ b/contrib/tmcombine/README.md
@@ -58,7 +58,7 @@ Regression tests (check if the output files (`test/phrase-table_testN`) differ f
FURTHER NOTES
-------------
- - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models.
+ - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models.
- The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). Sort the tables with `LC_ALL=C`. Phrase tables produced by Moses are sorted correctly.
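+   For instance (file names hypothetical): `LC_ALL=C sort phrase-table > phrase-table.sorted`.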
diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py
index 0bbcf7c78..5b65cc590 100755
--- a/contrib/tmcombine/tmcombine.py
+++ b/contrib/tmcombine/tmcombine.py
@@ -15,7 +15,7 @@
# Some general things to note:
-# - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models.
+# - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models.
# - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). sort with LC_ALL=C.
# - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007).
# - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files.
@@ -306,7 +306,7 @@ class Moses():
# assuming that alignment is empty
elif len(line) == 4:
if self.require_alignment:
- sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment\n')
+ sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n')
exit()
self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')]
diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
index e610cbdd0..af3f26bf2 100644
--- a/mert/InterpolatedScorer.cpp
+++ b/mert/InterpolatedScorer.cpp
@@ -164,7 +164,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
{
stringstream buff;
string align = text;
- string sentence = "";
+ string sentence = text;
size_t alignmentData = text.find("|||");
//Get sentence and alignment parts
if(alignmentData != string::npos) {
diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp
index 09e06fcf6..b65873881 100644
--- a/moses-chart-cmd/IOWrapper.cpp
+++ b/moses-chart-cmd/IOWrapper.cpp
@@ -620,10 +620,27 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
template <class T>
void ShiftOffsets(vector<T> &offsets, T shift)
{
+ T currPos = shift;
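+ // e.g. offsets = {0, 3, 0} with shift = 5 (values illustrative): the zero entries
+ // (terminals) receive positions 5 and 9, while the non-zero entry, a child span
+ // of width 3, advances currPos past the words it covers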
for (size_t i = 0; i < offsets.size(); ++i) {
- shift += offsets[i];
- offsets[i] += shift;
+ if (offsets[i] == 0) {
+ offsets[i] = currPos;
+ ++currPos;
+ }
+ else {
+ currPos += offsets[i];
+ }
+ }
+}
+
+size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
+{
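+ // e.g. (illustrative) a hypo covering 5 source words with two child hypos covering
+ // 2 and 3 words has 5 - (2-1) - (3-1) = 2 terminals of its own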
+ size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
+ const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+ for (size_t i = 0; i < prevHypos.size(); ++i) {
+ size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
+ ret -= (childSize - 1);
}
+ return ret;
}
size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget)
@@ -635,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
- vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
+ size_t thisSourceSize = CalcSourceSize(hypo);
+
+ // position of each terminal word in translation rule, irrespective of alignment
+ // if non-term, number is undefined
+ vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren();
@@ -655,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
const ChartTrellisNode &prevNode = *prevNodes[sourceInd];
- // 1st. calc source size
+ // calc source size
size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
- // 2nd. calc target size. Recursively look thru child hypos
+ // calc target size.
+ // Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget);
targetOffsets[targetPos] = targetSize;
@@ -672,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
}
}
- // 3rd. shift offsets
+ // convert position within translation rule to absolute position within
+ // source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
- vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
- OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
- for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
- const set<size_t> &targets = retAlignmentsS2T[source];
- set<size_t>::const_iterator iter;
- for (iter = targets.begin(); iter != targets.end(); ++iter) {
- size_t target = *iter;
- pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
- ,target + targetOffsets[target]);
- pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
- CHECK(ret.second);
-
- }
+ AlignmentInfo::const_iterator iter;
+ for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+ const std::pair<size_t,size_t> &align = *iter;
+ size_t relSource = align.first;
+ size_t relTarget = align.second;
+ size_t absSource = sourceOffsets[relSource];
+ size_t absTarget = targetOffsets[relTarget];
+
+ pair<size_t, size_t> alignPoint(absSource, absTarget);
+ pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+ CHECK(ret.second);
}
return totalTargetSize;
@@ -702,14 +723,16 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
{
ostringstream out;
- Alignments retAlign;
- OutputAlignment(retAlign, hypo, 0);
+ if (hypo) {
+ Alignments retAlign;
+ OutputAlignment(retAlign, hypo, 0);
- // output alignments
- Alignments::const_iterator iter;
- for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
- const pair<size_t, size_t> &alignPoint = *iter;
- out << alignPoint.first << "-" << alignPoint.second << " ";
+ // output alignments
+ Alignments::const_iterator iter;
+ for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
+ const pair<size_t, size_t> &alignPoint = *iter;
+ out << alignPoint.first << "-" << alignPoint.second << " ";
+ }
}
out << endl;
@@ -723,7 +746,11 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
- vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
+ size_t thisSourceSize = CalcSourceSize(hypo);
+
+ // position of each terminal word in translation rule, irrespective of alignment
+ // if non-term, number is undefined
+ vector<size_t> sourceOffsets(thisSourceSize, 0);
vector<size_t> targetOffsets(tp.GetSize(), 0);
const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
@@ -743,11 +770,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
const ChartHypothesis *prevHypo = prevHypos[sourceInd];
- // 1st. calc source size
+ // calc source size
size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
sourceOffsets[sourcePos] = sourceSize;
- // 2nd. calc target size. Recursively look thru child hypos
+ // calc target size.
+ // Recursively look thru child hypos
size_t currStartTarget = startTarget + totalTargetSize;
size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
targetOffsets[targetPos] = targetSize;
@@ -760,27 +788,27 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
}
}
- // 3rd. shift offsets
+ // convert position within translation rule to absolute position within
+ // source sentence / output sentence
ShiftOffsets(sourceOffsets, startSource);
ShiftOffsets(targetOffsets, startTarget);
// get alignments from this hypo
- vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
- OutputAlignment(retAlignmentsS2T, aiTerm);
// add to output arg, offsetting by source & target
- for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
- const set<size_t> &targets = retAlignmentsS2T[source];
- set<size_t>::const_iterator iter;
- for (iter = targets.begin(); iter != targets.end(); ++iter) {
- size_t target = *iter;
- pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
- ,target + targetOffsets[target]);
- pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
- CHECK(ret.second);
+ AlignmentInfo::const_iterator iter;
+ for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+ const std::pair<size_t,size_t> &align = *iter;
+ size_t relSource = align.first;
+ size_t relTarget = align.second;
+ size_t absSource = sourceOffsets[relSource];
+ size_t absTarget = targetOffsets[relTarget];
+
+ pair<size_t, size_t> alignPoint(absSource, absTarget);
+ pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+ CHECK(ret.second);
- }
}
return totalTargetSize;
diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index f11516839..335a570a6 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -262,6 +262,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
out << std::endl;
}
+void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
+{
+ std::vector<const Hypothesis *> edges;
+ const Hypothesis *currentHypo = hypo;
+ while (currentHypo) {
+ edges.push_back(currentHypo);
+ currentHypo = currentHypo->GetPrevHypo();
+ }
+
+ OutputAlignment(out, edges);
+}
+
void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
{
ostringstream out;
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index 8f164dfb3..8dbdeda9c 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -137,7 +137,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
-
+void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
}
diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index 04f395a81..bddc10911 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -1,4 +1,4 @@
-alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ../moses//moses ;
+alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
exe moses : Main.cpp deps ;
exe lmbrgrid : LatticeMBRGrid.cpp deps ;
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index ac4527aae..b08ba532a 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -23,6 +23,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
* Moses main, for single-threaded and multi-threaded.
**/
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filter/bzip2.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+
#include <exception>
#include <fstream>
#include <sstream>
@@ -83,14 +90,18 @@ public:
OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
OutputCollector* detailedTranslationCollector,
OutputCollector* alignmentInfoCollector,
- OutputCollector* unknownsCollector) :
+ OutputCollector* unknownsCollector,
+ bool outputSearchGraphSLF,
+ bool outputSearchGraphHypergraph) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
m_detailedTranslationCollector(detailedTranslationCollector),
m_alignmentInfoCollector(alignmentInfoCollector),
- m_unknownsCollector(unknownsCollector) {}
+ m_unknownsCollector(unknownsCollector),
+ m_outputSearchGraphSLF(outputSearchGraphSLF),
+ m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
/** Translate one sentence
* gets called by main function implemented at end of this source file */
@@ -143,6 +154,96 @@ public:
#endif
}
+ // Output search graph in HTK standard lattice format (SLF)
+ if (m_outputSearchGraphSLF) {
+ stringstream fileName;
+ fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
+ std::ofstream file(fileName.str().c_str());
+ if (file.is_open() && file.good()) {
+ ostringstream out;
+ fix(out,PRECISION);
+ manager.OutputSearchGraphAsSLF(m_lineNumber, out);
+ file << out.str();
+ file.flush();
+ } else {
+ TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
+ }
+ }
+
+ // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
+ if (m_outputSearchGraphHypergraph) {
+
+ vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph");
+
+ bool appendSuffix;
+ if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
+ appendSuffix = true;
+ } else {
+ appendSuffix = false;
+ }
+
+ string compression;
+ if (hypergraphParameters.size() > 1) {
+ compression = hypergraphParameters[1];
+ } else {
+ compression = "txt";
+ }
+
+ string hypergraphDir;
+ if ( hypergraphParameters.size() > 2 ) {
+ hypergraphDir = hypergraphParameters[2];
+ } else {
+ string nbestFile = staticData.GetNBestFilePath();
+ if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
+ boost::filesystem::path nbestPath(nbestFile);
+ hypergraphDir = nbestPath.parent_path().filename().native();
+ } else {
+ stringstream hypergraphDirName;
+ hypergraphDirName << boost::filesystem::current_path() << "/hypergraph";
+ hypergraphDir = hypergraphDirName.str();
+ }
+ }
+
+ if ( ! boost::filesystem::exists(hypergraphDir) ) {
+ boost::filesystem::create_directory(hypergraphDir);
+ }
+
+ if ( ! boost::filesystem::exists(hypergraphDir) ) {
+ TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
+ } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
+ TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
+ } else {
+ stringstream fileName;
+ fileName << hypergraphDir << "/" << m_lineNumber;
+ if ( appendSuffix ) {
+ fileName << "." << compression;
+ }
+ boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
+
+ if ( compression == "gz" ) {
+ file->push( boost::iostreams::gzip_compressor() );
+ } else if ( compression == "bz2" ) {
+ file->push( boost::iostreams::bzip2_compressor() );
+ } else if ( compression != "txt" ) {
+ TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl);
+ compression = "txt";
+ }
+
+ file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
+
+ if (file->is_complete() && file->good()) {
+ fix(*file,PRECISION);
+ manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
+ file->flush();
+ } else {
+ TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl);
+ }
+ file->pop();
+ delete file;
+ }
+ }
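A minimal, self-contained sketch of the Boost.Iostreams pattern used in the block above: compressors are pushed first, the file sink last, and the chain is complete once a sink is present. The file name and compression choice here are illustrative.

#include <boost/iostreams/device/file.hpp>
#include <boost/iostreams/filter/bzip2.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <string>

void WriteMaybeCompressed(const std::string &fileName, const std::string &compression)
{
  boost::iostreams::filtering_ostream out;
  if (compression == "gz") {
    out.push(boost::iostreams::gzip_compressor());
  } else if (compression == "bz2") {
    out.push(boost::iostreams::bzip2_compressor());
  }                                              // "txt" pushes no filter
  out.push(boost::iostreams::file_sink(fileName));
  out << "example payload\n";                    // written through the filter chain
}                                                // flushed and closed on destruction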
+
// apply decision rule and output best translation(s)
if (m_outputCollector) {
ostringstream out;
@@ -157,7 +258,7 @@ public:
// MAP decoding: best hypothesis
const Hypothesis* bestHypo = NULL;
if (!staticData.UseMBR())
- {
+ {
bestHypo = manager.GetBestHypothesis();
if (bestHypo) {
if (staticData.IsPathRecoveryEnabled()) {
@@ -174,13 +275,18 @@ public:
staticData.GetOutputFactorOrder(),
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
+ if (staticData.PrintAlignmentInfo()) {
+ out << "||| ";
+ OutputAlignment(out, bestHypo);
+ }
+
OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
IFVERBOSE(1) {
debug << "BEST TRANSLATION: " << *bestHypo << endl;
}
}
out << endl;
- }
+ }
// MBR decoding (n-best MBR, lattice MBR, consensus)
else
@@ -311,6 +417,8 @@ private:
OutputCollector* m_detailedTranslationCollector;
OutputCollector* m_alignmentInfoCollector;
OutputCollector* m_unknownsCollector;
+ bool m_outputSearchGraphSLF;
+ bool m_outputSearchGraphHypergraph;
std::ofstream *m_alignmentStream;
@@ -323,7 +431,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i)
cout << ff->GetScoreProducerDescription() << " "
- << ff->GetScoreProducerWeightShortName() << " "
+ << ff->GetScoreProducerWeightShortName(i) << " "
<< values[i] << endl;
}
else {
@@ -367,6 +475,63 @@ static void ShowWeights()
}
+size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
+{
+ size_t numScoreComps = ff->GetNumScoreComponents();
+ if (numScoreComps != ScoreProducer::unlimited) {
+ vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+ if (numScoreComps > 1) {
+ for (size_t i = 0; i < numScoreComps; ++i) {
+ outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
+ << i
+ << "=" << values[i] << endl;
+ }
+ } else {
+ outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
+ << "=" << values[0] << endl;
+ }
+ return index+numScoreComps;
+ } else {
+ cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
+ assert(false);
+ return 0;
+ }
+}
+
+void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
+{
+ outputSearchGraphStream.setf(std::ios::fixed);
+ outputSearchGraphStream.precision(6);
+
+ const StaticData& staticData = StaticData::Instance();
+ const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+ const vector<const StatelessFeatureFunction*>& slf = system.GetStatelessFeatureFunctions();
+ const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+ size_t featureIndex = 1;
+ for (size_t i = 0; i < sff.size(); ++i) {
+ featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
+ }
+ for (size_t i = 0; i < slf.size(); ++i) {
+ if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+ slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+ slf[i]->GetScoreProducerWeightShortName() != "I" &&
+ slf[i]->GetScoreProducerWeightShortName() != "g")
+ {
+ featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
+ }
+ }
+ const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+ for( size_t i=0; i<pds.size(); i++ ) {
+ featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
+ }
+ const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+ for( size_t i=0; i<gds.size(); i++ ) {
+ featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
+ }
+
+}
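The two overloads above produce one name=value line per feature component, with multi-component features numbered by position. A hypothetical weights file (feature short names and values are illustrative) would look like:

lm=0.500000
tm0=0.200000
tm1=0.100000
tm2=0.200000
d=0.300000
w=-1.000000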
+
+
} //namespace
/** main function of the command line version of the decoder **/
@@ -391,20 +556,20 @@ int main(int argc, char** argv)
// load all the settings into the Parameter class
// (stores them as strings, or array of strings)
- Parameter* params = new Parameter();
- if (!params->LoadParam(argc,argv)) {
+ Parameter params;
+ if (!params.LoadParam(argc,argv)) {
exit(1);
}
// initialize all "global" variables, which are stored in StaticData
// note: this also loads models such as the language model, etc.
- if (!StaticData::LoadDataStatic(params, argv[0])) {
+ if (!StaticData::LoadDataStatic(&params, argv[0])) {
exit(1);
}
// setting "-show-weights" -> just dump out weights and exit
- if (params->isParamSpecified("show-weights")) {
+ if (params.isParamSpecified("show-weights")) {
ShowWeights();
exit(0);
}
@@ -430,6 +595,32 @@ int main(int argc, char** argv)
TRACE_ERR(weights);
TRACE_ERR("\n");
}
+ if (staticData.GetOutputSearchGraphHypergraph()) {
+ ofstream* weightsOut = new std::ofstream;
+ stringstream weightsFilename;
+ if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
+ weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
+ } else {
+ string nbestFile = staticData.GetNBestFilePath();
+ if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
+ boost::filesystem::path nbestPath(nbestFile);
+ weightsFilename << nbestPath.parent_path().filename() << "/weights";
+ } else {
+ weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights";
+ }
+ }
+ boost::filesystem::path weightsFilePath(weightsFilename.str());
+ if ( ! boost::filesystem::exists(weightsFilePath.parent_path()) ) {
+ boost::filesystem::create_directory(weightsFilePath.parent_path());
+ }
+ TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
+ weightsOut->open(weightsFilename.str().c_str());
+ OutputFeatureWeightsForHypergraph(*weightsOut);
+ weightsOut->flush();
+ weightsOut->close();
+ delete weightsOut;
+ }
+
// initialize output streams
// note: we can't just write to STDOUT or files
@@ -533,7 +724,9 @@ int main(int argc, char** argv)
searchGraphCollector.get(),
detailedTranslationCollector.get(),
alignmentInfoCollector.get(),
- unknownsCollector.get() );
+ unknownsCollector.get(),
+ staticData.GetOutputSearchGraphSLF(),
+ staticData.GetOutputSearchGraphHypergraph());
// execute task
#ifdef WITH_THREADS
pool.Submit(task);
@@ -551,6 +744,8 @@ int main(int argc, char** argv)
pool.Stop(true); //flush remaining jobs
#endif
+ delete ioWrapper;
+
} catch (const std::exception &e) {
std::cerr << "Exception: " << e.what() << std::endl;
return EXIT_FAILURE;
diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp
index 5daba9ba1..53b83d8cd 100644
--- a/moses/AlignmentInfoCollection.cpp
+++ b/moses/AlignmentInfoCollection.cpp
@@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection()
m_emptyAlignmentInfo = Add(pairs);
}
+AlignmentInfoCollection::~AlignmentInfoCollection()
+{}
+
const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
{
return *m_emptyAlignmentInfo;
diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h
index 9c7f75e13..de0949f8f 100644
--- a/moses/AlignmentInfoCollection.h
+++ b/moses/AlignmentInfoCollection.h
@@ -55,6 +55,7 @@ class AlignmentInfoCollection
//! Only a single static variable should be created.
AlignmentInfoCollection();
+ ~AlignmentInfoCollection();
static AlignmentInfoCollection s_instance;
diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp
index 506193d5b..5bd3a4e2b 100644
--- a/moses/Hypothesis.cpp
+++ b/moses/Hypothesis.cpp
@@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList()
*/
const StaticData &staticData = StaticData::Instance();
size_t nBestSize = staticData.GetNBestSize();
- bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ;
+ bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
// prune arc list only if there too many arcs
diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp
index 3418aefe2..c061d0fed 100644
--- a/moses/LM/SingleFactor.cpp
+++ b/moses/LM/SingleFactor.cpp
@@ -36,8 +36,9 @@ using namespace std;
namespace Moses
{
-LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
-
+LanguageModelSingleFactor::~LanguageModelSingleFactor()
+{
+}
struct PointerState : public FFState {
const void* lmstate;
@@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState()
m_beginSentenceState = new PointerState(NULL);
}
-LanguageModelPointerState::~LanguageModelPointerState() {}
+LanguageModelPointerState::~LanguageModelPointerState()
+{
+ delete m_nullContextState;
+ delete m_beginSentenceState;
+}
const FFState *LanguageModelPointerState::GetNullContextState() const
{
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 468db0de3..2ca689bb0 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#endif
#include <algorithm>
-#include <limits>
#include <cmath>
+#include <limits>
+#include <map>
+#include <set>
#include "Manager.h"
#include "TypeDef.h"
#include "Util.h"
@@ -46,17 +48,19 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "rule.pb.h"
#endif
+#include "util/exception.hh"
+
using namespace std;
namespace Moses
{
Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system)
- :m_lineNumber(lineNumber)
- ,m_system(system)
+ :m_system(system)
,m_transOptColl(source.CreateTranslationOptionCollection(system))
,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
,interrupted_flag(0)
,m_hypoId(0)
+ ,m_lineNumber(lineNumber)
,m_source(source)
{
m_system->InitializeBeforeSentenceProcessing(source);
@@ -628,6 +632,435 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
}
+void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const
+{
+ outputSearchGraphStream.setf(std::ios::fixed);
+ outputSearchGraphStream.precision(6);
+
+ const StaticData& staticData = StaticData::Instance();
+ const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+ const vector<const StatelessFeatureFunction*>& slf = system.GetStatelessFeatureFunctions();
+ const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+ size_t featureIndex = 1;
+ for (size_t i = 0; i < sff.size(); ++i) {
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream);
+ }
+ for (size_t i = 0; i < slf.size(); ++i) {
+ if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+ slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+ slf[i]->GetScoreProducerWeightShortName() != "I" &&
+ slf[i]->GetScoreProducerWeightShortName() != "g")
+ {
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream);
+ }
+ }
+ const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+ for( size_t i=0; i<pds.size(); i++ ) {
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream);
+ }
+ const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+ for( size_t i=0; i<gds.size(); i++ ) {
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream);
+ }
+
+}
+
+void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
+{
+ outputSearchGraphStream.setf(std::ios::fixed);
+ outputSearchGraphStream.precision(6);
+
+ const StaticData& staticData = StaticData::Instance();
+ const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+ const vector<const StatelessFeatureFunction*>& slf = system.GetStatelessFeatureFunctions();
+ const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+ size_t featureIndex = 1;
+ for (size_t i = 0; i < sff.size(); ++i) {
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream);
+ }
+ for (size_t i = 0; i < slf.size(); ++i) {
+ if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+ slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+ slf[i]->GetScoreProducerWeightShortName() != "I" &&
+ slf[i]->GetScoreProducerWeightShortName() != "g")
+ {
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream);
+ }
+ }
+ const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+ for( size_t i=0; i<pds.size(); i++ ) {
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream);
+ }
+ const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+ for( size_t i=0; i<gds.size(); i++ ) {
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream);
+ }
+
+}
+
+void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const
+{
+ outputSearchGraphStream.setf(std::ios::fixed);
+ outputSearchGraphStream.precision(6);
+
+ const StaticData& staticData = StaticData::Instance();
+ const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+ const vector<const StatelessFeatureFunction*>& slf = system.GetStatelessFeatureFunctions();
+ const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+ size_t featureIndex = 1;
+ for (size_t i = 0; i < sff.size(); ++i) {
+ featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream);
+ }
+ for (size_t i = 0; i < slf.size(); ++i) {
+ if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+ slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+ slf[i]->GetScoreProducerWeightShortName() != "I" &&
+ slf[i]->GetScoreProducerWeightShortName() != "g")
+ {
+ featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream);
+ }
+ }
+ const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+ for( size_t i=0; i<pds.size(); i++ ) {
+ featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream);
+ }
+ const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+ for( size_t i=0; i<gds.size(); i++ ) {
+ featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream);
+ }
+
+}
+
+
+size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
+{
+ size_t numScoreComps = ff->GetNumScoreComponents();
+ if (numScoreComps != ScoreProducer::unlimited) {
+ vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+ for (size_t i = 0; i < numScoreComps; ++i) {
+ outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
+ << " " << ff->GetScoreProducerWeightShortName()
+ << " " << (i+1) << " of " << numScoreComps << endl
+ << "x" << (index+i) << "scale=" << values[i] << endl;
+ }
+ return index+numScoreComps;
+ } else {
+ cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
+ assert(false);
+ return 0;
+ }
+}
+
+size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
+{
+ const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
+
+ vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
+ size_t numScoreComps = featureValues.size();
+ for (size_t i = 0; i < numScoreComps; ++i) {
+ outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
+ }
+ return index+numScoreComps;
+}
+
+size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
+{
+ ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
+ if (prevHypo) {
+ scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
+ }
+ vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
+ size_t numScoreComps = featureValues.size();
+
+ if (numScoreComps > 1) {
+ for (size_t i = 0; i < numScoreComps; ++i) {
+ outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << i << "=" << featureValues[i] << " ";
+ }
+ } else {
+ outputSearchGraphStream << ff->GetScoreProducerWeightShortName() << "=" << featureValues[0] << " ";
+ }
+
+ return index+numScoreComps;
+}
+
+/** Output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
+void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
+{
+
+ VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << translationId << std::endl)
+
+ vector<SearchGraphNode> searchGraph;
+ GetSearchGraph(searchGraph);
+
+
+ map<int,int> mosesIDToHypergraphID;
+ // map<int,int> hypergraphIDToMosesID;
+ set<int> terminalNodes;
+ multimap<int,int> hypergraphIDToArcs;
+
+ VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << translationId << std::endl)
+
+ long numNodes = 0;
+ long endNode = 0;
+ {
+ long hypergraphHypothesisID = 0;
+ for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
+
+ // Get an id number for the previous hypothesis
+ const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
+ if (prevHypo!=NULL) {
+ int mosesPrevHypothesisID = prevHypo->GetId();
+ if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
+ mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
+ hypergraphHypothesisID += 1;
+ }
+ }
+
+ // Get an id number for this hypothesis
+ int mosesHypothesisID;
+ if (searchGraph[arcNumber].recombinationHypo) {
+ mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
+ } else {
+ mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
+ }
+
+ if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
+
+ mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
+
+ bool terminalNode = (searchGraph[arcNumber].forward == -1);
+ if (terminalNode) {
+ // Final arc to end node, representing the end of the sentence </s>
+ terminalNodes.insert(hypergraphHypothesisID);
+ }
+
+ hypergraphHypothesisID += 1;
+ }
+
+ // Record that this arc ends at this node
+ hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
+
+ }
+
+ // Unique end node
+ endNode = hypergraphHypothesisID;
+ // mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
+ numNodes = endNode + 1;
+
+ }
+
+
+ long numArcs = searchGraph.size() + terminalNodes.size();
+
+ // Print number of nodes and arcs
+ outputSearchGraphStream << numNodes << " " << numArcs << endl;
+
+ VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId
+ << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
+
+ VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl)
+
+
+ for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
+ if (hypergraphHypothesisID % 100000 == 0) {
+ VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << translationId << std::endl);
+ }
+ // int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
+ size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has " << count << " incoming arcs" << std::endl)
+ if (count > 0) {
+ outputSearchGraphStream << count << "\n";
+
+ pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
+ hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
+ for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
+ int lineNumber = (*it).second;
+ const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
+ int mosesHypothesisID;
+ if (searchGraph[lineNumber].recombinationHypo) {
+ mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
+ } else {
+ mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
+ }
+ // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
+ UTIL_THROW_IF(
+ (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
+ util::Exception,
+ "Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
+ "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
+ ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
+ ". There are " << numNodes << " nodes in the search lattice."
+ );
+
+ const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
+ if (prevHypo==NULL) {
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl)
+ outputSearchGraphStream << "<s> ||| \n";
+ } else {
+ int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
+ UTIL_THROW_IF(
+ (startNode >= hypergraphHypothesisID),
+ util::Exception,
+ "Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
+ "The nodes must be output in topological order. The code attempted to violate this restriction."
+ );
+
+ const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
+ int targetWordCount = targetPhrase.GetSize();
+
+ outputSearchGraphStream << "[" << startNode << "]";
+ for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
+ outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
+ }
+ outputSearchGraphStream << " ||| ";
+ OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
+ outputSearchGraphStream << "\n";
+ }
+ }
+ }
+ }
+
+ // Print node and arc(s) for end of sentence </s>
+ outputSearchGraphStream << terminalNodes.size() << "\n";
+ for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
+ outputSearchGraphStream << "[" << (*it) << "] </s> ||| \n";
+ }
+
+}
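Putting the pieces together: the emitted file starts with a node/arc count line, then for each node an incoming-arc count followed by one arc per line, and finally the </s> arcs into the unique end node. A hypothetical output for a tiny graph (words and feature names/values are illustrative):

4 5
1
<s> ||| 
2
[0] the cat ||| tm=-1.2 lm=-3.4 w=-2 
[0] a cat ||| tm=-1.5 lm=-3.1 w=-2 
1
[1] sat ||| tm=-0.7 lm=-1.9 w=-1 
1
[2] </s> ||| 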
+
+
+/** Output search graph in HTK standard lattice format (SLF) */
+void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
+{
+
+ vector<SearchGraphNode> searchGraph;
+ GetSearchGraph(searchGraph);
+
+ long numArcs = 0;
+ long numNodes = 0;
+
+ map<int,int> nodes;
+ set<int> terminalNodes;
+
+ // Unique start node
+ nodes[0] = 0;
+
+ for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
+
+ int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize();
+ numArcs += targetWordCount;
+
+ int hypothesisID = searchGraph[arcNumber].hypo->GetId();
+ if (nodes.count(hypothesisID) == 0) {
+
+ numNodes += targetWordCount;
+ nodes[hypothesisID] = numNodes;
+
+ bool terminalNode = (searchGraph[arcNumber].forward == -1);
+ if (terminalNode) {
+ numArcs += 1;
+ }
+ }
+
+ }
+ numNodes += 1;
+
+ // Unique end node
+ nodes[numNodes] = numNodes;
+
+ outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
+ outputSearchGraphStream << "VERSION=1.1" << endl;
+ outputSearchGraphStream << "base=2.71828182845905" << endl;
+ outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
+ outputSearchGraphStream << "LINKS=" << numArcs << endl;
+
+ OutputFeatureWeightsForSLF(outputSearchGraphStream);
+
+ for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
+ const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
+ const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
+ if (prevHypo) {
+
+ int startNode = nodes[prevHypo->GetId()];
+ int endNode = nodes[thisHypo->GetId()];
+ bool terminalNode = (searchGraph[lineNumber].forward == -1);
+ const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
+ int targetWordCount = targetPhrase.GetSize();
+
+ for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
+ int x = (targetWordCount-targetWordIndex);
+
+ outputSearchGraphStream << "J=" << arcNumber;
+
+ if (targetWordIndex==0) {
+ outputSearchGraphStream << " S=" << startNode;
+ } else {
+ outputSearchGraphStream << " S=" << endNode - x;
+ }
+
+ outputSearchGraphStream << " E=" << endNode - (x-1)
+ << " W=" << targetPhrase.GetWord(targetWordIndex);
+
+ OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
+
+ outputSearchGraphStream << endl;
+
+ arcNumber += 1;
+ }
+
+ if (terminalNode && terminalNodes.count(endNode) == 0) {
+ terminalNodes.insert(endNode);
+ outputSearchGraphStream << "J=" << arcNumber
+ << " S=" << endNode
+ << " E=" << numNodes
+ << endl;
+ arcNumber += 1;
+ }
+ }
+ }
+
+}
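For orientation, a hypothetical SLF file produced by this method (node numbers, words, and feature values are illustrative): multi-word phrases get one J= link per word, feature values are zeroed after a phrase's first word, and the final link into the end node carries no word.

UTTERANCE=Sentence_0
VERSION=1.1
base=2.71828182845905
NODES=5
LINKS=4
# ... one "x<i>scale=<weight>" block per feature component ...
J=0 S=0 E=1 W=the x1=-1.2 x2=-3.4 
J=1 S=1 E=2 W=cat x1=0 x2=0 
J=2 S=2 E=3 W=sat x1=-0.7 x2=-1.9 
J=3 S=3 E=4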
+
void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
const SearchGraphNode& searchNode)
{
diff --git a/moses/Manager.h b/moses/Manager.h
index dd011bc84..11762ec37 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -56,6 +56,10 @@ struct SearchGraphNode {
hypo(theHypo), recombinationHypo(theRecombinationHypo),
forward(theForward), fscore(theFscore) {}
+ bool operator<(const SearchGraphNode& sgn) const {
+ return this->hypo->GetId() < sgn.hypo->GetId();
+ }
+
};
/** The Manager class implements a stack decoding algorithm for phrase-based decoding
@@ -93,6 +97,19 @@ class Manager
Manager(Manager const&);
void operator=(Manager const&);
const TranslationSystem* m_system;
+private:
+
+ // Helper functions to output search graph in HTK standard lattice format
+ void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const;
+ size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+ void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const;
+ size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+
+ // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
+ void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
+ size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+
+
protected:
// data
// InputType const& m_source; /**< source sentence to be translated */
@@ -103,6 +120,7 @@ protected:
size_t interrupted_flag;
std::auto_ptr<SentenceStats> m_sentenceStats;
int m_hypoId; //used to number the hypos as they are created.
+ size_t m_lineNumber;
void GetConnectedGraph(
std::map< int, bool >* pConnected,
@@ -113,7 +131,6 @@ protected:
public:
- size_t m_lineNumber;
InputType const& m_source; /**< source sentence to be translated */
Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system);
~Manager();
@@ -137,6 +154,8 @@ public:
#endif
void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
+ void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
+ void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
const InputType& GetSource() const {
return m_source;
diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h
index 25131b98a..5680b8ecb 100644
--- a/moses/PDTAimp.h
+++ b/moses/PDTAimp.h
@@ -11,6 +11,7 @@
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
#include "SparsePhraseDictionaryFeature.h"
#include "Util.h"
+#include "util/tokenize_piece.hh"
namespace Moses
{
@@ -284,11 +285,10 @@ protected:
FactorCollection &factorCollection = FactorCollection::Instance();
for(size_t k=0; k<factorStrings.size(); ++k) {
- std::vector<std::string> factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter());
- CHECK(factors.size()==m_output.size());
+ util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter());
Word& w=targetPhrase.AddWord();
- for(size_t l=0; l<m_output.size(); ++l) {
- w[m_output[l]]= factorCollection.AddFactor(Output, m_output[l], factors[l]);
+ for(size_t l=0; l<m_output.size(); ++l, ++word) {
+ w[m_output[l]]= factorCollection.AddFactor(*word);
}
}
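The PDTAimp change above swaps TokenizeMultiCharSeparator for kenlm's streaming tokenizer. A small sketch of the same util::TokenIter pattern, mirroring the call in the patch (input and delimiter are illustrative):

#include <iostream>
#include "util/tokenize_piece.hh"

int main()
{
  StringPiece line("das|Haus|NN");
  for (util::TokenIter<util::MultiCharacter, false> it(line, "|"); it; ++it) {
    std::cout << *it << '\n';  // yields "das", "Haus", "NN" as StringPieces
  }
  return 0;
}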
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 103277d34..6a9745ade 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -107,6 +107,7 @@ Parameter::Parameter()
AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation");
AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
AddParam("distortion", "configurations for each factorized/lexicalized reordering model.");
+ AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'");
AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" );
AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
@@ -130,6 +131,8 @@ Parameter::Parameter()
AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
+ AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
+ AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)");
AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
#ifdef HAVE_PROTOBUF
AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
@@ -177,6 +180,7 @@ Parameter::Parameter()
AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
AddParam("minphr-memory", "Load phrase table in minphr format into memory");
+ AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
AddParam("alignment-output-file", "print output word alignments into given file");
diff --git a/moses/SourceWordDeletionFeature.cpp b/moses/SourceWordDeletionFeature.cpp
index c5a61111f..c312a3b03 100644
--- a/moses/SourceWordDeletionFeature.cpp
+++ b/moses/SourceWordDeletionFeature.cpp
@@ -55,12 +55,7 @@ void SourceWordDeletionFeature::ComputeFeatures(const TargetPhrase& targetPhrase
// handle special case: unknown words (they have no word alignment)
size_t targetLength = targetPhrase.GetSize();
size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize();
- if (targetLength == 1 && sourceLength == 1) {
- const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1);
- if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) {
- return;
- }
- }
+ if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
bool aligned[16];
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index df05b64d3..449187da7 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter)
}
}
- if(m_parameter->GetParam("sort-word-alignment").size()) {
- m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
- }
-
// factor delimiter
if (m_parameter->GetParam("factor-delimiter").size() > 0) {
m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter)
SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
//word-to-word alignment
+ // alignments
+ SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
+ if (m_PrintAlignmentInfo) {
+ m_needAlignmentInfo = true;
+ }
+
+ if(m_parameter->GetParam("sort-word-alignment").size()) {
+ m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
+ }
+
SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
if (m_PrintAlignmentInfoNbest) {
m_needAlignmentInfo = true;
@@ -235,8 +241,19 @@ bool StaticData::LoadData(Parameter *parameter)
}
m_outputSearchGraph = true;
m_outputSearchGraphExtended = true;
- } else
+ } else {
m_outputSearchGraph = false;
+ }
+ if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
+ m_outputSearchGraphSLF = true;
+ } else {
+ m_outputSearchGraphSLF = false;
+ }
+ if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) {
+ m_outputSearchGraphHypergraph = true;
+ } else {
+ m_outputSearchGraphHypergraph = false;
+ }
#ifdef HAVE_PROTOBUF
if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 448f1a4e7..20d36e4b8 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -171,6 +171,7 @@ protected:
bool m_reportAllFactorsNBest;
std::string m_detailedTranslationReportingFilePath;
bool m_onlyDistinctNBest;
+ bool m_PrintAlignmentInfo;
bool m_needAlignmentInfo;
bool m_PrintAlignmentInfoNbest;
@@ -216,6 +217,8 @@ protected:
bool m_outputWordGraph; //! whether to output word graph
bool m_outputSearchGraph; //! whether to output search graph
bool m_outputSearchGraphExtended; //! ... in extended format
+ bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF)
+ bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph
#ifdef HAVE_PROTOBUF
bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
#endif
@@ -458,7 +461,7 @@ public:
return m_nBestFilePath;
}
bool IsNBestEnabled() const {
- return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
+ return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
#ifdef HAVE_PROTOBUF
|| m_outputSearchGraphPB
#endif
@@ -631,6 +634,12 @@ public:
bool GetOutputSearchGraphExtended() const {
return m_outputSearchGraphExtended;
}
+ bool GetOutputSearchGraphSLF() const {
+ return m_outputSearchGraphSLF;
+ }
+ bool GetOutputSearchGraphHypergraph() const {
+ return m_outputSearchGraphHypergraph;
+ }
#ifdef HAVE_PROTOBUF
bool GetOutputSearchGraphPB() const {
return m_outputSearchGraphPB;
@@ -722,6 +731,9 @@ public:
const std::string &GetAlignmentOutputFile() const {
return m_alignmentOutputFile;
}
+ bool PrintAlignmentInfo() const {
+ return m_PrintAlignmentInfo;
+ }
bool PrintAlignmentInfoInNbest() const {
return m_PrintAlignmentInfoNbest;
}
diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp
index b1d99ab50..6f14657a3 100644
--- a/moses/TargetPhrase.cpp
+++ b/moses/TargetPhrase.cpp
@@ -326,8 +326,10 @@ TO_STRING_BODY(TargetPhrase);
std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp)
{
- os << static_cast<const Phrase&>(tp) << ":" << tp.GetAlignNonTerm();
- os << ": c=" << tp.m_fullScore;
+ os << static_cast<const Phrase&>(tp) << ":" << flush;
+ os << tp.GetAlignNonTerm() << flush;
+ os << ": c=" << tp.m_fullScore << flush;
+ os << " " << tp.m_scoreBreakdown << flush;
return os;
}
diff --git a/moses/TargetWordInsertionFeature.cpp b/moses/TargetWordInsertionFeature.cpp
index 537c5c9cb..3b9bf36ba 100644
--- a/moses/TargetWordInsertionFeature.cpp
+++ b/moses/TargetWordInsertionFeature.cpp
@@ -56,12 +56,7 @@ void TargetWordInsertionFeature::ComputeFeatures(const TargetPhrase& targetPhras
// handle special case: unknown words (they have no word alignment)
size_t targetLength = targetPhrase.GetSize();
size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize();
- if (targetLength == 1 && sourceLength == 1) {
- const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1);
- if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) {
- return;
- }
- }
+ if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
bool aligned[16];
diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp
index 515d2f649..675656112 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@@ -156,22 +156,6 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const
typedef LVoc<std::string> WordVoc;
-static WordVoc* ReadVoc(const std::string& filename)
-{
- static std::map<std::string,WordVoc*> vocs;
-#ifdef WITH_THREADS
- boost::mutex mutex;
- boost::mutex::scoped_lock lock(mutex);
-#endif
- std::map<std::string,WordVoc*>::iterator vi = vocs.find(filename);
- if (vi == vocs.end()) {
- WordVoc* voc = new WordVoc();
- voc->Read(filename);
- vocs[filename] = voc;
- }
- return vocs[filename];
-}
-
class PDTimp {
public:
@@ -184,8 +168,8 @@ public:
std::vector<OFF_T> srcOffsets;
FILE *os,*ot;
- WordVoc* sv;
- WordVoc* tv;
+ WordVoc sv;
+ WordVoc tv;
ObjectPool<PPimp> pPool;
// a comparison with the Boost MemPools might be useful
@@ -269,12 +253,12 @@ public:
rv.back().tokens.reserve(iphrase.size());
for(size_t j=0; j<iphrase.size(); ++j) {
- rv.back().tokens.push_back(&tv->symbol(iphrase[j]));
+ rv.back().tokens.push_back(&tv.symbol(iphrase[j]));
}
rv.back().scores = i->GetScores();
const IPhrase& fnames = i->GetFeatureNames();
for (size_t j = 0; j < fnames.size(); ++j) {
- rv.back().fnames.push_back(&tv->symbol(fnames[j]));
+ rv.back().fnames.push_back(&tv.symbol(fnames[j]));
}
rv.back().fvalues = i->GetFeatureValues();
if (wa) wa->push_back(i->GetAlignment());
@@ -289,7 +273,7 @@ public:
CHECK(p);
if(w.empty() || w==EPSILON) return p;
- LabelId wi=sv->index(w);
+ LabelId wi=sv.index(w);
if(wi==InvalidLabelId) return PPtr(); // unknown word
else if(p.imp->isRoot()) {
@@ -304,6 +288,8 @@ public:
return PPtr();
}
+
+ WordVoc* ReadVoc(const std::string& filename);
};
@@ -350,10 +336,8 @@ int PDTimp::Read(const std::string& fn)
for(size_t i=0; i<data.size(); ++i)
data[i]=CPT(os,srcOffsets[i]);
- sv = ReadVoc(ifsv);
- tv = ReadVoc(iftv);
- //sv.Read(ifsv);
- //tv.Read(iftv);
+ sv.Read(ifsv);
+ tv.Read(iftv);
TRACE_ERR("binary phrasefile loaded, default OFF_T: "<<PTF::getDefault()
<<"\n");
@@ -370,7 +354,7 @@ void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const
const IPhrase& iphr=tcand[i].GetPhrase();
out << i << " -- " << sc << " -- ";
- for(size_t j=0; j<iphr.size(); ++j) out << tv->symbol(iphr[j])<<" ";
+ for(size_t j=0; j<iphr.size(); ++j) out << tv.symbol(iphr[j])<<" ";
out<< " -- " << trgAlign;
out << std::endl;
}
@@ -423,7 +407,7 @@ GetTargetCandidates(const std::vector<std::string>& src,
{
IPhrase f(src.size());
for(size_t i=0; i<src.size(); ++i) {
- f[i]=imp->sv->index(src[i]);
+ f[i]=imp->sv.index(src[i]);
if(f[i]==InvalidLabelId) return;
}
@@ -439,7 +423,7 @@ GetTargetCandidates(const std::vector<std::string>& src,
{
IPhrase f(src.size());
for(size_t i=0; i<src.size(); ++i) {
- f[i]=imp->sv->index(src[i]);
+ f[i]=imp->sv.index(src[i]);
if(f[i]==InvalidLabelId) return;
}
@@ -455,7 +439,7 @@ PrintTargetCandidates(const std::vector<std::string>& src,
{
IPhrase f(src.size());
for(size_t i=0; i<src.size(); ++i) {
- f[i]=imp->sv->index(src[i]);
+ f[i]=imp->sv.index(src[i]);
if(f[i]==InvalidLabelId) {
TRACE_ERR("the source phrase '"<<src<<"' contains an unknown word '"
<<src[i]<<"'\n");
@@ -497,8 +481,6 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
std::vector<OFF_T> vo;
size_t lnc=0;
size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
- imp->sv = new WordVoc();
- imp->tv = new WordVoc();
size_t missingAlignmentCount = 0;
while(getline(inFile, line)) {
@@ -532,11 +514,11 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
- f.push_back(imp->sv->add(wordVec[i]));
+ f.push_back(imp->sv.add(wordVec[i]));
wordVec = Tokenize(targetPhraseString);
for (size_t i = 0 ; i < wordVec.size() ; ++i)
- e.push_back(imp->tv->add(wordVec[i]));
+ e.push_back(imp->tv.add(wordVec[i]));
// while(is>>w && w!="|||") sc.push_back(atof(w.c_str()));
// Mauro: to handle 0 probs in phrase tables
@@ -576,7 +558,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
abort();
}
for (size_t i = 0; i < sparseTokens.size(); i+=2) {
- fnames.push_back(imp->tv->add(sparseTokens[i]));
+ fnames.push_back(imp->tv.add(sparseTokens[i]));
fvalues.push_back(Scan<FValue>(sparseTokens[i+1]));
}
}
@@ -663,8 +645,8 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
fWriteVector(oi,vo);
fClose(oi);
- imp->sv->Write(ofsv);
- imp->tv->Write(oftv);
+ imp->sv.Write(ofsv);
+ imp->tv.Write(oftv);
return 1;
}
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index c680d7245..065368ca7 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -552,7 +552,9 @@ namespace tmmt
bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
{
+#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
if (lookup != m_lsed.end()) {
value = lookup->second;
@@ -564,7 +566,9 @@ namespace tmmt
void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
{
+#ifdef WITH_THREADS
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
m_lsed[ key ] = value;
}
diff --git a/moses/Util.cpp b/moses/Util.cpp
index 98de1241e..495e05124 100644
--- a/moses/Util.cpp
+++ b/moses/Util.cpp
@@ -35,6 +35,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "TypeDef.h"
#include "Util.h"
#include "Timer.h"
+#include "util/exception.hh"
#include "util/file.hh"
using namespace std;
@@ -65,6 +66,8 @@ const std::string ToLower(const std::string& str)
return lc;
}
+class BoolValueException : public util::Exception {};
+
template<>
bool Scan<bool>(const std::string &input)
{
@@ -73,8 +76,7 @@ bool Scan<bool>(const std::string &input)
return true;
if (lc == "no" || lc == "n" || lc =="false" || lc == "0")
return false;
- TRACE_ERR( "Scan<bool>: didn't understand '" << lc << "', returning false" << std::endl);
- return false;
+ UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
}
bool FileExists(const std::string& filePath)
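A minimal sketch of the util::Exception pattern now used by Scan<bool>: UTIL_THROW streams a message into a typed exception, which callers can catch as util::Exception. The function and exception names here are illustrative.

#include <cstdlib>
#include <iostream>
#include <string>
#include "util/exception.hh"

class ExampleValueException : public util::Exception {};

int ParsePositive(const std::string &s)
{
  int value = std::atoi(s.c_str());
  if (value <= 0)
    UTIL_THROW(ExampleValueException, "Expected a positive integer, got '" << s << "'");
  return value;
}

int main()
{
  try {
    ParsePositive("-3");
  } catch (const util::Exception &e) {
    std::cerr << e.what() << std::endl;  // includes the streamed message
  }
  return 0;
}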
diff --git a/moses/Word.cpp b/moses/Word.cpp
index c23e8de8c..2c1ac09ea 100644
--- a/moses/Word.cpp
+++ b/moses/Word.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Word.h"
#include "TypeDef.h"
#include "StaticData.h" // needed to determine the FactorDelimiter
+#include "util/exception.hh"
#include "util/tokenize_piece.hh"
using namespace std;
@@ -95,6 +96,8 @@ std::string Word::GetString(FactorType factorType) const
return NULL;
}
+class StrayFactorException : public util::Exception {};
+
void Word::CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
, const StringPiece &str
@@ -106,7 +109,7 @@ void Word::CreateFromString(FactorDirection direction
for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
}
- CHECK(!fit);
+ UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times.");
// assume term/non-term same for all factors
m_isNonTerminal = isNonTerminal;
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 70de9678b..fd33907de 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (kneserNeyFlag) {
float D = kneserNey_D3;
if (countEF < 2) D = kneserNey_D1;
- if (countEF < 3) D = kneserNey_D2;
+ else if (countEF < 3) D = kneserNey_D2;
if (D > countEF) D = countEF - 0.01; // sanity constraint
float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
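The else-if matters here: previously a phrase pair with countEF of 1 matched both tests and ended up with kneserNey_D2 instead of kneserNey_D1. A standalone sketch of the corrected selection (names mirror the surrounding code):

float SelectDiscount(float countEF, float kneserNey_D1, float kneserNey_D2, float kneserNey_D3)
{
  float D = kneserNey_D3;                  // counts of 3 or more
  if (countEF < 2) D = kneserNey_D1;       // counts below 2
  else if (countEF < 3) D = kneserNey_D2;  // count of exactly 2
  return D;
}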
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index 92c8a470e..cab91e92d 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isOrientationFlag())
outextractstrOrientation << orientationInfo;
+ if (m_options.isIncludeSentenceIdFlag()) {
+ outextractstr << " ||| " << sentence.sentenceID;
+ }
+
if (m_options.getInstanceWeightsFile().length()) {
if (m_options.isTranslationFlag()) {
outextractstr << " ||| " << sentence.weightString;
@@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
}
}
- if (m_options.isIncludeSentenceIdFlag()) {
- outextractstr << " ||| " << sentence.sentenceID;
- }
if (m_options.isTranslationFlag()) outextractstr << "\n";
if (m_options.isTranslationFlag()) outextractstrInv << "\n";
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 214569206..769fc0ebf 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -1000,6 +1000,7 @@ lowercase-reference
out: reference
default-name: evaluation/reference
pass-unless: output-lowercaser
+ pass-if: recaser
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
nist-bleu
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index 29962ca71..a2f9580a9 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -745,7 +745,8 @@ sub hierarchical_segmentation {
open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
open(NODE,">$dir/node") or die "Cannot open: $!";
while(<TRACE>) {
- /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
+ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
+ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_");
my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
if ($last_sentence >= 0 && $sentence != $last_sentence) {
&hs_process($last_sentence,\@DERIVATION,\%STATS);
@@ -1137,9 +1138,17 @@ sub process_search_graph {
`mkdir -p $dir/search-graph`;
my $last_sentence = -1;
while(<OSG>) {
- /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</ || die("ERROR: buggy search graph line: $_");
- my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score)
- = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
+ my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score);
+ if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</) {
+ ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
+ }
+ elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) {
+ ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
+      $heuristic_rule_score = $rule_score; # this line format carries no separate heuristic score
+ }
+ else {
+ die("ERROR: buggy search graph line: $_");
+ }
chop($alignment) if $alignment;
chop($children) if $children;
$recomb = 0 unless $recomb;
diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl
index e941aa95b..4ef6a1de6 100755
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@@ -13,10 +13,10 @@ chomp(@OUT);
while(<SRC>) {
chomp;
if (/^<srcset/) {
- s/<srcset/<tstset trglang="$language"/;
+ s/<srcset/<tstset trglang="$language"/i;
}
elsif (/^<\/srcset/) {
- s/<\/srcset/<\/tstset/;
+ s/<\/srcset/<\/tstset/i;
}
elsif (/^<doc/i) {
s/ *sysid="[^\"]+"//;
@@ -26,10 +26,10 @@ while(<SRC>) {
my $line = shift(@OUT);
$line = "" if $line =~ /NO BEST TRANSLATION/;
if (/<\/seg>/) {
- s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
+ s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
}
else {
- s/(<seg[^>]+> *)[^<]*/$1$line/;
+ s/(<seg[^>]+> *)[^<]*/$1$line/i;
}
}
print $_."\n";
diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl
index 8f82ab8d9..beca70eb0 100755
--- a/scripts/generic/compound-splitter.perl
+++ b/scripts/generic/compound-splitter.perl
@@ -16,15 +16,15 @@ $HELP = 1
unless &GetOptions('corpus=s' => \$CORPUS,
'model=s' => \$MODEL,
'filler=s' => \$FILLER,
- 'factored' => \$FACTORED,
+ 'factored' => \$FACTORED,
'min-size=i' => \$MIN_SIZE,
'min-count=i' => \$MIN_COUNT,
'max-count=i' => \$MAX_COUNT,
'help' => \$HELP,
'verbose' => \$VERBOSE,
- 'syntax' => \$SYNTAX,
- 'binarize' => \$BINARIZE,
- 'mark-split' => \$MARK_SPLIT,
+ 'syntax' => \$SYNTAX,
+ 'binarize' => \$BINARIZE,
+ 'mark-split' => \$MARK_SPLIT,
'train' => \$TRAIN);
if ($HELP ||
@@ -155,34 +155,37 @@ sub apply {
next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
$COUNT{$lc} = $count;
$TRUECASE{$lc} = $factored_word;
- $LABEL{$lc} = $label if $SYNTAX;
+ $LABEL{$lc} = $label if $SYNTAX;
}
close(MODEL);
while(<STDIN>) {
my $first = 1;
chop; s/\s+/ /g; s/^ //; s/ $//;
- my @BUFFER; # for xml tags
+ my @BUFFER; # for xml tags
foreach my $factored_word (split) {
print " " unless $first;
$first = 0;
- # syntax: don't split xml
- if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
- push @BUFFER,$factored_word;
- $first = 1;
- next;
- }
-
- # get case class
- my $word = $factored_word;
- $word =~ s/\|.+//g; # just first factor
- my $lc = lc($word);
-
+ # syntax: don't split xml
+ if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
+ push @BUFFER,$factored_word;
+ $first = 1;
+ next;
+ }
+
+ # get case class
+ my $word = $factored_word;
+ $word =~ s/\|.+//g; # just first factor
+ my $lc = lc($word);
+
+ print STDERR "considering $word ($lc)...\n" if $VERBOSE;
# don't split frequent words
- if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
- print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
+ if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
+        $lc !~ /[a-zA-Z]/) { # has to have at least one letter
+ print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
print $factored_word;
+      print STDERR "\tskipping frequent or non-alphabetic word\n" if $VERBOSE;
next;
}
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index 7533b39e0..192169c86 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -153,9 +153,9 @@ if (defined($baselineExtract)) {
$catOCmd .= "$baselineExtract.o$sorted.gz ";
}
-$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.sorted.gz \n";
-$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.inv.sorted.gz \n";
-$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n";
+$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.sorted.gz 2>> /dev/stderr \n";
+$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.inv.sorted.gz 2>> /dev/stderr \n";
+$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.o.sorted.gz 2>> /dev/stderr \n";
@children = ();
diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl
index d1840fc55..b8d393e71 100755
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@@ -64,6 +64,7 @@ my $wordgraphfile=undef;
my $wordgraphflag=0;
my $robust=5; # resubmit crashed jobs robust-times
my $alifile=undef;
+my $detailsfile=undef;
my $logfile="";
my $logflag="";
my $searchgraphlist="";
@@ -93,6 +94,7 @@ sub init(){
'output-search-graph|osg=s'=> \$searchgraphlist,
'output-word-graph|owg=s'=> \$wordgraphlist,
'alignment-output-file=s'=> \$alifile,
+ 'translation-details|T=s'=> \$detailsfile,
'qsub-prefix=s'=> \$qsubname,
'queue-parameters=s'=> \$queueparameters,
'inputtype=i'=> \$inputtype,
@@ -539,6 +541,7 @@ while ($robust && scalar @idx_todo) {
concatenate_1best();
concatenate_logs() if $logflag;
concatenate_ali() if defined $alifile;
+concatenate_details() if defined $detailsfile;
concatenate_nbest() if $nbestflag;
safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-';
@@ -580,6 +583,11 @@ sub preparing_script(){
$tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx";
}
+ my $tmpdetailsoutfile = "";
+ if (defined $detailsfile){
+ $tmpdetailsoutfile="-translation-details $tmpdir/$detailsfile.$splitpfx$idx";
+ }
+
my $tmpsearchgraphlist="";
if ($searchgraphflag){
$tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx";
@@ -592,13 +600,17 @@ sub preparing_script(){
my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId";
- print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n";
+ print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n";
print OUT "echo exit status \$\?\n\n";
if (defined $alifile){
print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n";
print OUT "echo exit status \$\?\n\n";
}
+ if (defined $detailsfile){
+ print OUT "\\mv -f $tmpdir/${detailsfile}.$splitpfx$idx .\n\n";
+ print OUT "echo exit status \$\?\n\n";
+ }
if ($nbestflag){
print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n";
print OUT "echo exit status \$\?\n\n";
@@ -827,6 +839,18 @@ sub concatenate_ali(){
close(OUT);
}
+sub concatenate_details(){
+  open (OUT, "> ${detailsfile}") or die "Cannot open ${detailsfile}: $!";
+  foreach my $idx (@idxlist){
+    my @in=();
+    open (IN, "$detailsfile.$splitpfx$idx") or die "Cannot open $detailsfile.$splitpfx$idx: $!";
+    @in=<IN>;
+    print OUT @in; # print the list directly; interpolating "@in" would insert spaces between lines
+    close(IN);
+  }
+  close(OUT);
+}
+
sub check_exit_status(){
print STDERR "check_exit_status\n";
@@ -925,6 +949,7 @@ sub remove_temporary_files(){
unlink("${inputfile}.${splitpfx}${idx}.trans");
unlink("${inputfile}.${splitpfx}${idx}");
if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); }
+ if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); }
if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); }
if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); }
if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); }
diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl
index 879212e6e..f1f8f9ef6 100755
--- a/scripts/generic/mteval-v13a.pl
+++ b/scripts/generic/mteval-v13a.pl
@@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
sub extract_sgml_tag_attribute
{
my ($name, $data) = @_;
- ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
+ ($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
}
#################################
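
Reviewer note: the relaxed pattern accepts SGML attribute values with or without surrounding quotes. The script itself is Perl; the following C++11 std::regex rendering only demonstrates the regex change, with the caveat that an unquoted value is captured greedily up to the next quote or the end of the data:

#include <iostream>
#include <regex>
#include <string>

// Sketch of the relaxed attribute extraction: quotes are now optional.
std::string ExtractAttribute(const std::string &name, const std::string &data) {
  const std::regex pattern(name + "\\s*=\\s*\"?([^\"]*)\"?", std::regex::icase);
  std::smatch m;
  return std::regex_search(data, m, pattern) ? m[1].str() : "";
}

int main() {
  std::cout << ExtractAttribute("docid", "doc docid=\"AFP.1\"") << "\n";  // AFP.1
  std::cout << ExtractAttribute("docid", "doc docid=AFP.1") << "\n";      // AFP.1
  return 0;
}
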
diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl
index 520fbddbe..3f763e5d9 100755
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@@ -163,7 +163,7 @@ else
$cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR ";
}
- $cmd .= " | gzip -c > $ptHalf";
+ $cmd .= " | gzip -c > $ptHalf 2>> /dev/stderr ";
}
print STDERR $cmd;
systemCheck($cmd);
diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl
index 49c89c299..012c143ac 100755
--- a/scripts/recaser/detruecase.perl
+++ b/scripts/recaser/detruecase.perl
@@ -6,11 +6,12 @@ use Getopt::Long "GetOptions";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
-
-my ($SRC,$INFILE);
+my ($SRC,$INFILE,$UNBUFFERED);
die("detruecase.perl < in > out")
unless &GetOptions('headline=s' => \$SRC,
- 'in=s' => \$INFILE);
+ 'in=s' => \$INFILE,
+ 'b|unbuffered' => \$UNBUFFERED);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);
diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl
index c83c30daa..2858cda61 100755
--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@@ -4,7 +4,7 @@
use strict;
use Getopt::Long "GetOptions";
-my ($SRC,$INFILE,$RECASE_MODEL);
+my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED);
my $MOSES = "moses";
my $LANGUAGE = "en"; # English by default;
die("recase.perl --in file --model ini-file > out")
@@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out")
'headline=s' => \$SRC,
'lang=s' => \$LANGUAGE,
'moses=s' => \$MOSES,
- 'model=s' => \$RECASE_MODEL)
+ 'model=s' => \$RECASE_MODEL,
+ 'b|unbuffered' => \$UNBUFFERED)
&& defined($INFILE)
&& defined($RECASE_MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my %treated_languages = map { ($_,1) } qw/en cs/;
die "I don't know any rules for $LANGUAGE. Use 'en' as the default."
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index 0e2df27a2..517f5c7a1 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
# apply switches
-my $MODEL;
-die("truecase.perl --model truecaser < in > out")
- unless &GetOptions('model=s' => \$MODEL);
+my ($MODEL, $UNBUFFERED);
+die("truecase.perl --model MODEL [-b] < in > out")
+ unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
+ && defined($MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
my (%BEST,%KNOWN);
open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
new file mode 100644
index 000000000..c6b9af8ca
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
@@ -0,0 +1,103 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Dr
+dr
+kb
+Kb
+vö
+Vö
+pl
+Pl
+ca
+Ca
+min
+Min
+max
+Max
+ún
+Ún
+prof
+Prof
+de
+De
+du
+Du
+Szt
+St
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+
+# Month name abbreviations
+jan #NUMERIC_ONLY#
+Jan #NUMERIC_ONLY#
+Feb #NUMERIC_ONLY#
+feb #NUMERIC_ONLY#
+márc #NUMERIC_ONLY#
+Márc #NUMERIC_ONLY#
+ápr #NUMERIC_ONLY#
+Ápr #NUMERIC_ONLY#
+máj #NUMERIC_ONLY#
+Máj #NUMERIC_ONLY#
+jún #NUMERIC_ONLY#
+Jún #NUMERIC_ONLY#
+Júl #NUMERIC_ONLY#
+júl #NUMERIC_ONLY#
+aug #NUMERIC_ONLY#
+Aug #NUMERIC_ONLY#
+Szept #NUMERIC_ONLY#
+szept #NUMERIC_ONLY#
+okt #NUMERIC_ONLY#
+Okt #NUMERIC_ONLY#
+nov #NUMERIC_ONLY#
+Nov #NUMERIC_ONLY#
+dec #NUMERIC_ONLY#
+Dec #NUMERIC_ONLY#
+
+# Other abbreviations
+tel #NUMERIC_ONLY#
+Tel #NUMERIC_ONLY#
+Fax #NUMERIC_ONLY#
+fax #NUMERIC_ONLY#
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
new file mode 100644
index 000000000..81754a17a
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
@@ -0,0 +1,100 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+dr
+Dr
+med
+prof
+Prof
+inž
+Inž
+ist.loc
+Ist.loc
+kor.loc
+Kor.loc
+v.i
+vietn
+Vietn
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+a.l
+t.p
+pārb
+Pārb
+vec
+Vec
+inv
+Inv
+sk
+Sk
+spec
+Spec
+vienk
+Vienk
+virz
+Virz
+māksl
+Māksl
+mūz
+Mūz
+akad
+Akad
+soc
+Soc
+galv
+Galv
+vad
+Vad
+sertif
+Sertif
+folkl
+Folkl
+hum
+Hum
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index f59cd5f86..986a2dfb5 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -171,7 +171,7 @@ if ($TIMING)
# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
-# return: another array cotaining a batch of tokenized texts for the input array
+# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
my(@text_list) = @_;
diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index bea32052a..2865fe391 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
if (-e $l1input) {
$opn = $l1input;
} elsif (-e $l1input.".gz") {
- $opn = "zcat $l1input.gz |";
+ $opn = "gunzip -c $l1input.gz |";
} else {
die "Error: $l1input does not exist";
}
@@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
if (-e $l2input) {
$opn = $l2input;
} elsif (-e $l2input.".gz") {
- $opn = "zcat $l2input.gz |";
+ $opn = "gunzip -c $l2input.gz |";
} else {
die "Error: $l2input does not exist";
}
@@ -160,3 +160,4 @@ sub word_count {
my @w = split(/ /,$line);
return scalar @w;
}
+
diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py
index 8bef034de..86c8b300e 100755
--- a/scripts/training/filter-rule-table.py
+++ b/scripts/training/filter-rule-table.py
@@ -40,7 +40,8 @@ def printUsage():
def main():
parser = optparse.OptionParser()
parser.add_option("-c", "--min-non-initial-rule-count",
- action="store", dest="minCount", type="int", default="1",
+ action="store", dest="minCount",
+                      type="float", default=0.0,
help="prune non-initial rules where count is below N",
metavar="N")
(options, args) = parser.parse_args()
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 688e8ce55..9f5f25f15 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl -w
# $Id$
# Usage:
# mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
@@ -371,7 +371,7 @@ my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set t
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
print "Could not find $pro_optimizer, installing it in $mertdir\n";
- my $megam_url = "http://www.umiacs.umd.edu/~hal/megam/";
+ my $megam_url = "http://hal3.name/megam";
if (&is_mac_osx()) {
die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
}
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 5b0553581..e4292007e 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -38,8 +38,9 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
@_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
- $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_CORPUS, $_BASELINE_ALIGNMENT,
+ $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
$_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $IGNORE);
+my $_BASELINE_CORPUS = "";
my $_CORES = 1;
my $debug = 0; # debug this script, do not delete any files in debug mode
diff --git a/util/file.cc b/util/file.cc
index 86d9b12de..c7d8e23b2 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -111,15 +111,26 @@ void ResizeOrThrow(int fd, uint64_t to) {
UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes");
}
+namespace {
+std::size_t GuardLarge(std::size_t size) {
+ // The following operating systems have broken read/write/pread/pwrite that
+ // only supports up to 2^31.
+#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID)
+ return std::min(static_cast<std::size_t>(INT_MAX), size);
+#else
+ return size;
+#endif
+}
+}
+
std::size_t PartialRead(int fd, void *to, std::size_t amount) {
#if defined(_WIN32) || defined(_WIN64)
- amount = min(static_cast<std::size_t>(INT_MAX), amount);
- int ret = _read(fd, to, amount);
+ int ret = _read(fd, to, GuardLarge(amount));
#else
errno = 0;
ssize_t ret;
do {
- ret = read(fd, to, amount);
+ ret = read(fd, to, GuardLarge(amount));
} while (ret == -1 && errno == EINTR);
#endif
UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
@@ -169,11 +180,13 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
ssize_t ret;
errno = 0;
do {
+ ret =
#ifdef OS_ANDROID
- ret = pread64(fd, to, size, off);
+ pread64
#else
- ret = pread(fd, to, size, off);
+ pread
#endif
+ (fd, to, GuardLarge(size), off);
} while (ret == -1 && errno == EINTR);
if (ret <= 0) {
UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
@@ -190,14 +203,20 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
const uint8_t *data = static_cast<const uint8_t*>(data_void);
while (size) {
#if defined(_WIN32) || defined(_WIN64)
- int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
+ int ret;
#else
- errno = 0;
ssize_t ret;
+#endif
+ errno = 0;
do {
- ret = write(fd, data, size);
- } while (ret == -1 && errno == EINTR);
+ ret =
+#if defined(_WIN32) || defined(_WIN64)
+ _write
+#else
+ write
#endif
+ (fd, data, GuardLarge(size));
+ } while (ret == -1 && errno == EINTR);
UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes");
data += ret;
size -= ret;
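
Reviewer note: GuardLarge only clamps a single syscall; correctness still relies on the callers looping over short counts. A simplified sketch of that calling pattern (POSIX-only, assuming nothing about util/file.cc beyond what the hunks show):

#include <algorithm>
#include <cerrno>
#include <climits>
#include <cstddef>
#include <stdexcept>
#include <unistd.h>

// Read exactly `size` bytes: clamp each request to INT_MAX for platforms
// whose read() breaks above 2^31, retry on EINTR, and advance on short reads.
void ReadAll(int fd, char *to, std::size_t size) {
  while (size) {
    const std::size_t chunk = std::min(static_cast<std::size_t>(INT_MAX), size);
    const ssize_t got = read(fd, to, chunk);
    if (got == -1 && errno == EINTR) continue;  // interrupted: retry
    if (got <= 0) throw std::runtime_error("read failed or unexpected EOF");
    to += got;
    size -= static_cast<std::size_t>(got);
  }
}
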
diff --git a/util/read_compressed.cc b/util/read_compressed.cc
index b81549e42..b62a6e833 100644
--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@@ -180,12 +180,73 @@ class GZip : public ReadBase {
};
#endif // HAVE_ZLIB
+const uint8_t kBZMagic[3] = {'B', 'Z', 'h'};
+
#ifdef HAVE_BZLIB
class BZip : public ReadBase {
public:
- explicit BZip(int fd, void *already_data, std::size_t already_size) {
+ BZip(int fd, void *already_data, std::size_t already_size) {
scoped_fd hold(fd);
closer_.reset(FDOpenReadOrThrow(hold));
+ file_ = NULL;
+ Open(already_data, already_size);
+ }
+
+ BZip(FILE *file, void *already_data, std::size_t already_size) {
+ closer_.reset(file);
+ file_ = NULL;
+ Open(already_data, already_size);
+ }
+
+ ~BZip() {
+ Close(file_);
+ }
+
+ std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+ assert(file_);
+ int bzerror = BZ_OK;
+ int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
+ long pos = ftell(closer_.get());
+ if (pos != -1) ReadCount(thunk) = pos;
+ switch (bzerror) {
+ case BZ_STREAM_END:
+ /* bzip2 files can be concatenated by e.g. pbzip2. Annoyingly, the
+ * library doesn't handle this internally. This gets the trailing
+       * data, pads it up to the magic length as needed, validates the magic, and
+ * reopens.
+ */
+ {
+ bzerror = BZ_OK;
+ void *trailing_data;
+ int trailing_size;
+ BZ2_bzReadGetUnused(&bzerror, file_, &trailing_data, &trailing_size);
+ UTIL_THROW_IF(bzerror != BZ_OK, BZException, "bzip2 error in BZ2_bzReadGetUnused " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+ std::string trailing(static_cast<const char*>(trailing_data), trailing_size);
+ Close(file_);
+
+ if (trailing_size < (int)sizeof(kBZMagic)) {
+ trailing.resize(sizeof(kBZMagic));
+ if (1 != fread(&trailing[trailing_size], sizeof(kBZMagic) - trailing_size, 1, closer_.get())) {
+ UTIL_THROW_IF(trailing_size, BZException, "File has trailing cruft");
+ // Legitimate end of file.
+ ReplaceThis(new Complete(), thunk);
+ return ret;
+ }
+ }
+ UTIL_THROW_IF(memcmp(trailing.data(), kBZMagic, sizeof(kBZMagic)), BZException, "Trailing cruft is not another bzip2 stream");
+ Open(&trailing[0], trailing.size());
+ }
+ return ret;
+ case BZ_OK:
+ return ret;
+ default:
+ UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+ }
+ }
+
+ private:
+ void Open(void *already_data, std::size_t already_size) {
+ assert(!file_);
int bzerror = BZ_OK;
file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size);
switch (bzerror) {
@@ -199,38 +260,23 @@ class BZip : public ReadBase {
UTIL_THROW(BZException, "IO error reading file");
case BZ_MEM_ERROR:
throw std::bad_alloc();
+ default:
+ UTIL_THROW(BZException, "Unknown bzip2 error code " << bzerror);
}
+ assert(file_);
}
- ~BZip() {
+ static void Close(BZFILE *&file) {
+ if (file == NULL) return;
int bzerror = BZ_OK;
- BZ2_bzReadClose(&bzerror, file_);
+ BZ2_bzReadClose(&bzerror, file);
if (bzerror != BZ_OK) {
- std::cerr << "bz2 readclose error" << std::endl;
+ std::cerr << "bz2 readclose error number " << bzerror << std::endl;
abort();
}
+ file = NULL;
}
- std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
- int bzerror = BZ_OK;
- int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
- long pos;
- switch (bzerror) {
- case BZ_STREAM_END:
- pos = ftell(closer_.get());
- if (pos != -1) ReadCount(thunk) = pos;
- ReplaceThis(new Complete(), thunk);
- return ret;
- case BZ_OK:
- pos = ftell(closer_.get());
- if (pos != -1) ReadCount(thunk) = pos;
- return ret;
- default:
- UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
- }
- }
-
- private:
scoped_FILE closer_;
BZFILE *file_;
};
@@ -346,11 +392,11 @@ MagicResult DetectMagic(const void *from_void) {
if (header[0] == 0x1f && header[1] == 0x8b) {
return GZIP;
}
- if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') {
+ if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) {
return BZIP;
}
- const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
- if (!memcmp(header, xzmagic, 6)) {
+ const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
+ if (!memcmp(header, kXZMagic, sizeof(kXZMagic))) {
return XZIP;
}
return UNKNOWN;
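
Reviewer note on the concatenated-stream handling above: tools such as pbzip2 emit several independent bzip2 streams back to back, and libbz2 reports BZ_STREAM_END at each boundary instead of continuing. A simplified sketch of the reopen loop using the stock bzlib calls (buffer handling and error paths are illustrative, not the code in this diff):

#include <bzlib.h>
#include <cstddef>
#include <cstdio>
#include <cstring>

// Decompress a possibly concatenated .bz2 file: after each BZ_STREAM_END,
// recover the unused trailing bytes, pad them up to the "BZh" magic length,
// validate the magic, and reopen a fresh reader on the same FILE*.
long DecompressAll(FILE *in, char *out, int capacity) {
  int bzerror = BZ_OK;
  long total = 0;
  BZFILE *bz = BZ2_bzReadOpen(&bzerror, in, 0, 0, NULL, 0);
  while (bz && bzerror == BZ_OK && total < capacity) {
    const int got = BZ2_bzRead(&bzerror, bz, out + total, capacity - total);
    if (got > 0) total += got;
    if (bzerror != BZ_STREAM_END) continue;  // BZ_OK keeps reading; errors end the loop
    void *unused; int nUnused;
    BZ2_bzReadGetUnused(&bzerror, bz, &unused, &nUnused);
    char trailing[BZ_MAX_UNUSED + 3];
    std::memcpy(trailing, unused, nUnused);
    BZ2_bzReadClose(&bzerror, bz);
    bz = NULL;
    if (nUnused < 3 &&
        std::fread(trailing + nUnused, 1, 3 - nUnused, in) != std::size_t(3 - nUnused))
      break;  // legitimate end of file
    if (std::memcmp(trailing, "BZh", 3)) break;  // trailing cruft is not another stream
    bzerror = BZ_OK;
    bz = BZ2_bzReadOpen(&bzerror, in, 0, 0, trailing, nUnused < 3 ? 3 : nUnused);
  }
  if (bz) BZ2_bzReadClose(&bzerror, bz);
  return total;
}
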