Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- Jamroot 4
-rw-r--r-- OnDiskPt/queryOnDiskPt.cpp 2
-rw-r--r-- contrib/server/mosesserver.cpp 39
-rw-r--r-- moses-cmd/Jamfile 9
-rw-r--r-- moses/BitmapContainer.cpp 14
-rw-r--r-- moses/Manager.cpp 4
-rw-r--r-- moses/TranslationModel/UG/Jamfile 35
-rw-r--r-- moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc 50
-rw-r--r-- moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h 18
-rw-r--r-- moses/TranslationModel/UG/mm/Jamfile 19
-rw-r--r-- moses/TranslationModel/UG/mm/custom-pt.cc 9
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext.cc 183
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext.h 39
-rw-r--r-- moses/TranslationModel/UG/mm/ug_im_ttrack.h 34
-rw-r--r-- moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h 21
-rw-r--r-- moses/TranslationModel/UG/mm/ug_phrasepair.cc 97
-rw-r--r-- moses/TranslationModel/UG/mm/ug_phrasepair.h 243
-rw-r--r-- moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h 46
-rw-r--r-- moses/TranslationModel/UG/mmsapt.cpp 1034
-rw-r--r-- moses/TranslationModel/UG/mmsapt.h 87
-rw-r--r-- moses/TranslationModel/UG/mmsapt_align.cc 607
-rw-r--r-- moses/TranslationModel/UG/mmsapt_phrase_scorers.h 269
-rw-r--r-- moses/TranslationModel/UG/ptable-lookup.cc 14
-rw-r--r-- moses/TranslationModel/UG/sapt_phrase_key.h 13
-rw-r--r-- moses/TranslationModel/UG/sapt_phrase_scorers.h 12
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_base.h 103
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_coherence.h 33
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_lex1.h 70
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_logcnt.h 65
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_pbwd.h 58
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_pfwd.h 70
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_provenance.h 47
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_rareness.h 41
-rw-r--r-- moses/TranslationModel/UG/sapt_pscore_unaligned.h 67
-rw-r--r-- moses/TranslationModel/UG/sim-pe.cc 83
-rw-r--r-- moses/TranslationModel/UG/try-align.cc 47
-rw-r--r-- moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp 4
-rw-r--r-- moses/TypeDef.h 6
-rw-r--r-- moses/Util.h 4
-rw-r--r-- scripts/server/moses.py 10
-rwxr-xr-x scripts/server/sim-pe.py 57
42 files changed, 2365 insertions, 1303 deletions
diff --git a/.gitignore b/.gitignore
index f870bed03..e7c37d86c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,3 +79,4 @@ nbproject/
mingw/MosesGUI/MosesGUI.e4p
mingw/MosesGUI/_eric4project/
+contrib/m4m/merge-sorted
diff --git a/Jamroot b/Jamroot
index 283b4dd6f..79ec39940 100644
--- a/Jamroot
+++ b/Jamroot
@@ -152,13 +152,15 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses
if [ option.get "with-mm" : : "yes" ]
{
alias mm :
+ moses/TranslationModel/UG//spe-check-coverage2
moses/TranslationModel/UG//ptable-lookup
+ moses/TranslationModel/UG//sim-pe
+ moses/TranslationModel/UG//spe-check-coverage
moses/TranslationModel/UG/mm//mtt-build
moses/TranslationModel/UG/mm//mtt-dump
moses/TranslationModel/UG/mm//symal2mam
moses/TranslationModel/UG/mm//mam2symal
moses/TranslationModel/UG/mm//mam_verify
- moses/TranslationModel/UG/mm//custom-pt
moses/TranslationModel/UG/mm//mmlex-build
moses/TranslationModel/UG/mm//mmlex-lookup
moses/TranslationModel/UG/mm//mtt-count-words
diff --git a/OnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp
index a38fc5435..77576d956 100644
--- a/OnDiskPt/queryOnDiskPt.cpp
+++ b/OnDiskPt/queryOnDiskPt.cpp
@@ -22,7 +22,7 @@ int main(int argc, char **argv)
{
int tableLimit = 20;
std::string ttable = "";
- bool useAlignments = false;
+ // bool useAlignments = false;
for(int i = 1; i < argc; i++) {
if(!strcmp(argv[i], "-tlimit")) {
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 1ff11f0ae..f14111f33 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -4,6 +4,7 @@
#include <algorithm>
+#include "moses/Util.h"
#include "moses/ChartManager.h"
#include "moses/Hypothesis.h"
#include "moses/Manager.h"
@@ -59,7 +60,7 @@ public:
if(add2ORLM_) {
//updateORLM();
}
- cerr << "Done inserting\n";
+ XVERBOSE(1,"Done inserting\n");
//PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
map<string, xmlrpc_c::value> retData;
//*retvalP = xmlrpc_c::value_struct(retData);
@@ -120,17 +121,17 @@ public:
if(si == params.end())
throw xmlrpc_c::fault("Missing source sentence", xmlrpc_c::fault::CODE_PARSE);
source_ = xmlrpc_c::value_string(si->second);
- cerr << "source = " << source_ << endl;
+ XVERBOSE(1,"source = " << source_ << endl);
si = params.find("target");
if(si == params.end())
throw xmlrpc_c::fault("Missing target sentence", xmlrpc_c::fault::CODE_PARSE);
target_ = xmlrpc_c::value_string(si->second);
- cerr << "target = " << target_ << endl;
+ XVERBOSE(1,"target = " << target_ << endl);
si = params.find("alignment");
if(si == params.end())
throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE);
alignment_ = xmlrpc_c::value_string(si->second);
- cerr << "alignment = " << alignment_ << endl;
+ XVERBOSE(1,"alignment = " << alignment_ << endl);
si = params.find("bounded");
bounded_ = (si != params.end());
si = params.find("updateORLM");
@@ -224,7 +225,7 @@ public:
}
const string source((xmlrpc_c::value_string(si->second)));
- cerr << "Input: " << source << endl;
+ XVERBOSE(1,"Input: " << source << endl);
si = params.find("align");
bool addAlignInfo = (si != params.end());
si = params.find("word-align");
@@ -287,13 +288,13 @@ public:
}
} else {
Sentence sentence;
- const vector<FactorType> &inputFactorOrder =
- staticData.GetInputFactorOrder();
+ const vector<FactorType> &
+ inputFactorOrder = staticData.GetInputFactorOrder();
stringstream in(source + "\n");
sentence.Read(in,inputFactorOrder);
size_t lineNumber = 0; // TODO: Include sentence request number here?
Manager manager(lineNumber, sentence, staticData.GetSearchAlgorithm());
- manager.ProcessSentence();
+ manager.ProcessSentence();
const Hypothesis* hypo = manager.GetBestHypothesis();
vector<xmlrpc_c::value> alignInfo;
@@ -331,7 +332,7 @@ public:
pair<string, xmlrpc_c::value>
text("text", xmlrpc_c::value_string(out.str()));
retData.insert(text);
- cerr << "Output: " << out.str() << endl;
+ XVERBOSE(1,"Output: " << out.str() << endl);
*retvalP = xmlrpc_c::value_struct(retData);
}
@@ -574,7 +575,7 @@ int main(int argc, char** argv)
{
//Extract port and log, send other args to moses
- char** mosesargv = new char*[argc+2];
+ char** mosesargv = new char*[argc+2]; // why "+2" [UG]
int mosesargc = 0;
int port = 8080;
const char* logfile = "/dev/null";
@@ -634,11 +635,11 @@ int main(int argc, char** argv)
myRegistry.addMethod("updater", updater);
myRegistry.addMethod("optimize", optimizer);
- xmlrpc_c::serverAbyss myAbyssServer(
- myRegistry,
- port, // TCP port on which to listen
- logfile
- );
+ xmlrpc_c::serverAbyss myAbyssServer(
+ myRegistry,
+ port, // TCP port on which to listen
+ logfile
+ );
/* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
xmlrpc_c::serverAbyss myAbyssServer(
xmlrpc_c::serverAbyss::constrOpt()
@@ -648,12 +649,10 @@ int main(int argc, char** argv)
.allowOrigin("*")
);
*/
-
- cerr << "Listening on port " << port << endl;
+
+ XVERBOSE(1,"Listening on port " << port << endl);
if (isSerial) {
- while(1) {
- myAbyssServer.runOnce();
- }
+ while(1) myAbyssServer.runOnce();
} else {
myAbyssServer.run();
}
diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index bddc10911..d257cd26c 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -3,4 +3,11 @@ alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z
exe moses : Main.cpp deps ;
exe lmbrgrid : LatticeMBRGrid.cpp deps ;
-alias programs : moses lmbrgrid ;
+exe simulate-pe :
+simulate-pe.cc
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_program_options
+deps
+;
+
+alias programs : moses lmbrgrid simulate-pe ;
diff --git a/moses/BitmapContainer.cpp b/moses/BitmapContainer.cpp
index 981b04895..ee2d55fc8 100644
--- a/moses/BitmapContainer.cpp
+++ b/moses/BitmapContainer.cpp
@@ -161,13 +161,17 @@ BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
}
if (m_translations.size() > 1) {
- UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
- "Non-monotonic future score");
+ UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
+ "Non-monotonic future score: "
+ << m_translations.Get(0)->GetFutureScore() << " vs. "
+ << m_translations.Get(1)->GetFutureScore());
}
if (m_hypotheses.size() > 1) {
UTIL_THROW_IF2(m_hypotheses[0]->GetTotalScore() < m_hypotheses[1]->GetTotalScore(),
- "Non-monotonic total score");
+ "Non-monotonic total score: "
+ << m_hypotheses[0]->GetTotalScore() << " vs. "
+ << m_hypotheses[1]->GetTotalScore());
}
HypothesisScoreOrdererWithDistortion orderer (&transOptRange);
@@ -442,7 +446,9 @@ BitmapContainer::ProcessBestHypothesis()
if (!Empty()) {
HypothesisQueueItem *check = Dequeue(true);
UTIL_THROW_IF2(item->GetHypothesis()->GetTotalScore() < check->GetHypothesis()->GetTotalScore(),
- "Non-monotonic total score");
+ "Non-monotonic total score: "
+ << item->GetHypothesis()->GetTotalScore() << " vs. "
+ << check->GetHypothesis()->GetTotalScore());
}
// Logging for the criminally insane
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 6bc82378e..196f4d997 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -105,7 +105,9 @@ void Manager::ProcessSentence()
// some reporting on how long this took
IFVERBOSE(1) {
GetSentenceStats().StopTimeCollectOpts();
- TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took " << GetSentenceStats().GetTimeCollectOpts() << " seconds" << endl);
+ TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took "
+ << GetSentenceStats().GetTimeCollectOpts() << " seconds at "
+ << __FILE__ << ":" << __LINE__ << endl);
}
// search for best translation with the specified algorithm
diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile
index ecd175a65..c36d4a072 100644
--- a/moses/TranslationModel/UG/Jamfile
+++ b/moses/TranslationModel/UG/Jamfile
@@ -20,6 +20,39 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
$(TOP)/util//kenutil
;
+exe sim-pe :
+sim-pe.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe spe-check-coverage :
+spe-check-coverage.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
+exe spe-check-coverage2 :
+spe-check-coverage2.cc
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic
+$(TOP)//boost_iostreams
+$(TOP)//boost_program_options
+$(TOP)/moses/TranslationModel/UG/mm//mm
+$(TOP)/moses/TranslationModel/UG//mmsapt
+$(TOP)/util//kenutil
+;
+
install $(PREFIX)/bin : try-align ;
-fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ;
+fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ;
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
new file mode 100644
index 000000000..7dc2cd18f
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
@@ -0,0 +1,50 @@
+//-*- c++ -*-
+#include "ug_splice_arglist.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+#include <boost/foreach.hpp>
+
+namespace Moses {
+
+ void
+ filter_arguments(int const argc_in, char const* const* const argv_in,
+ int & argc_moses, char*** argv_moses,
+ int & argc_other, char*** argv_other,
+ vector<pair<string,int> > const& filter)
+ {
+ *argv_moses = new char*[argc_in];
+ *argv_other = new char*[argc_in];
+ (*argv_moses)[0] = new char[strlen(argv_in[0])+1];
+ strcpy((*argv_moses)[0], argv_in[0]);
+ argc_moses = 1;
+ argc_other = 0;
+ typedef pair<string,int> option;
+ int i = 1;
+ while (i < argc_in)
+ {
+ BOOST_FOREACH(option const& o, filter)
+ {
+ if (o.first == argv_in[i])
+ {
+ (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
+ strcpy((*argv_other)[argc_other++],argv_in[i]);
+ for (int k = 0; k < o.second; ++k)
+ {
+ UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-',
+ "[" << HERE << "] Missing argument for "
+ << "parameter " << o.first << "!");
+ (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
+ strcpy((*argv_other)[argc_other++],argv_in[i]);
+ }
+ if (++i >= argc_in) break;
+ }
+ }
+ if (i >= argc_in) break;
+ (*argv_moses)[argc_moses] = new char[strlen(argv_in[i])+1];
+ strcpy((*argv_moses)[argc_moses++], argv_in[i++]);
+ }
+ }
+
+} // namespace Moses
+
+
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
new file mode 100644
index 000000000..e56585e8a
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
@@ -0,0 +1,18 @@
+//-*- c++ -*-
+#pragma once
+#include <vector>
+#include <string>
+namespace Moses {
+ using namespace std;
+
+ // Function to splice the argument list (e.g. before handing it over to
+ // the Moses LoadParam() function). /filter/ is a vector of argument names
+ // and the number of arguments that follow each of them.
+ void
+ filter_arguments(int const argc_in, char const* const* const argv_in,
+ int & argc_moses, char*** argv_moses,
+ int & argc_other, char*** argv_other,
+ vector<pair<string,int> > const& filter);
+
+
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/Jamfile b/moses/TranslationModel/UG/mm/Jamfile
index 2cc923581..8d8af050a 100644
--- a/moses/TranslationModel/UG/mm/Jamfile
+++ b/moses/TranslationModel/UG/mm/Jamfile
@@ -72,15 +72,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
-exe custom-pt :
-custom-pt.cc
-$(TOP)/moses//moses
-$(TOP)//boost_iostreams
-$(TOP)//boost_program_options
-$(TOP)/moses/TranslationModel/UG/mm//mm
-$(TOP)/moses/TranslationModel/UG/generic//generic
-$(TOP)/util//kenutil
-;
+# exe custom-pt :
+# custom-pt.cc
+# $(TOP)/moses//moses
+# $(TOP)//boost_iostreams
+# $(TOP)//boost_program_options
+# $(TOP)/moses/TranslationModel/UG/mm//mm
+# $(TOP)/moses/TranslationModel/UG/generic//generic
+# $(TOP)/util//kenutil
+# ;
exe calc-coverage :
@@ -98,7 +98,6 @@ mtt-dump
mtt-count-words
symal2mam
mam2symal
-custom-pt
mmlex-build
mmlex-lookup
mam_verify
diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc
index 1c1e0893c..e52772b48 100644
--- a/moses/TranslationModel/UG/mm/custom-pt.cc
+++ b/moses/TranslationModel/UG/mm/custom-pt.cc
@@ -1,6 +1,6 @@
// build a phrase table for the given input
// #include "ug_lexical_phrase_scorer2.h"
-
+#if 0
#include <stdint.h>
#include <string>
#include <vector>
@@ -25,7 +25,7 @@
#include "ug_bitext.h"
#include "../mmsapt_phrase_scorers.h"
#include "ug_lexical_phrase_scorer2.h"
-
+#include "../sapt_phrase_scorers.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
@@ -110,6 +110,7 @@ int main(int argc, char* argv[])
{
// assert(argc == 4);
#if 0
+#if 0
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
@@ -182,7 +183,7 @@ int main(int argc, char* argv[])
}
}
}
-
+#endif
exit(0);
}
-
+#endif
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 8dbbdcb92..a1a6dff7b 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -158,99 +158,25 @@ namespace Moses
jstats::
invalidate()
{
- my_rcnt = 0;
+ if (my_wcnt > 0)
+ my_wcnt *= -1;
}
- bool
+ void
jstats::
- valid()
- {
- return my_rcnt != 0;
- }
-
- bool
- PhrasePair::
- operator<=(PhrasePair const& other) const
+ validate()
{
- return this->score <= other.score;
+ if (my_wcnt < 0)
+ my_wcnt *= -1;
}
bool
- PhrasePair::
- operator>=(PhrasePair const& other) const
- {
- return this->score >= other.score;
- }
-
- bool
- PhrasePair::
- operator<(PhrasePair const& other) const
- {
- return this->score < other.score;
- }
-
- bool
- PhrasePair::
- operator>(PhrasePair const& other) const
- {
- return this->score > other.score;
- }
-
- PhrasePair::
- PhrasePair() {}
-
- PhrasePair::
- PhrasePair(PhrasePair const& o)
- : p1(o.p1),
- p2(o.p2),
- raw1(o.raw1),
- raw2(o.raw2),
- sample1(o.sample1),
- sample2(o.sample2),
- good1(o.good1),
- good2(o.good2),
- joint(o.joint),
- fvals(o.fvals),
- aln(o.aln),
- score(o.score)
- {
- for (size_t i = 0; i <= po_other; ++i)
- {
- dfwd[i] = o.dfwd[i];
- dbwd[i] = o.dbwd[i];
- }
- }
-
- void
- PhrasePair::
- init(uint64_t const pid1, pstats const& ps, size_t const numfeats)
+ jstats::
+ valid()
{
- p1 = pid1;
- p2 = 0;
- raw1 = ps.raw_cnt;
- sample1 = ps.sample_cnt;
- sample2 = 0;
- good1 = ps.good;
- good2 = 0;
- raw2 = 0;
- fvals.resize(numfeats);
+ return my_wcnt >= 0;
}
- void
- PhrasePair::
- init(uint64_t const pid1,
- pstats const& ps1,
- pstats const& ps2,
- size_t const numfeats)
- {
- p1 = pid1;
- raw1 = ps1.raw_cnt + ps2.raw_cnt;
- sample1 = ps1.sample_cnt + ps2.sample_cnt;
- sample2 = 0;
- good1 = ps1.good + ps2.good;
- good2 = 0;
- fvals.resize(numfeats);
- }
float
lbop(size_t const tries, size_t const succ, float const confidence)
@@ -261,85 +187,6 @@ namespace Moses
find_lower_bound_on_p(tries, succ, confidence)));
}
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, jstats const& js)
- {
- p2 = pid2;
- raw2 = js.cnt2();
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- float total_fwd = 0, total_bwd = 0;
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- total_fwd += js.dcnt_fwd(po)+1;
- total_bwd += js.dcnt_bwd(po)+1;
- }
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
- dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
- }
- return *this;
- }
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, jstats const& js1, jstats const& js2)
- {
- p2 = pid2;
- raw2 = js1.cnt2() + js2.cnt2();
- joint = js1.rcnt() + js2.rcnt();
- assert(js1.aln().size() || js2.aln().size());
- if (js1.aln().size())
- aln = js1.aln()[0].second;
- else if (js2.aln().size())
- aln = js2.aln()[0].second;
- for (int i = po_first; i < po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
- dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
- }
- return *this;
- }
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2,
- size_t const raw2extra,
- jstats const& js)
- {
- p2 = pid2;
- raw2 = js.cnt2() + raw2extra;
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
- dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
- }
- return *this;
- }
-
- float
- PhrasePair::
- eval(vector<float> const& w)
- {
- assert(w.size() == this->fvals.size());
- this->score = 0;
- for (size_t i = 0; i < w.size(); ++i)
- this->score += w[i] * this->fvals[i];
- return this->score;
- }
-
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
@@ -371,7 +218,8 @@ namespace Moses
uint32_t row,col; char c;
while (ibuf >> row >> c >> col)
{
- assert(c == '-');
+ UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+ << "Error in alignment information:\n" << a);
binwrite(obuf,row);
binwrite(obuf,col);
}
@@ -639,7 +487,6 @@ namespace Moses
cout << string(90,'-') << endl;
}
-
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
@@ -654,13 +501,13 @@ namespace Moses
ushort ns1,ne1,ne2;
if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
- {
- return po_other;
- }
+ return po_other;
+
if (ns1 >= e1)
{
for (ushort j = e1; j < ns1; ++j)
- if (a1[j].size()) return po_jfwd;
+ if (a1[j].size())
+ return po_jfwd;
return po_mono;
}
else
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 397253973..4cb34c02d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -56,6 +56,7 @@ namespace Moses {
class Mmsapt;
namespace bitext
{
+ template<typename TKN> class Bitext;
using namespace ugdiss;
template<typename TKN> class Bitext;
@@ -120,6 +121,7 @@ namespace Moses {
void add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient);
void invalidate();
+ void validate();
bool valid();
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
@@ -157,43 +159,6 @@ namespace Moses {
uint32_t fwd_o, uint32_t bwd_o);
};
- class
- PhrasePair
- {
- public:
- uint64_t p1, p2;
- uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
- vector<float> fvals;
- float dfwd[po_other+1];
- float dbwd[po_other+1];
- vector<uchar> aln;
- // float avlex12,avlex21; // average lexical probs (Moses std)
- // float znlex1,znlex2; // zens-ney lexical smoothing
- // float colex1,colex2; // based on raw lexical occurrences
- float score;
- PhrasePair();
- PhrasePair(PhrasePair const& o);
- bool operator<(PhrasePair const& other) const;
- bool operator>(PhrasePair const& other) const;
- bool operator<=(PhrasePair const& other) const;
- bool operator>=(PhrasePair const& other) const;
-
- void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
- void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
- size_t const numfeats);
-
- PhrasePair const&
- update(uint64_t const pid2, jstats const& js);
-
- PhrasePair const&
- update(uint64_t const pid2, jstats const& js1, jstats const& js2);
-
- PhrasePair const&
- update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
-
- float eval(vector<float> const& w);
- };
-
template<typename TKN>
class Bitext
diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
index 05066c922..0c6e4afbf 100644
--- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
@@ -16,6 +16,9 @@
#include "tpt_tokenindex.h"
#include "ug_ttrack_base.h"
#include "tpt_tokenindex.h"
+#include "util/exception.hh"
+#include "moses/Util.h"
+
// #include "ug_vocab.h"
// define the corpus buffer size (in sentences) and the
@@ -49,6 +52,8 @@ namespace ugdiss
typename boost::shared_ptr<imTtrack<Token> >
append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
+ void m_check_token_count(); // debugging function
+
public:
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
@@ -70,6 +75,22 @@ namespace ugdiss
};
template<typename Token>
+ void
+ imTtrack<Token>::
+ m_check_token_count()
+ { // sanity check
+ size_t check = 0;
+ BOOST_FOREACH(vector<Token> const& s, *myData)
+ check += s.size();
+ UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]"
+ << " Wrong token count after appending sentence!"
+ << " Counted " << check << " but expected "
+ << this->numToks << " in a total of " << myData->size()
+ << " sentences.");
+
+ }
+
+ template<typename Token>
Token const*
imTtrack<Token>::
sntStart(size_t sid) const // return pointer to beginning of sentence
@@ -111,9 +132,9 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL)
+ : numToks(0)
{
myData.reset(new vector<vector<Token> >());
- numToks = 0;
string line,w;
size_t linectr=0;
boost::unordered_map<string,id_type> H;
@@ -135,6 +156,7 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(size_t reserve)
+ : numToks(0)
{
myData.reset(new vector<vector<Token> >());
if (reserve) myData->reserve(reserve);
@@ -143,9 +165,9 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
+ : numToks(0)
{
myData = d;
- numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
@@ -171,6 +193,9 @@ namespace ugdiss
shared_ptr<imTtrack<TOKEN> >
append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
{
+#if 1
+ if (crp) crp->m_check_token_count();
+#endif
shared_ptr<imTtrack<TOKEN> > ret;
if (crp == NULL)
{
@@ -185,6 +210,11 @@ namespace ugdiss
}
else ret = crp;
ret->myData->push_back(snt);
+ ret->numToks += snt.size();
+
+#if 1
+ ret->m_check_token_count();
+#endif
return ret;
}
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
index 558b5a7fa..b7e359223 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
@@ -27,7 +27,6 @@ namespace ugdiss
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
table_t COOC;
void open(string const& fname);
-
template<typename someint>
void
score(TKN const* snt1, size_t const s1, size_t const e1,
@@ -104,7 +103,19 @@ namespace ugdiss
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
<< ": alpha parameter must be >= 0");
- return float(COOC[s][t]+alpha)/(COOC.m1(s)+alpha);
+ float ret = COOC[s][t]+alpha;
+ ret = (ret?ret:1.)/(COOC.m1(s)+alpha);
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ << ": result not > 0 and <= 1. alpha = " << alpha << "; "
+ << COOC[s][t] << "/" << COOC.m1(s));
+
+#if 0
+ cerr << "[" << s << "," << t << "] "
+ << COOC.m1(s) << "/"
+ << COOC[s][t] << "/"
+ << COOC.m2(t) << endl;
+#endif
+ return ret;
}
template<typename TKN>
@@ -115,7 +126,11 @@ namespace ugdiss
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
<< ": alpha parameter must be >= 0");
- return float(COOC[s][t]+alpha)/(COOC.m2(t)+alpha);
+ float ret = float(COOC[s][t]+alpha);
+ ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ << ": result not > 0 and <= 1.");
+ return ret;
}
template<typename TKN>
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
new file mode 100644
index 000000000..6373f8468
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
@@ -0,0 +1,97 @@
+#include "ug_phrasepair.h"
+namespace Moses {
+ namespace bitext
+ {
+
+#if 0
+ void
+ PhrasePair::
+ init()
+ {
+ p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ }
+
+ void
+ PhrasePair::
+ init(uint64_t const pid1,
+ pstats const& ps1,
+ pstats const& ps2,
+ size_t const numfeats)
+ {
+ p1 = pid1;
+ raw1 = ps1.raw_cnt + ps2.raw_cnt;
+ sample1 = ps1.sample_cnt + ps2.sample_cnt;
+ sample2 = 0;
+ good1 = ps1.good + ps2.good;
+ good2 = 0;
+ joint = 0;
+ fvals.resize(numfeats);
+ }
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2, jstats const& js1, jstats const& js2)
+ {
+ p2 = pid2;
+ raw2 = js1.cnt2() + js2.cnt2();
+ joint = js1.rcnt() + js2.rcnt();
+ assert(js1.aln().size() || js2.aln().size());
+ if (js1.aln().size())
+ aln = js1.aln()[0].second;
+ else if (js2.aln().size())
+ aln = js2.aln()[0].second;
+ for (int i = po_first; i < po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
+ dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
+ }
+ return *this;
+ }
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2, size_t r2)
+ {
+ p2 = pid2;
+ raw2 = r2;
+ joint = 0;
+ return *this;
+ }
+
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2,
+ size_t const raw2extra,
+ jstats const& js)
+ {
+ p2 = pid2;
+ raw2 = js.cnt2() + raw2extra;
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
+ dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
+ }
+ return *this;
+ }
+
+ float
+ PhrasePair::
+ eval(vector<float> const& w)
+ {
+ assert(w.size() == this->fvals.size());
+ this->score = 0;
+ for (size_t i = 0; i < w.size(); ++i)
+ this->score += w[i] * this->fvals[i];
+ return this->score;
+ }
+#endif
+ } // namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..8cd43dc18
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,243 @@
+//-*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+using namespace ugdiss;
+using namespace std;
+
+namespace Moses {
+ namespace bitext
+ {
+
+ // Render the len tokens starting at x as a space-separated string, using
+ // V to map token ids to words. Returns "" for len == 0. Throws (via
+ // UTIL_THROW_IF2) if x is NULL or the token chain ends before len tokens
+ // have been written.
+ template<typename Token>
+ string
+ toString(TokenIndex const& V, Token const* x, size_t const len)
+ {
+   if (len == 0) return "";
+   UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+   ostringstream buf;
+   buf << V[x->id()];
+   size_t written = 1;
+   x = x->next();
+   while (x && written < len)
+     {
+       buf << " " << V[x->id()];
+       ++written;
+       x = x->next();
+     }
+   UTIL_THROW_IF2(written != len, HERE << ": Unexpected end of phrase!");
+   return buf.str();
+ }
+
+ // A pair of phrases (side 1 set by init(), side 2 set by update()), each
+ // given as a pointer to its first token plus a length, together with the
+ // occurrence counts, feature values, orientation scores, word alignment
+ // and overall score attached to the pair.
+ template<typename Token>
+ class
+ PhrasePair
+ {
+ public:
+   Token const* start1; // first token of the side-1 phrase
+   Token const* start2; // first token of the side-2 phrase
+   uint32_t len1;       // number of tokens in the side-1 phrase
+   uint32_t len2;       // number of tokens in the side-2 phrase
+   // uint64_t p1, p2;
+   uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
+   vector<float> fvals;
+   // per-orientation scores; update() stores add-one smoothed relative
+   // frequencies here, one slot for each orientation up to po_other
+   float dfwd[po_other+1];
+   float dbwd[po_other+1];
+   vector<uchar> aln;
+   float score;
+
+   // Start from a fully defined, empty state. Neither init() overload sets
+   // score, dfwd or dbwd, and the copy constructor reads every member, so
+   // leaving them uninitialized here would make copying a freshly
+   // initialized PhrasePair read indeterminate values.
+   PhrasePair()
+     : start1(NULL), start2(NULL)
+     , len1(0), len2(0)
+     , raw1(0), raw2(0), sample1(0), sample2(0), good1(0), good2(0), joint(0)
+     , score(0)
+   {
+     for (int i = 0; i <= po_other; ++i)
+       dfwd[i] = dbwd[i] = 0;
+   }
+   PhrasePair(PhrasePair const& o);
+
+   // adds the counts of /other/ to this pair's counts
+   PhrasePair const& operator+=(PhrasePair const& other);
+
+   // ordering is by score only
+   bool operator<(PhrasePair const& other) const;
+   bool operator>(PhrasePair const& other) const;
+   bool operator<=(PhrasePair const& other) const;
+   bool operator>=(PhrasePair const& other) const;
+
+   // reset counts, lengths and token pointers
+   void init();
+   // set side 1 (and its counts from ps, if given); resize fvals
+   void init(Token const* x, uint32_t const len,
+             pstats const* ps = NULL, size_t const numfeats=0);
+
+   // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
+   // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
+   // size_t const numfeats);
+
+   // PhrasePair const&
+   // update(uint64_t const pid2, size_t r2 = 0);
+
+   // set side 2 and the joint statistics from js
+   PhrasePair const&
+   update(Token const* x, uint32_t const len, jstats const& js);
+
+   // PhrasePair const&
+   // update(uint64_t const pid2, jstats const& js1, jstats const& js2);
+
+   // PhrasePair const&
+   // update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
+
+   // float
+   // eval(vector<float> const& w);
+
+   // Orders phrase pairs by the token id sequence of their side-2 phrase.
+   class SortByTargetIdSeq
+   {
+   public:
+     int cmp(PhrasePair const& a, PhrasePair const& b) const;
+     bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+   };
+ };
+
+ // Set the side-1 phrase (x, len) and its counts. If ps is NULL the side-1
+ // counts are zeroed, otherwise they are copied from ps. Side-2 and joint
+ // counts are reset to zero and fvals is resized to numfeats.
+ template<typename Token>
+ void
+ PhrasePair<Token>::
+ init(Token const* x, uint32_t const len,
+      pstats const* ps, size_t const numfeats)
+ {
+   start1 = x;
+   len1   = len;
+   if (ps == NULL)
+     {
+       raw1 = sample1 = good1 = 0;
+     }
+   else
+     {
+       raw1    = ps->raw_cnt;
+       sample1 = ps->sample_cnt;
+       good1   = ps->good;
+     }
+   // nothing is known about side 2 until update() is called
+   raw2 = sample2 = good2 = joint = 0;
+   fvals.resize(numfeats);
+ }
+
+ // Set the side-2 phrase (x, len) and the joint statistics from js: raw
+ // side-2 count, joint count, the first alignment stored in js (if any),
+ // and per-orientation scores. The scores are add-one smoothed counts
+ // divided by the smoothed total over all orientations. Returns *this.
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ update(Token const* x, uint32_t const len, jstats const& js)
+ {
+   start2 = x;
+   len2   = len;
+   raw2   = js.cnt2();
+   joint  = js.rcnt();
+   assert(js.aln().size());
+   if (js.aln().size() != 0)
+     aln = js.aln()[0].second;
+
+   // first pass: smoothed totals, used as normalizers below
+   float fwd_sum = 0, bwd_sum = 0;
+   for (int k = po_first; k <= po_other; ++k)
+     {
+       PhraseOrientation const po = static_cast<PhraseOrientation>(k);
+       fwd_sum += js.dcnt_fwd(po)+1;
+       bwd_sum += js.dcnt_bwd(po)+1;
+     }
+
+   // second pass: normalize (open question from the author: should raw
+   // counts be kept here instead?)
+   for (int k = po_first; k <= po_other; ++k)
+     {
+       PhraseOrientation const po = static_cast<PhraseOrientation>(k);
+       dfwd[k] = float(js.dcnt_fwd(po)+1)/fwd_sum;
+       dbwd[k] = float(js.dcnt_bwd(po)+1)/bwd_sum;
+     }
+   return *this;
+ }
+
+ // True if this pair's score is lower than other's.
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator<(PhrasePair const& other) const
+ {
+   return other.score > this->score;
+ }
+
+ // True if this pair's score is higher than other's.
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>(PhrasePair const& other) const
+ {
+   return other.score < this->score;
+ }
+
+ // True if this pair's score is lower than or equal to other's.
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator<=(PhrasePair const& other) const
+ {
+   return other.score >= this->score;
+ }
+
+ // True if this pair's score is higher than or equal to other's.
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>=(PhrasePair const& other) const
+ {
+   return other.score <= this->score;
+ }
+
+ // Pool counts: add every count of o (raw, sample, good on both sides,
+ // and joint) to the corresponding count of this pair. Phrase pointers,
+ // feature values, orientation scores, alignment and score are left
+ // untouched. Returns *this.
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ operator+=(PhrasePair const& o)
+ {
+   // side 1
+   raw1    += o.raw1;
+   sample1 += o.sample1;
+   good1   += o.good1;
+   // side 2
+   raw2    += o.raw2;
+   sample2 += o.sample2;
+   good2   += o.good2;
+   // co-occurrence
+   joint   += o.joint;
+   return *this;
+ }
+
+ // Copy constructor: member-wise copy of o, including both orientation
+ // score arrays.
+ template<typename Token>
+ PhrasePair<Token>::
+ PhrasePair(PhrasePair<Token> const& o)
+   : start1(o.start1), start2(o.start2)
+   , len1(o.len1), len2(o.len2)
+   , raw1(o.raw1), raw2(o.raw2)
+   , sample1(o.sample1), sample2(o.sample2)
+   , good1(o.good1), good2(o.good2)
+   , joint(o.joint)
+   , fvals(o.fvals)
+   , aln(o.aln)
+   , score(o.score)
+ {
+   // plain arrays cannot go into the initializer list; copy slot by slot
+   for (int k = po_other; k >= 0; --k)
+     {
+       dfwd[k] = o.dfwd[k];
+       dbwd[k] = o.dbwd[k];
+     }
+ }
+
+ // Three-way lexicographic comparison of the side-2 token id sequences of
+ // a and b. Returns 0 if they are identical, -1 if a sorts first (a is a
+ // proper prefix of b, or has the smaller id at the first difference),
+ // and 1 otherwise.
+ template<typename Token>
+ int
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ cmp(PhrasePair const& a, PhrasePair const& b) const
+ {
+   Token const* ta = a.start2;
+   Token const* tb = b.start2;
+   for (size_t pos = 0; ; ++pos, ta = ta->next(), tb = tb->next())
+     {
+       bool const a_done = (pos == a.len2);
+       bool const b_done = (pos == b.len2);
+       if (a_done || b_done) // at least one phrase is exhausted
+         return a_done ? (b_done ? 0 : -1) : 1;
+       if (ta->id() != tb->id())
+         return ta->id() < tb->id() ? -1 : 1;
+     }
+ }
+
+ // Strict-weak-ordering predicate built on cmp(): true if a sorts before b.
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ operator()(PhrasePair const& a, PhrasePair const& b) const
+ {
+   int const order = this->cmp(a,b);
+   return order < 0;
+ }
+
+ // Reset to an empty phrase pair: both phrase pointers NULL, both lengths
+ // and all counts zero. Feature values, orientation scores, alignment and
+ // score are not touched.
+ template<typename Token>
+ void
+ PhrasePair<Token>::
+ init()
+ {
+   start1 = NULL;
+   start2 = NULL;
+   len1 = len2 = 0;
+   raw1 = raw2 = 0;
+   sample1 = sample2 = 0;
+   good1 = good2 = 0;
+   joint = 0;
+ }
+
+
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
index 14bf6cdad..ab7f96bf0 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@@ -7,6 +7,8 @@
#include "ug_typedefs.h"
#include "tpt_tokenindex.h"
#include <iostream>
+#include "util/exception.hh"
+#include "moses/Util.h"
//#include <cassert>
// #include "ug_bv_iter.h"
@@ -60,10 +62,15 @@ namespace ugdiss
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
+ TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
+ size_t const len,
+ bool full_match_only=true);
+ TSA_tree_iterator(TSA<Token> const* s,
+ Token const* kstart,
Token const* kend,
bool full_match_only=true);
// TSA_tree_iterator(TSA<Token> const* s,
@@ -150,9 +157,12 @@ namespace ugdiss
double approxOccurrenceCount(int p=-1) const
{
assert(root);
+ if (p < 0) p += lower.size();
double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize();
- assert(ret < root->corpus->numTokens());
if (ret < 25) ret = rawCnt(p);
+ UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] "
+ << "Word count mismatch.");
+ assert(ret <= root->corpus->numTokens());
return ret;
}
@@ -320,6 +330,18 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
+ TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
+ : root(s)
+ {
+   // Replay the token sequence that /other/ points to on the index s,
+   // stopping early if s cannot be extended by the next token.
+   Token const* tok = other.getToken(0);
+   size_t k = 0;
+   while (k < other.size() && this->extend(tok->id()))
+     {
+       tok = tok->next();
+       ++k;
+     }
+ };
+
+
+
+ template<typename Token>
+ TSA_tree_iterator<Token>::
TSA_tree_iterator
(TSA<Token> const* r,
id_type const* s,
@@ -385,6 +407,25 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
+ size_t const len, bool full_match_only)
+ : root(s)
+ {
+   // Extend the iterator by up to len tokens starting at kstart. If
+   // full_match_only is set and fewer than len tokens could be matched,
+   // the iterator is emptied again.
+   if (!root) return;
+   size_t matched = 0;
+   while (matched < len && kstart && extend(*kstart))
+     {
+       kstart = kstart->next();
+       ++matched;
+     }
+   bool const complete = (matched == len);
+   if (full_match_only && !complete)
+     {
+       lower.clear();
+       upper.clear();
+     }
+ };
+
+ // DEPRECATED: DO NOT USE. Use the one that takes the length
+ // instead of kend.
+ template<typename Token>
+ TSA_tree_iterator<Token>::
+ TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
Token const* kend, bool full_match_only)
: root(s)
{
@@ -561,8 +602,7 @@ namespace ugdiss
TSA_tree_iterator<Token>::
rawCnt(int p) const
{
- if (p < 0)
- p = lower.size()+p;
+ if (p < 0) p += lower.size();
assert(p>=0);
if (lower.size() == 0) return root->getCorpusSize();
return root->rawCnt(lower[p],upper[p]);
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index dc9945472..596fec4e6 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -1,13 +1,38 @@
#include "mmsapt.h"
#include <boost/foreach.hpp>
+#include <boost/scoped_ptr.hpp>
#include <boost/tokenizer.hpp>
#include <algorithm>
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
+#include "util/exception.hh"
+#include <set>
namespace Moses
{
using namespace bitext;
using namespace std;
using namespace boost;
+
+
+ // uint64_t
+ // pack_phrasekey(uint64_t const shard_id, uint64_t const snt_id,
+ // uint64_t const offset, uint64_t const len)
+ // {
+ // uint64_t one = 1;
+ // // 8 bits - 256 shards
+ // // 13 bits - max offset
+ // // 11 bits - max len
+ // // 32 bits - max sentence id
+ // UTIL_THROW_IF2(shard_id >= 256, "[" << HERE << "] "
+ // << "Shard ID exceeds limit.");
+ // UTIL_THROW_IF2(snt_id >= 4294967296, "[" << HERE << "] "
+ // << "Sentence ID exceeds limit.");
+ // UTIL_THROW_IF2(offset >= 8192, "[" << HERE << "]"
+ // << "Phrase offset exceeds limit.");
+ // UTIL_THROW_IF2(len >= 2048, "[" << HERE << "]"
+ // << "Phrase length exceeds limit.");
+ // return ((shard_id<<56)+(snt_id<<24)+(offset<<11)+len);
+ // }
void
fillIdSeq(Phrase const& mophrase, size_t const ifactor,
@@ -23,7 +48,7 @@ namespace Moses
void
- parseLine(string const& line, map<string,string> & params)
+ parseLine(string const& line, map<string,string> & param)
{
char_separator<char> sep("; ");
tokenizer<char_separator<char> > tokens(line,sep);
@@ -32,9 +57,14 @@ namespace Moses
size_t i = t.find_first_not_of(" =");
size_t j = t.find_first_of(" =",i+1);
size_t k = t.find_first_not_of(" =",j+1);
+ UTIL_THROW_IF2(i == string::npos || k == string::npos,
+ "[" << HERE << "] "
+ << "Parameter specification error near '"
+ << t << "' in moses ini line\n"
+ << line);
assert(i != string::npos);
assert(k != string::npos);
- params[t.substr(i,j)] = t.substr(k);
+ param[t.substr(i,j)] = t.substr(k);
}
}
@@ -57,13 +87,13 @@ namespace Moses
Mmsapt::
Mmsapt(string const& line)
: PhraseDictionary(line)
- , m_lex_alpha(1.0)
- , withLogCountFeatures(false)
- , withCoherence(true)
- , m_pfwd_features("g")
- , m_pbwd_features("g")
- , withPbwd(true)
- , poolCounts(true)
+ // , m_lex_alpha(1.0)
+ // , withLogCountFeatures(false)
+ // , withCoherence(true)
+ // , m_pfwd_features("g")
+ // , m_pbwd_features("g")
+ // , withPbwd(true)
+ // , poolCounts(true)
, ofactor(1,0)
, m_tpc_ctr(0)
{
@@ -94,81 +124,125 @@ namespace Moses
void
Mmsapt::
+ register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry)
+ {
+   // Append ff to registry and tell it where its features start: at the
+   // current end of the feature name list. Then record name, log-value
+   // flag and integer flag for each of the features it provides.
+   registry.push_back(ff);
+   ff->setIndex(m_feature_names.size());
+   int k = 0;
+   while (k < ff->fcnt())
+     {
+       m_feature_names.push_back(ff->fname(k));
+       m_is_logval.push_back(ff->isLogVal(k));
+       m_is_integer.push_back(ff->isIntegerValued(k));
+       ++k;
+     }
+ }
+
+ // Log-value flag recorded by register_ff() for feature i; the lookup is
+ // bounds-checked (vector::at).
+ bool
+ Mmsapt::
+ isLogVal(int i) const
+ {
+   return m_is_logval.at(i);
+ }
+
+ // Integer-valued flag recorded by register_ff() for feature i; the lookup
+ // is bounds-checked (vector::at).
+ bool
+ Mmsapt::
+ isInteger(int i) const
+ {
+   return m_is_integer.at(i);
+ }
+
+ // Parse the feature specification line into this->param, read the
+ // mandatory corpus settings (base, L1, L2), insert a default for every
+ // optional parameter that has not been specified, and throw if param
+ // contains a name that is not in the list of known parameters.
+ void
+ Mmsapt::
init(string const& line)
{
map<string,string>::const_iterator m;
- map<string,string> param;
- parseLine(line,param);
+ parseLine(line,this->param);
+
+ // note: num-features is read before an optional config file is processed
+ this->m_numScoreComponents = atoi(param["num-features"].c_str());
m = param.find("config");
if (m != param.end())
read_config_file(m->second,param);
-
- bname = param["base"];
+
+ bname = param["base"];
L1 = param["L1"];
L2 = param["L2"];
- assert(bname.size());
- assert(L1.size());
- assert(L2.size());
-
- m = param.find("pfwd-denom");
- m_pfwd_denom = m != param.end() ? m->second[0] : 's';
-
- m = param.find("smooth");
- m_lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
- m = param.find("max-samples");
- m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
+ UTIL_THROW_IF2(bname.size() == 0, "Missing corpus base name at " << HERE);
+ UTIL_THROW_IF2(L1.size() == 0, "Missing L1 tag at " << HERE);
+ UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE);
- if ((m = param.find("logcnt-features")) != param.end())
- withLogCountFeatures = m->second != "0";
-
- if ((m = param.find("coh")) != param.end())
- withCoherence = m->second != "0";
-
- if ((m = param.find("pfwd")) != param.end())
- m_pfwd_features = (m->second == "0" ? "" : m->second);
-
- if (m_pfwd_features == "1") // legacy; deprecated
- m_pfwd_features[0] = m_pfwd_denom;
+ // set defaults for all parameters if not specified so far
+ // (map::insert keeps an existing entry and returns an iterator to it, so
+ // param.insert(dflt).first->second is the user's value if there is one
+ // and the default otherwise)
+ pair<string,string> dflt("input-factor","0");
+ input_factor = atoi(param.insert(dflt).first->second.c_str());
+ // shouldn't that be a string?
- if ((m = param.find("pbwd")) != param.end())
- m_pbwd_features = (m->second == "0" ? "" : m->second);
+ dflt = pair<string,string> ("smooth",".01");
+ m_lbop_conf = atof(param.insert(dflt).first->second.c_str());
- if (m_pbwd_features == "1")
- m_pbwd_features = "r"; // lecagy; deprecated
+ dflt = pair<string,string> ("lexalpha","0");
+ m_lex_alpha = atof(param.insert(dflt).first->second.c_str());
- if ((m = param.find("lexalpha")) != param.end())
- m_lex_alpha = atof(m->second.c_str());
+ dflt = pair<string,string> ("sample","1000");
+ m_default_sample_size = atoi(param.insert(dflt).first->second.c_str());
- m = param.find("workers");
- m_workers = m != param.end() ? atoi(m->second.c_str()) : 8;
+ dflt = pair<string,string>("workers","8");
+ m_workers = atoi(param.insert(dflt).first->second.c_str());
m_workers = min(m_workers,24UL);
- if ((m = param.find("limit")) != param.end())
- m_tableLimit = atoi(m->second.c_str());
+ dflt = pair<string,string>("limit","20");
+ m_tableLimit = atoi(param.insert(dflt).first->second.c_str());
- m = param.find("cache-size");
- m_history.reserve(m != param.end()?max(1000,atoi(m->second.c_str())):10000);
+ dflt = pair<string,string>("cache","10000");
+ size_t hsize = max(1000,atoi(param.insert(dflt).first->second.c_str()));
+ m_history.reserve(hsize);
// in plain language: cache size is at least 1000, and 10,000 by default
// this cache keeps track of the most frequently used target phrase collections
// even when not actively in use
-
- this->m_numScoreComponents = atoi(param["num-features"].c_str());
- m = param.find("ifactor");
- input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
+ // Feature functions are initialized in function Load();
+ // only the default specifications are recorded here
+ param.insert(pair<string,string>("pfwd", "g"));
+ param.insert(pair<string,string>("pbwd", "g"));
+ param.insert(pair<string,string>("logcnt", "0"));
+ param.insert(pair<string,string>("coh", "0"));
+ param.insert(pair<string,string>("rare", "1"));
+ param.insert(pair<string,string>("prov", "1"));
poolCounts = true;
if ((m = param.find("extra")) != param.end())
extra_data = m->second;
+ // check for unknown parameters
+ // NOTE: this list is searched with binary_search below and therefore
+ // must be kept sorted (plain string order, upper case before lower case)
+ vector<string> known_parameters; known_parameters.reserve(50);
+ known_parameters.push_back("L1");
+ known_parameters.push_back("L2");
+ known_parameters.push_back("Mmsapt");
+ known_parameters.push_back("base");
+ known_parameters.push_back("cache");
+ known_parameters.push_back("coh");
+ known_parameters.push_back("config");
+ known_parameters.push_back("extra");
+ known_parameters.push_back("input-factor");
+ known_parameters.push_back("lexalpha");
+ known_parameters.push_back("limit");
+ known_parameters.push_back("logcnt");
+ known_parameters.push_back("name");
+ known_parameters.push_back("num-features");
+ known_parameters.push_back("output-factor");
+ known_parameters.push_back("pbwd");
+ known_parameters.push_back("pfwd");
+ known_parameters.push_back("prov");
+ known_parameters.push_back("rare");
+ known_parameters.push_back("sample");
+ known_parameters.push_back("smooth");
+ known_parameters.push_back("unal");
+ known_parameters.push_back("workers");
+ for (map<string,string>::iterator m = param.begin(); m != param.end(); ++m)
+ {
+ UTIL_THROW_IF2(!binary_search(known_parameters.begin(),
+ known_parameters.end(), m->first),
+ HERE << ": Unknown parameter specification for Mmsapt: "
+ << m->first);
+ }
}
void
Mmsapt::
- load_extra_data(string bname)
+ load_extra_data(string bname, bool locking = true)
{
// TO DO: ADD CHECKS FOR ROBUSTNESS
// - file existence?
@@ -186,122 +260,120 @@ namespace Moses
while(getline(in2,line)) text2.push_back(line);
while(getline(ina,line)) symal.push_back(line);
- lock_guard<mutex> guard(this->lock);
+ boost::scoped_ptr<lock_guard<mutex> > guard;
+ if (locking) guard.reset(new lock_guard<mutex>(this->lock));
btdyn = btdyn->add(text1,text2,symal);
assert(btdyn);
// cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
}
- size_t
+ // Create and register feature function(s) of type fftype according to the
+ // specification stored under param[ffname]. Nothing happens if the
+ // specification is empty or "0". If registry is given, one instance is
+ // registered there. Otherwise a specification ending in '+' yields two
+ // instances (one in m_active_ff_fix, one in m_active_ff_dyn), and any
+ // other specification yields one instance in m_active_ff_common.
+ // Note: the lookup uses param[ffname], which adds an empty entry to param
+ // if ffname is absent.
+ template<typename fftype>
+ void
Mmsapt::
- add_corpus_specific_features
- (vector<sptr<pscorer > >& ffvec, size_t num_feats)
+ check_ff(string const ffname, vector<sptr<pscorer> >* registry)
{
- float const lbop = m_lbop_parameter; // just for code readability below
- // for the time being, we assume that all phrase probability features
- // use the same confidence parameter for lower-bound-estimation
- for (size_t i = 0; i < m_pfwd_features.size(); ++i)
- {
- UTIL_THROW_IF2(m_pfwd_features[i] != 'g' &&
- m_pfwd_features[i] != 'r' &&
- m_pfwd_features[i] != 's',
- "Can't handle pfwd feature type '"
- << m_pfwd_features[i] << "'.");
- sptr<PScorePfwd<Token> > ff(new PScorePfwd<Token>());
- size_t k = num_feats;
- num_feats = ff->init(num_feats,lbop,m_pfwd_features[i]);
- for (;k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
- ffvec.push_back(ff);
+ string const& spec = param[ffname];
+ if (spec == "" || spec == "0") return;
+ if (registry)
+ {
+ sptr<fftype> ff(new fftype(spec));
+ register_ff(ff, *registry);
}
-
- for (size_t i = 0; i < m_pbwd_features.size(); ++i)
- {
- UTIL_THROW_IF2(m_pbwd_features[i] != 'g' &&
- m_pbwd_features[i] != 'r' &&
- m_pbwd_features[i] != 's',
- "Can't handle pbwd feature type '"
- << m_pbwd_features[i] << "'.");
- sptr<PScorePbwd<Token> > ff(new PScorePbwd<Token>());
- size_t k = num_feats;
- num_feats = ff->init(num_feats,lbop,m_pbwd_features[i]);
- for (;k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
- ffvec.push_back(ff);
+ else if (spec[spec.size()-1] == '+') // corpus specific
+ {
+ // separate instances for the two corpora, registered one after the other
+ sptr<fftype> ff(new fftype(spec));
+ register_ff(ff, m_active_ff_fix);
+ ff.reset(new fftype(spec));
+ register_ff(ff, m_active_ff_dyn);
}
-
- // if (withPbwd)
- // {
- // sptr<PScorePbwd<Token> > ff(new PScorePbwd<Token>());
- // size_t k = num_feats;
- // num_feats = ff->init(num_feats,lbop);
- // for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
- // ffvec.push_back(ff);
- // }
-
- if (withLogCountFeatures)
+ else
{
- sptr<PScoreLogCounts<Token> > ff(new PScoreLogCounts<Token>());
- size_t k = num_feats;
- num_feats = ff->init(num_feats);
- for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
- ffvec.push_back(ff);
+ sptr<fftype> ff(new fftype(spec));
+ register_ff(ff, m_active_ff_common);
}
+ }
- return num_feats;
+ // Same as the two-argument check_ff, for feature functions whose
+ // constructor takes an additional float (xtra) before the specification
+ // string: nothing is registered for an empty or "0" specification; an
+ // explicit registry takes precedence; a trailing '+' registers separate
+ // instances in m_active_ff_fix and m_active_ff_dyn; everything else goes
+ // into m_active_ff_common.
+ template<typename fftype>
+ void
+ Mmsapt::
+ check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry)
+ {
+ string const& spec = param[ffname];
+ if (spec == "" || spec == "0") return;
+ if (registry)
+ {
+ sptr<fftype> ff(new fftype(xtra,spec));
+ register_ff(ff, *registry);
+ }
+ else if (spec[spec.size()-1] == '+') // corpus specific
+ {
+ sptr<fftype> ff(new fftype(xtra,spec));
+ register_ff(ff, m_active_ff_fix);
+ ff.reset(new fftype(xtra,spec));
+ register_ff(ff, m_active_ff_dyn);
+ }
+ else
+ {
+ sptr<fftype> ff(new fftype(xtra,spec));
+ register_ff(ff, m_active_ff_common);
+ }
}
+ // void
+ // Mmsapt::
+ // add_corpus_specific_features(vector<sptr<pscorer > >& registry)
+ // {
+ // check_ff<PScorePbwd<Token> >("pbwd",m_lbop_conf,registry);
+ // check_ff<PScoreLogCnt<Token> >("logcnt",registry);
+ // }
+
void
Mmsapt::
Load()
{
+ lock_guard<mutex> guard(this->lock);
+
+ // can load only once
+ // UTIL_THROW_IF2(shards.size(),"Mmsapt is already loaded at " << HERE);
+
+ // lexical scores
+ // The smoothing parameter is stored under "lexalpha": that is the key
+ // init() fills with a default and accepts as a known parameter. Looking
+ // up any other spelling yields an empty string and silently ignores the
+ // configured value.
+ string lexfile = bname + L1 + "-" + L2 + ".lex";
+ sptr<PScoreLex1<Token> > ff(new PScoreLex1<Token>(param["lexalpha"],lexfile));
+ register_ff(ff,m_active_ff_common);
+
+ // these are always computed on pooled data
+ check_ff<PScoreRareness<Token> > ("rare", &m_active_ff_common);
+ check_ff<PScoreUnaligned<Token> >("unal", &m_active_ff_common);
+ check_ff<PScoreCoherence<Token> >("coh", &m_active_ff_common);
+
+ // for these ones either way is possible (specification ends with '+'
+ // if corpus-specific
+ check_ff<PScorePfwd<Token> >("pfwd", m_lbop_conf);
+ check_ff<PScorePbwd<Token> >("pbwd", m_lbop_conf);
+ check_ff<PScoreLogCnt<Token> >("logcnt");
+
+ // These are always corpus-specific
+ check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_fix);
+ check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_dyn);
+
+ UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents,
+ "At " << HERE << ": number of feature values provided by "
+ << "Phrase table (" << this->m_feature_names.size()
+ << ") does not match number specified in Moses config file ("
+ << this->m_numScoreComponents << ")!\n";);
+
+ // Load corpora. For the time being, we can have one memory-mapped static
+ // corpus and one in-memory dynamic corpus
+ // sptr<mmbitext> btfix(new mmbitext());
btfix.num_workers = this->m_workers;
btfix.open(bname, L1, L2);
btfix.setDefaultSampleSize(m_default_sample_size);
+ // shards.push_back(btfix);
- size_t num_feats = 0;
-
- // lexical scores are currently always active
- sptr<PScoreLex<Token> > ff(new PScoreLex<Token>(m_lex_alpha));
- size_t k = num_feats;
- num_feats = ff->init(num_feats, bname + L1 + "-" + L2 + ".lex");
- for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
- m_active_ff_common.push_back(ff);
-
- if (withCoherence)
- {
- sptr<PScoreCoherence<Token> > ff(new PScoreCoherence<Token>());
- size_t k = num_feats;
- num_feats = ff->init(num_feats);
- for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
- m_active_ff_common.push_back(ff);
- }
-
- num_feats = add_corpus_specific_features(m_active_ff_fix,num_feats);
- // cerr << num_feats << "/" << this->m_numScoreComponents
- // << " at " << __FILE__ << ":" << __LINE__ << endl;
- poolCounts = poolCounts && num_feats == this->m_numScoreComponents;
- if (!poolCounts)
- num_feats = add_corpus_specific_features(m_active_ff_dyn, num_feats);
-
-#if 0
- cerr << "MMSAPT provides " << num_feats << " features at "
- << __FILE__ << ":" << __LINE__ << endl;
- BOOST_FOREACH(string const& fname, m_feature_names)
- cerr << fname << endl;
-#endif
- UTIL_THROW_IF2(num_feats != this->m_numScoreComponents,
- "At " << __FILE__ << ":" << __LINE__
- << ": number of feature values provided by Phrase table ("
- << num_feats << ") does not match number specified in "
- << "Moses config file (" << this->m_numScoreComponents
- << ")!\n";);
-
-
- btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2,m_default_sample_size));
+ btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size));
btdyn->num_workers = this->m_workers;
if (extra_data.size())
- {
- load_extra_data(extra_data);
- }
+ load_extra_data(extra_data,false);
#if 0
// currently not used
@@ -330,258 +402,345 @@ namespace Moses
TargetPhrase*
Mmsapt::
- createTargetPhrase(Phrase const& src,
- Bitext<Token> const& bt,
- PhrasePair const& pp) const
+ // Build a TargetPhrase for src from the phrase pair found in the static
+ // bitext (fix) and/or in the dynamic bitext (dyn); at least one of the
+ // two must be non-NULL. Corpus-specific scorers are applied to the phrase
+ // pair of their own corpus, common scorers to the pooled counts. The
+ // target words are read from fix if available, else from dyn. The result
+ // is allocated with new; presumably the caller takes ownership.
+ // NOTE(review): dynbt is dereferenced whenever fix is given without dyn,
+ // so it is assumed to be non-null - confirm against the callers.
+ mkTPhrase(Phrase const& src,
+ PhrasePair<Token>* fix,
+ PhrasePair<Token>* dyn,
+ sptr<Bitext<Token> > const& dynbt) const
{
- Word w; uint32_t sid,off,len;
+ UTIL_THROW_IF2(!fix && !dyn, HERE <<
+ ": Can't create target phrase from nothing.");
+ vector<float> fvals(this->m_numScoreComponents);
+ PhrasePair<Token> pool = fix ? *fix : *dyn;
+ if (fix)
+ {
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ (*ff)(btfix, *fix, &fvals);
+ }
+ if (dyn)
+ {
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+ (*ff)(*dynbt, *dyn, &fvals);
+ }
+
+ if (fix && dyn) { pool += *dyn; }
+ else if (fix)
+ {
+ // Pair seen only in the static bitext: the dynamic side contributes
+ // at most an approximate count of the target phrase, looked up in
+ // dynbt->I2. Scorers that allow pooling see the pooled counts, all
+ // others see the (nearly) empty pair.
+ PhrasePair<Token> zilch; zilch.init();
+ TSA<Token>::tree_iterator m(dynbt->I2.get(), fix->start2, fix->len2);
+ if (m.size() == fix->len2)
+ zilch.raw2 = m.approxOccurrenceCount();
+ pool += zilch;
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+ (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+ }
+ else if (dyn)
+ {
+ // Mirror image: pair seen only in the dynamic bitext.
+ // NOTE(review): the target phrase is looked up in btfix.I2, but the
+ // fix scorers below are called with *dynbt, whereas the branch above
+ // hands each scorer set the bitext of its own corpus. Confirm whether
+ // btfix was intended here.
+ PhrasePair<Token> zilch; zilch.init();
+ TSA<Token>::tree_iterator m(btfix.I2.get(), dyn->start2, dyn->len2);
+ if (m.size() == dyn->len2)
+ zilch.raw2 = m.approxOccurrenceCount();
+ pool += zilch;
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+ }
+ // common scorers always work on the pooled counts
+ if (fix)
+ {
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ (*ff)(btfix, pool, &fvals);
+ }
+ else
+ {
+ BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ (*ff)(*dynbt, pool, &fvals);
+ }
TargetPhrase* tp = new TargetPhrase();
- parse_pid(pp.p2, sid, off, len);
- Token const* x = bt.T2->sntStart(sid) + off;
- for (uint32_t k = 0; k < len; ++k)
+ // word strings are always looked up in btfix.V2, also for dyn tokens
+ Token const* x = fix ? fix->start2 : dyn->start2;
+ uint32_t len = fix ? fix->len2 : dyn->len2;
+ for (uint32_t k = 0; k < len; ++k, x = x->next())
{
- // cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl;
- StringPiece wrd = (*bt.V2)[x[k].id()];
- // if ((off+len) > bt.T2->sntLen(sid))
- // cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl;
- assert(off+len <= bt.T2->sntLen(sid));
- w.CreateFromString(Output,ofactor,wrd,false);
+ StringPiece wrd = (*(btfix.V2))[x->id()];
+ Word w; w.CreateFromString(Output,ofactor,wrd,false);
tp->AddWord(w);
}
- tp->GetScoreBreakdown().Assign(this, pp.fvals);
+ tp->GetScoreBreakdown().Assign(this, fvals);
tp->Evaluate(src);
return tp;
}
- // process phrase stats from a single parallel corpus
- void
- Mmsapt::
- process_pstats
- (Phrase const& src,
- uint64_t const pid1,
- pstats const& stats,
- Bitext<Token> const & bt,
- TargetPhraseCollection* tpcoll
- ) const
- {
- PhrasePair pp;
- pp.init(pid1, stats, this->m_numScoreComponents);
- pstats::trg_map_t::const_iterator t;
- for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
- {
- pp.update(t->first,t->second);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
- (*ff)(bt,pp);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
- (*ff)(bt,pp);
- tpcoll->Add(createTargetPhrase(src,bt,pp));
- }
- }
+ // TargetPhrase*
+ // Mmsapt::
+ // mkTPhrase(Phrase const& src,
+ // Bitext<Token> const& bt,
+ // PhrasePair const& pp) const
+ // {
+ // Word w; uint32_t sid,off,len;
+ // TargetPhrase* tp = new TargetPhrase();
+ // parse_pid(pp.p2, sid, off, len);
+ // Token const* x = bt.T2->sntStart(sid) + off;
+ // for (uint32_t k = 0; k < len; ++k)
+ // {
+ // // cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl;
+ // StringPiece wrd = (*bt.V2)[x[k].id()];
+ // // if ((off+len) > bt.T2->sntLen(sid))
+ // // cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl;
+ // assert(off+len <= bt.T2->sntLen(sid));
+ // w.CreateFromString(Output,ofactor,wrd,false);
+ // tp->AddWord(w);
+ // }
+ // tp->GetScoreBreakdown().Assign(this, pp.fvals);
+ // tp->Evaluate(src);
+ // return tp;
+ // }
+
+ // // process phrase stats from a single parallel corpus
+ // void
+ // Mmsapt::
+ // process_pstats
+ // (Phrase const& src,
+ // uint64_t const pid1,
+ // pstats const& stats,
+ // Bitext<Token> const & bt,
+ // TargetPhraseCollection* tpcoll
+ // ) const
+ // {
+ // PhrasePair pp;
+ // pp.init(pid1, stats, this->m_numScoreComponents);
+ // pstats::trg_map_t::const_iterator t;
+ // for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
+ // {
+ // pp.update(t->first,t->second);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ // (*ff)(bt,pp);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ // (*ff)(bt,pp);
+ // tpcoll->Add(mkTPhrase(src,bt,pp));
+ // }
+ // }
+
+ // void
+ // Mmsapt::
+ // ScorePPfix(PhrasePair& pp) const
+ // {
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ // (*ff)(btfix,pp);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ // (*ff)(btfix,pp);
+ // }
+
+// // process phrase stats from a single parallel corpus
+// bool
+// Mmsapt::
+// pool_pstats(Phrase const& src,
+// uint64_t const pid1a,
+// pstats * statsa,
+// Bitext<Token> const & bta,
+// uint64_t const pid1b,
+// pstats const* statsb,
+// Bitext<Token> const & btb,
+// TargetPhraseCollection* tpcoll) const
+// {
+// PhrasePair pp;
+// if (statsa && statsb)
+// pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
+// else if (statsa)
+// pp.init(pid1a, *statsa, this->m_numScoreComponents);
+// else if (statsb)
+// pp.init(pid1b, *statsb, this->m_numScoreComponents);
+// else return false; // throw "no stats for pooling available!";
+
+// pstats::trg_map_t::const_iterator b;
+// pstats::trg_map_t::iterator a;
+// if (statsb)
+// {
+// for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
+// {
+// uint32_t sid,off,len;
+// parse_pid(b->first, sid, off, len);
+// Token const* x = btb.T2->sntStart(sid) + off;
+// TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
+// if (m.size() == len)
+// {
+// ;
+// if (statsa && ((a = statsa->trg.find(m.getPid()))
+// != statsa->trg.end()))
+// {
+// pp.update(b->first,a->second,b->second);
+// a->second.invalidate();
+// }
+// else
+// pp.update(b->first,m.approxOccurrenceCount(),
+// b->second);
+// }
+// else pp.update(b->first,b->second);
+// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+// (*ff)(btb,pp);
+// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+// (*ff)(btb,pp);
+// tpcoll->Add(mkTPhrase(src,btb,pp));
+// }
+// }
+// if (!statsa) return statsb != NULL;
+// for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
+// {
+// uint32_t sid,off,len;
+// if (!a->second.valid()) continue;
+// parse_pid(a->first, sid, off, len);
+// if (btb.T2)
+// {
+// Token const* x = bta.T2->sntStart(sid) + off;
+// TSA<Token>::tree_iterator m(btb.I2.get(), x, len);
+// if (m.size() == len)
+// pp.update(a->first,m.approxOccurrenceCount(),a->second);
+// else
+// pp.update(a->first,a->second);
+// }
+// else pp.update(a->first,a->second);
+// #if 0
+// // jstats const& j = a->second;
+// cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
+// << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl;
+// cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " "
+// << pp.joint << " " << pp.raw2 << endl;
+// #endif
+
+// UTIL_THROW_IF2(pp.raw2 == 0,
+// "OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
+// << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": "
+// << pp.raw1 << " " << pp.sample1 << " "
+// << pp.good1 << " " << pp.joint << " "
+// << pp.raw2);
+// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+// (*ff)(bta,pp);
+// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+// (*ff)(bta,pp);
+// tpcoll->Add(mkTPhrase(src,bta,pp));
+// }
+// return true;
+// }
- void
- Mmsapt::
- ScorePPfix(bitext::PhrasePair& pp) const
- {
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
- (*ff)(btfix,pp);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
- (*ff)(btfix,pp);
- }
- // Pool the phrase stats of two bitexts (bta/statsa and btb/statsb), score each resulting phrase pair, and add it to tpcoll.
- bool // false only when neither statsa nor statsb is given, true otherwise
- Mmsapt::
- pool_pstats(Phrase const& src, // source phrase, handed on to createTargetPhrase
- uint64_t const pid1a, // source phrase id used when only statsa is available
- pstats * statsa, // stats from bta, may be NULL; non-const because matched entries get invalidated below
- Bitext<Token> const & bta,
- uint64_t const pid1b, // source phrase id used whenever statsb is available
- pstats const* statsb, // stats from btb, may be NULL
- Bitext<Token> const & btb,
- TargetPhraseCollection* tpcoll) const // receives one target phrase per processed entry
- {
- PhrasePair pp;
- if (statsa && statsb) // initialise pp from whichever stats are present
- pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
- else if (statsa)
- pp.init(pid1a, *statsa, this->m_numScoreComponents);
- else if (statsb)
- pp.init(pid1b, *statsb, this->m_numScoreComponents);
- else return false; // throw "no stats for pooling available!";
-
- pstats::trg_map_t::const_iterator b;
- pstats::trg_map_t::iterator a;
- if (statsb) // pass 1: every target entry recorded in statsb
- {
- for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
- {
- uint32_t sid,off,len;
- parse_pid(b->first, sid, off, len); // split the target key into sid / off / len
- Token const* x = bta.T2->sntStart(sid) + off; // NOTE(review): b->first comes from statsb, yet it is resolved against bta.T2 here - confirm that btb.T2 was not intended
- TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len); // look the token span up in bta's I2 index
- if (m.size() == len) // iterator covers all len tokens, i.e. presumably the span occurs in bta
- {
- ;
- if (statsa && ((a = statsa->trg.find(m.getPid())) // is the same target entry present in statsa?
- != statsa->trg.end()))
- {
- pp.update(b->first,a->second,b->second); // update from both entries
- a->second.invalidate(); // mark the statsa entry so that pass 2 skips it
- }
- else
- pp.update(b->first,m.approxOccurrenceCount(), // no entry in statsa: pass the approximate occurrence count from bta's index instead
- b->second);
- }
- else pp.update(b->first,b->second); // span not matched in bta: update from the statsb entry alone
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
- (*ff)(btb,pp);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
- (*ff)(btb,pp);
- tpcoll->Add(createTargetPhrase(src,btb,pp));
- }
- }
- if (!statsa) return statsb != NULL; // nothing left to do without statsa; statsb is non-NULL here, otherwise we returned false above
- for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) // pass 2: entries of statsa not consumed in pass 1
- {
- uint32_t sid,off,len;
- if (!a->second.valid()) continue; // invalidated in pass 1, already added
- parse_pid(a->first, sid, off, len);
- if (btb.T2) // note: the guard tests btb.T2 while the lookup below uses btb.I2
- {
- Token const* x = bta.T2->sntStart(sid) + off;
- TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len); // look the bta token span up in btb's I2 index
- if (m.size() == len)
- pp.update(a->first,m.approxOccurrenceCount(),a->second);
- else
- pp.update(a->first,a->second);
- }
- else
- pp.update(a->first,a->second);
-#if 0
- // jstats const& j = a->second;
- cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
- << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl;
- cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " "
- << pp.joint << " " << pp.raw2 << endl;
-#endif
- UTIL_THROW_IF2(pp.raw2 == 0,
- "OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: "
- << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": "
- << pp.raw1 << " " << pp.sample1 << " "
- << pp.good1 << " " << pp.joint << " "
- << pp.raw2);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
- (*ff)(bta,pp);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
- (*ff)(bta,pp);
- tpcoll->Add(createTargetPhrase(src,bta,pp));
- }
- return true;
- }
-
- // process phrase stats from a single parallel corpus
- bool
- Mmsapt::
- combine_pstats
- (Phrase const& src,
- uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta,
- uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
- TargetPhraseCollection* tpcoll) const
- {
- PhrasePair ppfix,ppdyn,pool;
- // ppfix: counts from btfix
- // ppdyn: counts from btdyn
- // pool: pooled counts from both
- Word w;
- if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
- if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
- pstats::trg_map_t::const_iterator b;
- pstats::trg_map_t::iterator a;
-
- if (statsb)
- {
- pool.init(pid1b,*statsb,0);
- for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
- {
- ppdyn.update(b->first,b->second);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
- (*ff)(btb,ppdyn);
+ // // process phrase stats from a single parallel corpus
+ // bool
+ // Mmsapt::
+ // combine_pstats
+ // (Phrase const& src,
+ // uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta,
+ // uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb,
+ // TargetPhraseCollection* tpcoll) const
+ // {
+ // if (!statsa && !statsb) return false;
+
+ // PhrasePair ppfix,ppdyn,pool; Word w;
+ // // ppfix: counts from btfix
+ // // ppdyn: counts from btdyn
+ // // pool: pooled counts from both
+
+ // pstats::trg_map_t::const_iterator b;
+ // pstats::trg_map_t::iterator a;
+
+
+ // set<uint64_t> check;
+ // if (statsb)
+ // {
+ // ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
+ // if (statsa)
+ // {
+ // pool.init(pid1b, *statsa, *statsb, 0);
+ // ppfix.init(pid1a,*statsa, 0);
+ // }
+ // else
+ // {
+ // pool.init(pid1b, *statsb,0);
+ // ppfix.init();
+ // }
+
+ // for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
+ // {
+ // ppdyn.update(b->first,b->second);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+ // (*ff)(btb,ppdyn);
- uint32_t sid,off,len;
- parse_pid(b->first, sid, off, len);
- Token const* x = bta.T2->sntStart(sid) + off;
- TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
+ // uint32_t sid,off,len;
+ // parse_pid(b->first, sid, off, len);
+ // Token const* x = btb.T2->sntStart(sid) + off;
+ // TSA<Token>::tree_iterator m(bta.I2.get(),x,len);
- if (m.size() && statsa &&
- ((a = statsa->trg.find(m.getPid())) != statsa->trg.end()))
- {
- // phrase pair found also in btfix
- ppfix.update(a->first,a->second);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
- (*ff)(bta,ppfix,&ppdyn.fvals);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
- (*ff)(bta,ppfix,&ppdyn.fvals);
- a->second.invalidate();
- }
- else
- {
- // phrase pair was not found in btfix
-
- // ... but the source phrase was
- if (m.size())
- pool.update(b->first,m.approxOccurrenceCount(), b->second);
-
- // ... and not even the source phrase
- else
- pool.update(b->first,b->second);
-
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
- (*ff)(btb,pool,&ppdyn.fvals);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
- (*ff)(btb,pool,&ppdyn.fvals);
-
- }
-
- tpcoll->Add(createTargetPhrase(src,btb,ppdyn));
- }
- }
-
- // now deal with all phraise pairs that are ONLY in btfix
- // (the ones that are in both were dealt with above)
- if (statsa)
- {
- pool.init(pid1a,*statsa,0);
- for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
- {
- if (!a->second.valid()) continue; // done above
- ppfix.update(a->first,a->second);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
- (*ff)(bta,ppfix);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
- (*ff)(bta,ppfix);
+ // Token const* y = m.getToken(0);
+ // for (size_t i = 0; i < len; ++i)
+ // cout << x[i].id() << " " << endl;
+ // for (size_t i = 0; i < m.size(); ++i)
+ // cout << y[i].id() << " " << endl;
- if (btb.I2)
- {
- uint32_t sid,off,len;
- parse_pid(a->first, sid, off, len);
- Token const* x = bta.T2->sntStart(sid) + off;
- TSA<Token>::tree_iterator m(btb.I2.get(),x,x+len);
- if (m.size())
- pool.update(a->first,m.approxOccurrenceCount(),a->second);
- else
- pool.update(a->first,a->second);
- }
- else pool.update(a->first,a->second);
- BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
- (*ff)(btb,pool,&ppfix.fvals);
- if (ppfix.p2)
- tpcoll->Add(createTargetPhrase(src,bta,ppfix));
- }
- }
- return (statsa || statsb);
- }
+ // if (statsa && m.size() &&
+ // ((a = statsa->trg.find(m.getPid())) != statsa->trg.end()))
+ // { // i.e., phrase pair found also in btfix
+ // ppfix.update(a->first,a->second);
+ // pool.update(b->first, b->second, a->second);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ // (*ff)(bta, ppfix, &ppdyn.fvals);
+ // check.insert(a->first);
+ // }
+ // else // phrase pair was not found in btfix
+ // {
+ // if (m.size()) // ... but the source phrase was
+ // {
+ // pool.update(b->first, m.approxOccurrenceCount(), b->second);
+ // ppfix.update(b->first,m.approxOccurrenceCount());
+ // }
+ // else // ... and not even the source phrase
+ // {
+ // pool.update(b->first, b->second);
+ // ppfix.update(b->first,0);
+ // }
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ // (*ff)(btb, ff->allowPooling() ? pool : ppfix, &ppdyn.fvals);
+ // }
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ // (*ff)(btb, pool, &ppdyn.fvals);
+ // tpcoll->Add(mkTPhrase(src,btb,ppdyn));
+ // }
+ // }
+
+ // // now deal with all phrase pairs that are ONLY in btfix
+ // // (the ones that are in both were dealt with above)
+ // if (statsa)
+ // {
+ // ppfix.init(pid1a, *statsa, this->m_numScoreComponents);
+ // pool.init(pid1a, *statsa, 0);
+ // ppdyn.init();
+ // for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
+ // {
+ // if (check.find(a->first) != check.end())
+ // continue;
+
+ // ppfix.update(a->first, a->second);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+ // (*ff)(bta, ppfix);
+
+ // if (btb.I2)
+ // {
+ // uint32_t sid,off,len;
+ // parse_pid(a->first, sid, off, len);
+ // Token const* x = bta.T2->sntStart(sid) + off;
+ // TSA<Token>::tree_iterator m(btb.I2.get(), x, len);
+ // if (m.size())
+ // pool.update(a->first, m.approxOccurrenceCount(), a->second);
+ // else
+ // pool.update(a->first, a->second);
+ // }
+ // else pool.update(a->first, a->second);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+ // (*ff)(btb, ff->allowPooling() ? pool : ppdyn, &ppfix.fvals);
+ // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+ // (*ff)(bta, pool, &ppfix.fvals);
+ // if (ppfix.p2)
+ // tpcoll->Add(mkTPhrase(src, bta, ppfix));
+ // }
+ // }
+ // return true;
+ // }
Mmsapt::
TargetPhraseCollectionWrapper::
@@ -595,8 +754,34 @@ namespace Moses
{
assert(this->refCount == 0);
}
-
+ template<typename Token> // Expand the target entries of /ps/ into PhrasePair objects appended to /dest/, then sort /dest/.
+ void
+ expand(typename Bitext<Token>::iter const& m, // iterator for the source phrase; supplies its first token and its length
+ Bitext<Token> const& bt, // bitext whose T2 track the target ids in /ps/ are resolved against
+ pstats const& ps, vector<PhrasePair<Token> >& dest) // ps: stats to expand; dest: output vector
+ {
+ dest.reserve(ps.trg.size()); // one PhrasePair will be pushed per entry of ps.trg
+ PhrasePair<Token> pp;
+ pp.init(m.getToken(0), m.size(), &ps, 0); // source side is set once; meaning of the trailing 0 is not visible here - TODO confirm
+ // cout << HERE << " " << toString(*(bt.V1),pp.start1,pp.len1) << endl;
+ pstats::trg_map_t::const_iterator a;
+ for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
+ {
+ uint32_t sid,off,len;
+ parse_pid(a->first, sid, off, len); // split the target key into sid / off / len
+ pp.update(bt.T2->sntStart(sid)+off, len, a->second); // target side: len tokens starting at offset off of sentence sid in T2
+ dest.push_back(pp); // store a copy; pp itself is reused for the next entry
+ }
+ typename PhrasePair<Token>::SortByTargetIdSeq sorter;
+ sort(dest.begin(), dest.end(),sorter); // order dest with SortByTargetIdSeq
+#if 0
+ BOOST_FOREACH(PhrasePair<Token> const& p, dest)
+ cout << toString (*bt.V1,p.start1,p.len1) << " ::: "
+ << toString (*bt.V2,p.start2,p.len2) << " "
+ << p.joint << endl;
+#endif
+ }
// This is not the most efficient way of phrase lookup!
TargetPhraseCollection const*
@@ -605,13 +790,9 @@ namespace Moses
{
// map from Moses Phrase to internal id sequence
vector<id_type> sphrase;
- fillIdSeq(src,input_factor,*btfix.V1,sphrase);
+ fillIdSeq(src,input_factor,*(btfix.V1),sphrase);
if (sphrase.size() == 0) return NULL;
- // lookup in static bitext
- TSA<Token>::tree_iterator mfix(btfix.I1.get(),&sphrase[0],sphrase.size());
-
- // lookup in dynamic bitext
// Reserve a local copy of the dynamic bitext in its current form. /btdyn/
// is set to a new copy of the dynamic bitext every time a sentence pair
// is added. /dyn/ keeps the old bitext around as long as we need it.
@@ -621,12 +802,13 @@ namespace Moses
dyn = btdyn;
}
assert(dyn);
+
+ // lookup phrases in both bitexts
+ TSA<Token>::tree_iterator mfix(btfix.I1.get(), &sphrase[0], sphrase.size());
TSA<Token>::tree_iterator mdyn(dyn->I1.get());
if (dyn->I1.get())
- {
- for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
- mdyn.extend(sphrase[i]);
- }
+ for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
+ mdyn.extend(sphrase[i]);
#if 0
cerr << src << endl;
@@ -634,43 +816,62 @@ namespace Moses
<< mdyn.size() << " " << mdyn.getPid() << endl;
#endif
- // phrase not found in either
- if (mdyn.size() != sphrase.size() &&
- mfix.size() != sphrase.size())
- return NULL; // not found
+ if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size())
+ return NULL; // phrase not found in either bitext
// cache lookup:
-
- uint64_t phrasekey;
- if (mfix.size() == sphrase.size())
- phrasekey = (mfix.getPid()<<1);
- else
- phrasekey = (mdyn.getPid()<<1)+1;
-
+ uint64_t phrasekey = (mfix.size() == sphrase.size() ? (mfix.getPid()<<1)
+ : (mdyn.getPid()<<1)+1);
size_t revision = dyn->revision();
{
boost::lock_guard<boost::mutex> guard(this->lock);
tpc_cache_t::iterator c = m_cache.find(phrasekey);
+ // TO DO: we should revise the revision mechanism: we take the length
+ // of the dynamic bitext (in sentences) at the time the PT entry
+ // was stored as the time stamp. For each word in the
+ // vocabulary, we also store its most recent occurrence in the
+ // bitext. Only if the timestamp of each word in the phrase is
+ // newer than the timestamp of the phrase itself we must update
+ // the entry.
if (c != m_cache.end() && c->second->revision == revision)
return encache(c->second);
}
- // not found or not up to date
+ // OK: pt entry not found or not up to date
+ // lookup and expansion could be done in parallel threads,
+ // but ppdyn is probably small anyway
+ // TO DO: have Bitexts return lists of PhrasePairs instead of pstats
+ // no need to expand pstats at every single lookup again, especially
+ // for btfix.
sptr<pstats> sfix,sdyn;
- if (mfix.size() == sphrase.size())
- sfix = btfix.lookup(mfix);
- if (mdyn.size() == sphrase.size())
- sdyn = dyn->lookup(mdyn);
+ if (mfix.size() == sphrase.size()) sfix = btfix.lookup(mfix);
+ if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn);
+
+ vector<PhrasePair<Token> > ppfix,ppdyn;
+ if (sfix) expand(mfix, btfix, *sfix, ppfix);
+ if (sdyn) expand(mdyn, *dyn, *sdyn, ppdyn);
- TargetPhraseCollectionWrapper*
- ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
- if ((poolCounts &&
- pool_pstats(src, mfix.getPid(),sfix.get(),btfix,
- mdyn.getPid(),sdyn.get(),*dyn,ret))
- || combine_pstats(src, mfix.getPid(),sfix.get(),btfix,
- mdyn.getPid(),sdyn.get(),*dyn,ret))
+ // now we have two lists of Phrase Pairs, let's merge them
+ TargetPhraseCollectionWrapper* ret;
+ ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
+ PhrasePair<Token>::SortByTargetIdSeq sorter;
+ size_t i = 0; size_t k = 0;
+ while (i < ppfix.size() && k < ppdyn.size())
+ {
+ int cmp = sorter.cmp(ppfix[i], ppdyn[k]);
+ if (cmp < 0) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn));
+ else if (cmp == 0) ret->Add(mkTPhrase(src,&ppfix[i++],&ppdyn[k++],dyn));
+ else ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn));
+ }
+ while (i < ppfix.size()) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn));
+ while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn));
+ if (m_tableLimit) ret->Prune(true, m_tableLimit);
+ else ret->Prune(true,ret->GetSize());
+#if 0
+ if (combine_pstats(src,
+ mfix.getPid(), sfix.get(), btfix,
+ mdyn.getPid(), sdyn.get(), *dyn, ret))
{
- if (m_tableLimit) ret->Prune(true,m_tableLimit);
#if 0
sort(ret->begin(), ret->end(), CompareTargetPhrase());
cout << "SOURCE PHRASE: " << src << endl;
@@ -686,6 +887,9 @@ namespace Moses
}
#endif
}
+#endif
+
+ // put the result in the cache and return
boost::lock_guard<boost::mutex> guard(this->lock);
m_cache[phrasekey] = ret;
return encache(ret);
@@ -839,6 +1043,7 @@ namespace Moses
TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size());
if (mfix.size() == myphrase.size())
{
+ btfix.prep(mfix);
// cerr << phrase << " " << mfix.approxOccurrenceCount() << endl;
return true;
}
@@ -854,6 +1059,7 @@ namespace Moses
{
for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
mdyn.extend(myphrase[i]);
+ if (mdyn.size() == myphrase.size()) dyn->prep(mdyn);
}
return mdyn.size() == myphrase.size();
}
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index b6be36131..a7ece8fdb 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -19,6 +19,7 @@
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
#include "moses/InputFileStream.h"
@@ -29,7 +30,8 @@
#include <map>
#include "moses/TranslationModel/PhraseDictionary.h"
-#include "mmsapt_phrase_scorers.h"
+#include "mmsapt_phrase_scorers.h" // deprecated
+#include "sapt_phrase_scorers.h"
// TO DO:
// - make lexical phrase scorer take addition to the "dynamic overlay" into account
@@ -47,47 +49,68 @@ namespace Moses
#endif
{
friend class Alignment;
+ map<string,string> param;
public:
typedef L2R_Token<SimpleWordId> Token;
typedef mmBitext<Token> mmbitext;
typedef imBitext<Token> imbitext;
+ typedef Bitext<Token> bitext;
typedef TSA<Token> tsa;
typedef PhraseScorer<Token> pscorer;
+
private:
+ // vector<sptr<bitext> > shards;
mmbitext btfix;
- sptr<imbitext> btdyn;
+ sptr<imbitext> btdyn;
string bname,extra_data;
string L1;
string L2;
- float m_lbop_parameter;
- float m_lex_alpha;
+ float m_lbop_conf; // confidence level for lbop smoothing
+ float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing
// alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
// must be > 0 if dynamic
size_t m_default_sample_size;
size_t m_workers; // number of worker threads for sampling the bitexts
- // deprecated!
- char m_pfwd_denom; // denominator for computation of fwd phrase score:
- // 'r' - divide by raw count
- // 's' - divide by sample count
- // 'g' - devide by number of "good" (i.e. coherent) samples
- // size_t num_features;
+ // // deprecated!
+ // char m_pfwd_denom; // denominator for computation of fwd phrase score:
+ // // 'r' - divide by raw count
+ // // 's' - divide by sample count
+ // // 'g' - divide by number of "good" (i.e. coherent) samples
+ // // size_t num_features;
size_t input_factor;
size_t output_factor; // we can actually return entire Tokens!
- bool withLogCountFeatures; // add logs of counts as features?
- bool withCoherence;
- string m_pfwd_features; // which pfwd functions to use
- string m_pbwd_features; // which pbwd functions to use
+ // bool withLogCountFeatures; // add logs of counts as features?
+ // bool withCoherence;
+ // string m_pfwd_features; // which pfwd functions to use
+ // string m_pbwd_features; // which pbwd functions to use
+
+ // for display for human inspection (ttable dumps):
vector<string> m_feature_names; // names of features activated
+ vector<bool> m_is_logval; // keeps track of which features are log valued
+ vector<bool> m_is_integer; // keeps track of which features are integer valued
+
vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn)
- size_t
- add_corpus_specific_features
- (vector<sptr<pscorer > >& ffvec, size_t num_feats);
+ void
+ register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry);
+
+ template<typename fftype>
+ void
+ check_ff(string const ffname,vector<sptr<pscorer> >* registry = NULL);
+ // add feature function if specified
+
+ template<typename fftype>
+ void
+ check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry = NULL);
+ // add feature function if specified
+
+ void
+ add_corpus_specific_features(vector<sptr<pscorer > >& ffvec);
// built-in feature functions
// PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
@@ -140,12 +163,24 @@ namespace Moses
mm2dtable_t COOCraw;
TargetPhrase*
- createTargetPhrase
+ mkTPhrase(Phrase const& src,
+ Moses::bitext::PhrasePair<Token>* fix,
+ Moses::bitext::PhrasePair<Token>* dyn,
+ sptr<Bitext<Token> > const& dynbt) const;
+
+ // template<typename Token>
+ // void
+ // expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt,
+ // pstats const& pstats, vector<PhrasePair<Token> >& dest);
+
+#if 0
+ TargetPhrase*
+ mkTPhrase
(Phrase const& src,
Bitext<Token> const& bt,
- bitext::PhrasePair const& pp
+ Moses::bitext::PhrasePair const& pp
) const;
-
+#endif
void
process_pstats
(Phrase const& src,
@@ -180,7 +215,7 @@ namespace Moses
) const;
void
- load_extra_data(string bname);
+ load_extra_data(string bname, bool locking);
mutable size_t m_tpc_ctr;
public:
@@ -231,8 +266,14 @@ namespace Moses
vector<string> const&
GetFeatureNames() const;
- void
- ScorePPfix(bitext::PhrasePair& pp) const;
+ // void
+ // ScorePPfix(bitext::PhrasePair& pp) const;
+
+ bool
+ isLogVal(int i) const;
+
+ bool
+ isInteger(int i) const;
private:
};
diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc
index 407df648d..8b6bf1eb2 100644
--- a/moses/TranslationModel/UG/mmsapt_align.cc
+++ b/moses/TranslationModel/UG/mmsapt_align.cc
@@ -1,335 +1,336 @@
#include "mmsapt.h"
+// currently broken
-namespace Moses
-{
- using namespace bitext;
- using namespace std;
- using namespace boost;
+// namespace Moses
+// {
+// using namespace bitext;
+// using namespace std;
+// using namespace boost;
- struct PPgreater
- {
- bool operator()(PhrasePair const& a, PhrasePair const& b)
- {
- return a.score > b.score;
- }
- };
+// struct PPgreater
+// {
+// bool operator()(PhrasePair const& a, PhrasePair const& b)
+// {
+// return a.score > b.score;
+// }
+// };
- void
- Mmsapt::
- setWeights(vector<float> const & w)
- {
- assert(w.size() == this->m_numScoreComponents);
- this->feature_weights = w;
- }
+// void
+// Mmsapt::
+// setWeights(vector<float> const & w)
+// {
+// assert(w.size() == this->m_numScoreComponents);
+// this->feature_weights = w;
+// }
- struct PhraseAlnHyp
- {
- PhrasePair pp;
- ushort s1,e1,s2,e2; // start and end positions
- int prev; // preceding alignment hypothesis
- float score;
- bitvector scov; // source coverage
- PhraseAlnHyp(PhrasePair const& ppx, int slen,
- pair<uint32_t,uint32_t> const& sspan,
- pair<uint32_t,uint32_t> const& tspan)
- : pp(ppx), prev(-1), score(ppx.score), scov(slen)
- {
- s1 = sspan.first; e1 = sspan.second;
- s2 = tspan.first; e2 = tspan.second;
- for (size_t i = s1; i < e1; ++i)
- scov.set(i);
- }
+// struct PhraseAlnHyp
+// {
+// PhrasePair pp;
+// ushort s1,e1,s2,e2; // start and end positions
+// int prev; // preceding alignment hypothesis
+// float score;
+// bitvector scov; // source coverage
+// PhraseAlnHyp(PhrasePair const& ppx, int slen,
+// pair<uint32_t,uint32_t> const& sspan,
+// pair<uint32_t,uint32_t> const& tspan)
+// : pp(ppx), prev(-1), score(ppx.score), scov(slen)
+// {
+// s1 = sspan.first; e1 = sspan.second;
+// s2 = tspan.first; e2 = tspan.second;
+// for (size_t i = s1; i < e1; ++i)
+// scov.set(i);
+// }
- bool operator<(PhraseAlnHyp const& other) const
- {
- return this->score < other.score;
- }
+// bool operator<(PhraseAlnHyp const& other) const
+// {
+// return this->score < other.score;
+// }
- bool operator>(PhraseAlnHyp const& other) const
- {
- return this->score > other.score;
- }
+// bool operator>(PhraseAlnHyp const& other) const
+// {
+// return this->score > other.score;
+// }
- PhraseOrientation
- po_bwd(PhraseAlnHyp const* prev) const
- {
- if (s2 == 0) return po_first;
- assert(prev);
- assert(prev->e2 <= s2);
- if (prev->e2 < s2) return po_other;
- if (prev->e1 == s1) return po_mono;
- if (prev->e1 < s1) return po_jfwd;
- if (prev->s1 == e1) return po_swap;
- if (prev->s1 > e1) return po_jbwd;
- return po_other;
- }
+// PhraseOrientation
+// po_bwd(PhraseAlnHyp const* prev) const
+// {
+// if (s2 == 0) return po_first;
+// assert(prev);
+// assert(prev->e2 <= s2);
+// if (prev->e2 < s2) return po_other;
+// if (prev->e1 == s1) return po_mono;
+// if (prev->e1 < s1) return po_jfwd;
+// if (prev->s1 == e1) return po_swap;
+// if (prev->s1 > e1) return po_jbwd;
+// return po_other;
+// }
- PhraseOrientation
- po_fwd(PhraseAlnHyp const* next) const
- {
- if (!next) return po_last;
- assert(next->s2 >= e2);
- if (next->s2 < e2) return po_other;
- if (next->e1 == s1) return po_swap;
- if (next->e1 < s1) return po_jbwd;
- if (next->s1 == e1) return po_mono;
- if (next->s1 > e1) return po_jfwd;
- return po_other;
- }
+// PhraseOrientation
+// po_fwd(PhraseAlnHyp const* next) const
+// {
+// if (!next) return po_last;
+// assert(next->s2 >= e2);
+// if (next->s2 < e2) return po_other;
+// if (next->e1 == s1) return po_swap;
+// if (next->e1 < s1) return po_jbwd;
+// if (next->s1 == e1) return po_mono;
+// if (next->s1 > e1) return po_jfwd;
+// return po_other;
+// }
- float
- dprob_fwd(PhraseAlnHyp const& next)
- {
- return pp.dfwd[po_fwd(&next)];
- }
+// float
+// dprob_fwd(PhraseAlnHyp const& next)
+// {
+// return pp.dfwd[po_fwd(&next)];
+// }
- float
- dprob_bwd(PhraseAlnHyp const& prev)
- {
- return pp.dbwd[po_bwd(&prev)];
- }
+// float
+// dprob_bwd(PhraseAlnHyp const& prev)
+// {
+// return pp.dbwd[po_bwd(&prev)];
+// }
- };
+// };
- class Alignment
- {
- typedef L2R_Token<SimpleWordId> Token;
- typedef TSA<Token> tsa;
- typedef pair<uint32_t, uint32_t> span;
- typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
- typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
- typedef pstats::trg_map_t jStatsTable;
+// class Alignment
+// {
+// typedef L2R_Token<SimpleWordId> Token;
+// typedef TSA<Token> tsa;
+// typedef pair<uint32_t, uint32_t> span;
+// typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
+// typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
+// typedef pstats::trg_map_t jStatsTable;
- Mmsapt const& PT;
- vector<id_type> s,t;
- pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
- pid2span_t spid2span,tpid2span;
- vector<vector<sptr<pstats> > > spstats;
+// Mmsapt const& PT;
+// vector<id_type> s,t;
+// pidmap_t sspan2pid, tspan2pid; // span -> phrase ID
+// pid2span_t spid2span,tpid2span;
+// vector<vector<sptr<pstats> > > spstats;
- vector<PhrasePair> PP;
- // position-independent phrase pair info
- public:
- vector<PhraseAlnHyp> PAH;
- vector<vector<int> > tpos2ahyp;
- // maps from target start positions to PhraseAlnHyps starting at
- // that position
+// vector<PhrasePair> PP;
+// // position-independent phrase pair info
+// public:
+// vector<PhraseAlnHyp> PAH;
+// vector<vector<int> > tpos2ahyp;
+// // maps from target start positions to PhraseAlnHyps starting at
+// // that position
- sptr<pstats> getPstats(span const& sspan);
- void fill_tspan_maps();
- void fill_sspan_maps();
- public:
- Alignment(Mmsapt const& pt, string const& src, string const& trg);
- void show(ostream& out);
- void show(ostream& out, PhraseAlnHyp const& ah);
- };
+// sptr<pstats> getPstats(span const& sspan);
+// void fill_tspan_maps();
+// void fill_sspan_maps();
+// public:
+// Alignment(Mmsapt const& pt, string const& src, string const& trg);
+// void show(ostream& out);
+// void show(ostream& out, PhraseAlnHyp const& ah);
+// };
- void
- Alignment::
- show(ostream& out, PhraseAlnHyp const& ah)
- {
-#if 0
- LexicalPhraseScorer2<Token>::table_t const&
- COOCjnt = PT.calc_lex.scorer.COOC;
+// void
+// Alignment::
+// show(ostream& out, PhraseAlnHyp const& ah)
+// {
+// #if 0
+// LexicalPhraseScorer2<Token>::table_t const&
+// COOCjnt = PT.calc_lex.scorer.COOC;
- out << setw(10) << exp(ah.score) << " "
- << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
- << " <=> "
- << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
- vector<uchar> const& a = ah.pp.aln;
- // BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
- for (size_t u = 0; u+1 < a.size(); u += 2)
- out << " " << int(a[u+1]) << "-" << int(a[u]);
+// out << setw(10) << exp(ah.score) << " "
+// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2)
+// << " <=> "
+// << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
+// vector<uchar> const& a = ah.pp.aln;
+// // BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
+// for (size_t u = 0; u+1 < a.size(); u += 2)
+// out << " " << int(a[u+1]) << "-" << int(a[u]);
- if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
- out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
- << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
- << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
- out << endl;
- // float const* ofwdj = ah.pp.dfwd;
- // float const* obwdj = ah.pp.dbwd;
- // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
- // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
- // out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
- // << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
- // << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
- // << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
- // << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
- // << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
- // << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
- // << "]" << endl
- // << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
- // << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
- // << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
- // << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
- // << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
- // << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
- // << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
- // << "]" << endl;
-#endif
- }
+// if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
+// out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
+// << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
+// << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
+// out << endl;
+// // float const* ofwdj = ah.pp.dfwd;
+// // float const* obwdj = ah.pp.dbwd;
+// // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
+// // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
+// // out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
+// // << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
+// // << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
+// // << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
+// // << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
+// // << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
+// // << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
+// // << "]" << endl
+// // << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
+// // << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
+// // << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
+// // << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
+// // << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
+// // << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
+// // << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
+// // << "]" << endl;
+// #endif
+// }
- void
- Alignment::
- show(ostream& out)
- {
- // show what we have so far ...
- for (size_t s2 = 0; s2 < t.size(); ++s2)
- {
- VectorIndexSorter<PhraseAlnHyp> foo(PAH);
- sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
- for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
- show(out,PAH[tpos2ahyp[s2][h]]);
- }
- }
+// void
+// Alignment::
+// show(ostream& out)
+// {
+// // show what we have so far ...
+// for (size_t s2 = 0; s2 < t.size(); ++s2)
+// {
+// VectorIndexSorter<PhraseAlnHyp> foo(PAH);
+// sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
+// for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
+// show(out,PAH[tpos2ahyp[s2][h]]);
+// }
+// }
- sptr<pstats>
- Alignment::
- getPstats(span const& sspan)
- {
- size_t k = sspan.second - sspan.first - 1;
- if (k < spstats[sspan.first].size())
- return spstats[sspan.first][k];
- else return sptr<pstats>();
- }
+// sptr<pstats>
+// Alignment::
+// getPstats(span const& sspan)
+// {
+// size_t k = sspan.second - sspan.first - 1;
+// if (k < spstats[sspan.first].size())
+// return spstats[sspan.first][k];
+// else return sptr<pstats>();
+// }
- void
- Alignment::
- fill_tspan_maps()
- {
- tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
- for (size_t i = 0; i < t.size(); ++i)
- {
- tsa::tree_iterator m(PT.btfix.I2.get());
- for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
- {
- uint64_t pid = m.getPid();
- tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
- tspan2pid[i][k] = pid;
- }
- }
- }
+// void
+// Alignment::
+// fill_tspan_maps()
+// {
+// tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
+// for (size_t i = 0; i < t.size(); ++i)
+// {
+// tsa::tree_iterator m(PT.btfix.I2.get());
+// for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
+// {
+// uint64_t pid = m.getPid();
+// tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
+// tspan2pid[i][k] = pid;
+// }
+// }
+// }
- void
- Alignment::
- fill_sspan_maps()
- {
- sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
- spstats.resize(s.size());
- for (size_t i = 0; i < s.size(); ++i)
- {
- tsa::tree_iterator m(PT.btfix.I1.get());
- for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
- {
- uint64_t pid = m.getPid();
- sspan2pid[i][k] = pid;
- pid2span_t::iterator p = spid2span.find(pid);
- if (p != spid2span.end())
- {
- int x = p->second[0].first;
- int y = p->second[0].second-1;
- spstats[i].push_back(spstats[x][y-x]);
- }
- else
- {
- spstats[i].push_back(PT.btfix.lookup(m));
- cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
- << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
- << endl;
- }
- spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
- }
- }
- }
+// void
+// Alignment::
+// fill_sspan_maps()
+// {
+// sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
+// spstats.resize(s.size());
+// for (size_t i = 0; i < s.size(); ++i)
+// {
+// tsa::tree_iterator m(PT.btfix.I1.get());
+// for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
+// {
+// uint64_t pid = m.getPid();
+// sspan2pid[i][k] = pid;
+// pid2span_t::iterator p = spid2span.find(pid);
+// if (p != spid2span.end())
+// {
+// int x = p->second[0].first;
+// int y = p->second[0].second-1;
+// spstats[i].push_back(spstats[x][y-x]);
+// }
+// else
+// {
+// spstats[i].push_back(PT.btfix.lookup(m));
+// cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
+// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt
+// << endl;
+// }
+// spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
+// }
+// }
+// }
- Alignment::
- Alignment(Mmsapt const& pt, string const& src, string const& trg)
- : PT(pt)
- {
- PT.btfix.V1->fillIdSeq(src,s);
- PT.btfix.V2->fillIdSeq(trg,t);
+// Alignment::
+// Alignment(Mmsapt const& pt, string const& src, string const& trg)
+// : PT(pt)
+// {
+// PT.btfix.V1->fillIdSeq(src,s);
+// PT.btfix.V2->fillIdSeq(trg,t);
- // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
- // BOOST_FOREACH(id_type i, t)
- // {
- // cout << (*PT.btfix.V2)[i];
- // if (i < PT.wlex21.size())
- // {
- // BOOST_FOREACH(id_type k, PT.wlex21[i])
- // {
- // size_t j = COOC[k][i];
- // size_t m1 = COOC.m1(k);
- // size_t m2 = COOC.m2(i);
- // if (j*1000 > m1 && j*1000 > m2)
- // cout << " " << (*PT.btfix.V1)[k];
- // }
- // }
- // cout << endl;
- // }
+// // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
+// // BOOST_FOREACH(id_type i, t)
+// // {
+// // cout << (*PT.btfix.V2)[i];
+// // if (i < PT.wlex21.size())
+// // {
+// // BOOST_FOREACH(id_type k, PT.wlex21[i])
+// // {
+// // size_t j = COOC[k][i];
+// // size_t m1 = COOC.m1(k);
+// // size_t m2 = COOC.m2(i);
+// // if (j*1000 > m1 && j*1000 > m2)
+// // cout << " " << (*PT.btfix.V1)[k];
+// // }
+// // }
+// // cout << endl;
+// // }
- fill_tspan_maps();
- fill_sspan_maps();
- tpos2ahyp.resize(t.size());
- // now fill the association score table
- PAH.reserve(1000000);
- typedef pid2span_t::iterator psiter;
- for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
- {
- if (!L->second.size()) continue; // should never happen anyway
- int i = L->second[0].first;
- int k = L->second[0].second - i -1;
- sptr<pstats> ps = spstats[i][k];
- PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
- jStatsTable & J = ps->trg;
- for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
- {
- psiter R = tpid2span.find(y->first);
- if (R == tpid2span.end()) continue;
- pp.update(y->first, y->second);
- PT.ScorePPfix(pp);
- pp.eval(PT.feature_weights);
- PP.push_back(pp);
- BOOST_FOREACH(span const& sspan, L->second)
- {
- BOOST_FOREACH(span const& tspan, R->second)
- {
- tpos2ahyp[tspan.first].push_back(PAH.size());
- PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
- }
- }
- }
- }
- }
+// fill_tspan_maps();
+// fill_sspan_maps();
+// tpos2ahyp.resize(t.size());
+// // now fill the association score table
+// PAH.reserve(1000000);
+// typedef pid2span_t::iterator psiter;
+// for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
+// {
+// if (!L->second.size()) continue; // should never happen anyway
+// int i = L->second[0].first;
+// int k = L->second[0].second - i -1;
+// sptr<pstats> ps = spstats[i][k];
+// PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
+// jStatsTable & J = ps->trg;
+// for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
+// {
+// psiter R = tpid2span.find(y->first);
+// if (R == tpid2span.end()) continue;
+// pp.update(y->first, y->second);
+// PT.ScorePPfix(pp);
+// pp.eval(PT.feature_weights);
+// PP.push_back(pp);
+// BOOST_FOREACH(span const& sspan, L->second)
+// {
+// BOOST_FOREACH(span const& tspan, R->second)
+// {
+// tpos2ahyp[tspan.first].push_back(PAH.size());
+// PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
+// }
+// }
+// }
+// }
+// }
- int
- extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
- {
- if ((PAH[edge].scov & PAH[next].scov).count())
- return -1;
- int ret = PAH.size();
- PAH.push_back(PAH[next]);
- PhraseAlnHyp & h = PAH.back();
- h.prev = edge;
- h.scov |= PAH[edge].scov;
- h.score += log(PAH[edge].dprob_fwd(PAH[next]));
- h.score += log(PAH[next].dprob_bwd(PAH[edge]));
- return ret;
- }
+// int
+// extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
+// {
+// if ((PAH[edge].scov & PAH[next].scov).count())
+// return -1;
+// int ret = PAH.size();
+// PAH.push_back(PAH[next]);
+// PhraseAlnHyp & h = PAH.back();
+// h.prev = edge;
+// h.scov |= PAH[edge].scov;
+// h.score += log(PAH[edge].dprob_fwd(PAH[next]));
+// h.score += log(PAH[next].dprob_bwd(PAH[edge]));
+// return ret;
+// }
- sptr<vector<int> >
- Mmsapt::
- align(string const& src, string const& trg) const
- {
- // For the time being, we consult only the fixed bitext.
- // We might also consider the dynamic bitext. => TO DO.
- Alignment A(*this,src,trg);
- VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
- vector<size_t> o; foo.GetOrder(o);
- BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
- sptr<vector<int> > aln;
- return aln;
-}
-}
+// sptr<vector<int> >
+// Mmsapt::
+// align(string const& src, string const& trg) const
+// {
+// // For the time being, we consult only the fixed bitext.
+// // We might also consider the dynamic bitext. => TO DO.
+// Alignment A(*this,src,trg);
+// VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
+// vector<size_t> o; foo.GetOrder(o);
+// BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
+// sptr<vector<int> > aln;
+// return aln;
+// }
+// }
diff --git a/moses/TranslationModel/UG/mmsapt_phrase_scorers.h b/moses/TranslationModel/UG/mmsapt_phrase_scorers.h
index 6e852b44b..083afb3a3 100644
--- a/moses/TranslationModel/UG/mmsapt_phrase_scorers.h
+++ b/moses/TranslationModel/UG/mmsapt_phrase_scorers.h
@@ -1,268 +1,17 @@
// -*- c++ -*-
+// written by Ulrich Germann
#pragma once
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "sapt_pscore_base.h"
+
+// DEPRECATED CODE: Word and phrase penalties are now
+// added by the decoder.
namespace Moses {
namespace bitext
{
-
- template<typename Token>
- class
- PhraseScorer
- {
- protected:
- int m_index;
- int m_num_feats;
- vector<string> m_feature_names;
- public:
-
- virtual
- void
- operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest=NULL)
- const = 0;
-
- int
- fcnt() const
- { return m_num_feats; }
-
- vector<string> const &
- fnames() const
- { return m_feature_names; }
-
- string const &
- fname(int i) const
- {
- UTIL_THROW_IF2((i < m_index || i >= m_index + m_num_feats),
- "Feature name index out of range at "
- << __FILE__ << ":" << __LINE__);
- return m_feature_names.at(i - m_index);
- }
-
- int
- getIndex() const
- { return m_index; }
- };
-
- ////////////////////////////////////////////////////////////////////////////////
-
- template<typename Token>
- class
- PScorePfwd : public PhraseScorer<Token>
- {
- float conf;
- char denom;
- public:
- PScorePfwd()
- {
- this->m_num_feats = 1;
- }
-
- int
- init(int const i, float const c, char d)
- {
- conf = c;
- denom = d;
- this->m_index = i;
- ostringstream buf;
- buf << format("pfwd-%c%.3f") % denom % c;
- this->m_feature_names.push_back(buf.str());
- return i + this->m_num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair & pp,
- vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- if (pp.joint > pp.good1)
- {
- cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
- cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
- }
- switch (denom)
- {
- case 'g':
- (*dest)[this->m_index] = log(lbop(pp.good1, pp.joint, conf));
- break;
- case 's':
- (*dest)[this->m_index] = log(lbop(pp.sample1, pp.joint, conf));
- break;
- case 'r':
- (*dest)[this->m_index] = log(lbop(pp.raw1, pp.joint, conf));
- }
- }
- };
-
- ////////////////////////////////////////////////////////////////////////////////
-
- template<typename Token>
- class
- PScorePbwd : public PhraseScorer<Token>
- {
- float conf;
- char denom;
- public:
- PScorePbwd()
- {
- this->m_num_feats = 1;
- }
-
- int
- init(int const i, float const c, char d)
- {
- conf = c;
- denom = d;
- this->m_index = i;
- ostringstream buf;
- buf << format("pbwd-%c%.3f") % denom % c;
- this->m_feature_names.push_back(buf.str());
- return i + this->m_num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp,
- vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- // we use the denominator specification to scale the raw counts on the
- // target side; the clean way would be to counter-sample
- uint32_t r2 = pp.raw2;
- if (denom == 'g') r2 = round(r2 * float(pp.good1) / pp.raw1);
- else if (denom == 's') r2 = round(r2 * float(pp.sample1) / pp.raw1);
- (*dest)[this->m_index] = log(lbop(max(r2, pp.joint),pp.joint,conf));
- }
- };
-
- ////////////////////////////////////////////////////////////////////////////////
-
- template<typename Token>
- class
- PScoreCoherence : public PhraseScorer<Token>
- {
- public:
- PScoreCoherence()
- {
- this->m_num_feats = 1;
- }
-
- int
- init(int const i)
- {
- this->m_index = i;
- this->m_feature_names.push_back(string("coherence"));
- return i + this->m_num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp,
- vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- (*dest)[this->m_index] = log(pp.good1) - log(pp.sample1);
- }
- };
-
- ////////////////////////////////////////////////////////////////////////////////
-
- template<typename Token>
- class
- PScoreLogCounts : public PhraseScorer<Token>
- {
- float conf;
- public:
- PScoreLogCounts()
- {
- this->m_num_feats = 5;
- }
-
- int
- init(int const i)
- {
- this->m_index = i;
- this->m_feature_names.push_back("log-r1");
- this->m_feature_names.push_back("log-s1");
- this->m_feature_names.push_back("log-g1");
- this->m_feature_names.push_back("log-j");
- this->m_feature_names.push_back("log-r2");
- return i + this->m_num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp,
- vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- size_t i = this->m_index;
- assert(pp.raw1);
- assert(pp.sample1);
- assert(pp.good1);
- assert(pp.joint);
- assert(pp.raw2);
- (*dest)[i] = -log(pp.raw1);
- (*dest)[++i] = -log(pp.sample1);
- (*dest)[++i] = -log(pp.good1);
- (*dest)[++i] = +log(pp.joint);
- (*dest)[++i] = -log(pp.raw2);
- }
- };
-
- template<typename Token>
- class
- PScoreLex : public PhraseScorer<Token>
- {
- float const m_alpha;
- public:
- LexicalPhraseScorer2<Token> scorer;
-
- PScoreLex(float const a)
- : m_alpha(a)
- { this->m_num_feats = 2; }
-
- int
- init(int const i, string const& fname)
- {
- scorer.open(fname);
- this->m_index = i;
- this->m_feature_names.push_back("lexfwd");
- this->m_feature_names.push_back("lexbwd");
- return i + this->m_num_feats;
- }
-
- void
- operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
- {
- if (!dest) dest = &pp.fvals;
- uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
- parse_pid(pp.p1, sid1, off1, len1);
- parse_pid(pp.p2, sid2, off2, len2);
-
-#if 0
- cout << len1 << " " << len2 << endl;
- Token const* t1 = bt.T1->sntStart(sid1);
- for (size_t i = off1; i < off1 + len1; ++i)
- cout << (*bt.V1)[t1[i].id()] << " ";
- cout << __FILE__ << ":" << __LINE__ << endl;
-
- Token const* t2 = bt.T2->sntStart(sid2);
- for (size_t i = off2; i < off2 + len2; ++i)
- cout << (*bt.V2)[t2[i].id()] << " ";
- cout << __FILE__ << ":" << __LINE__ << endl;
-
- BOOST_FOREACH (int a, pp.aln)
- cout << a << " " ;
- cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
-
-#endif
- scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
- bt.T2->sntStart(sid2)+off2,0,len2,
- pp.aln, m_alpha,
- (*dest)[this->m_index],
- (*dest)[this->m_index+1]);
- }
-
- };
-
/// Word penalty
template<typename Token>
class
@@ -280,7 +29,8 @@ namespace Moses {
}
void
- operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
+ operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
uint32_t sid2=0,off2=0,len2=0;
@@ -307,7 +57,8 @@ namespace Moses {
}
void
- operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
+ operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->m_index] = 1;
diff --git a/moses/TranslationModel/UG/ptable-lookup.cc b/moses/TranslationModel/UG/ptable-lookup.cc
index 106505f05..2cbf89b16 100644
--- a/moses/TranslationModel/UG/ptable-lookup.cc
+++ b/moses/TranslationModel/UG/ptable-lookup.cc
@@ -106,15 +106,11 @@ int main(int argc, char* argv[])
cout << " ";
for (size_t k = idx.first; k < idx.second; ++k)
{
- if (mmsapt && fname[k-idx.first].substr(0,3) == "log")
- {
- if(scores[k] < 0)
- cout << " " << format("%10d") % round(exp(-scores[k]));
- else
- cout << " " << format("%10d") % round(exp(scores[k]));
- }
- else
- cout << " " << format("%10.8f") % exp(scores[k]);
+ size_t j = k-idx.first;
+ float f = (mmsapt ? mmsapt->isLogVal(j) ? exp(scores[k]) : scores[k]
+ : scores[k] < 0 ? exp(scores[k]) : scores[k]);
+ string fmt = (mmsapt && mmsapt->isInteger(j)) ? "%10d" : "%10.8f";
+ cout << " " << format(fmt) % f;
}
cout << endl;
}
diff --git a/moses/TranslationModel/UG/sapt_phrase_key.h b/moses/TranslationModel/UG/sapt_phrase_key.h
new file mode 100644
index 000000000..e1ecf1573
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_phrase_key.h
@@ -0,0 +1,13 @@
+//-*- c++ -*-
+#pragma once
+#include <stdint.h>
+
+using namespace std;
+namespace sapt
+{
+ using namespace Moses;
+ using namespace std;
+
+
+
+}
diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h
new file mode 100644
index 000000000..37cfd26fd
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h
@@ -0,0 +1,12 @@
+// -*- c++ -*-
+// Phrase scoring functions for suffix array-based phrase tables
+// written by Ulrich Germann
+#pragma once
+#include "sapt_pscore_unaligned.h" // count # of unaligned words
+#include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus
+#include "sapt_pscore_rareness.h" // penalty for rare occurrences (global?)
+#include "sapt_pscore_logcnt.h" // logs of observed counts
+#include "sapt_pscore_lex1.h" // plain vanilla Moses lexical scores
+#include "sapt_pscore_pfwd.h" // fwd phrase prob
+#include "sapt_pscore_pbwd.h" // bwd phrase prob
+#include "sapt_pscore_coherence.h" // coherence feature: good/sample-size
diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h
new file mode 100644
index 000000000..68a491145
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_base.h
@@ -0,0 +1,103 @@
+// -*- c++ -*-
+// Base classes for suffix array-based phrase scorers
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+
+ // abstract base class that defines the common API for phrase scorers
+ template<typename Token>
+ class
+ PhraseScorer
+ {
+ protected:
+ int m_index;
+ int m_num_feats;
+ string m_tag;
+ vector<string> m_feature_names;
+ public:
+
+ virtual
+ void
+ operator()(Bitext<Token> const& pt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest=NULL)
+ const = 0;
+
+ void
+ setIndex(int const i) { m_index = i; }
+
+ int
+ getIndex() const { return m_index; }
+
+ int
+ fcnt() const { return m_num_feats; }
+
+ vector<string> const &
+ fnames() const { return m_feature_names; }
+
+ string const &
+ fname(int i) const
+ {
+ if (i < 0) i += m_num_feats;
+ UTIL_THROW_IF2(i < 0 || i >= m_num_feats,
+ "Feature name index out of range at " << HERE);
+ return m_feature_names.at(i);
+ }
+
+ virtual
+ bool
+ isLogVal(int i) const { return true; };
+ // is this feature log valued?
+
+ virtual
+ bool
+ isIntegerValued(int i) const { return false; };
+ // is this feature integer valued (e.g., count features)?
+
+ virtual
+ bool
+ allowPooling() const { return true; }
+ // does this feature function allow pooling of counts if
+ // there are no occurrences in the respective corpus?
+
+ };
+
+ // base class for 'families' of phrase scorers that have a single real-valued parameter
+ template<typename Token>
+ class
+ SingleRealValuedParameterPhraseScorerFamily
+ : public PhraseScorer<Token>
+ {
+ protected:
+ vector<float> m_x;
+
+ virtual
+ void
+ init(string const specs)
+ {
+ using namespace boost;
+ UTIL_THROW_IF2(this->m_tag.size() == 0,
+ "m_tag must be initialized in constructor");
+ UTIL_THROW_IF2(specs.size() == 0,"empty specification string!");
+ UTIL_THROW_IF2(this->m_feature_names.size(),
+ "PhraseScorer can only be initialized once!");
+ this->m_index = -1;
+ float x; char c;
+ for (istringstream buf(specs); buf>>x; buf>>c)
+ {
+ this->m_x.push_back(x);
+ string fname = (format("%s-%.2f") % this->m_tag % x).str();
+ this->m_feature_names.push_back(fname);
+ }
+ this->m_num_feats = this->m_x.size();
+ }
+ };
+ } // namespace bitext
+} // namespace moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h
new file mode 100644
index 000000000..a3211df54
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h
@@ -0,0 +1,33 @@
+// -*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScoreCoherence : public PhraseScorer<Token>
+ {
+ public:
+ PScoreCoherence(string const dummy)
+ {
+ this->m_index = -1;
+ this->m_num_feats = 1;
+ this->m_feature_names.push_back(string("coherence"));
+ }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ (*dest)[this->m_index] = log(pp.good1) - log(pp.sample1);
+ }
+ };
+ }
+}
diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h
new file mode 100644
index 000000000..be994b0d3
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h
@@ -0,0 +1,70 @@
+// -*- c++ -*-
+// Phrase scorer for forward and backward lexical translation scores (lexfwd, lexbwd)
+// written by Ulrich Germann
+
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScoreLex1 : public PhraseScorer<Token>
+ {
+ float m_alpha;
+ public:
+ LexicalPhraseScorer2<Token> scorer;
+
+ PScoreLex1(string const& alpaspec, string const& lexfile)
+ {
+ this->m_index = -1;
+ this->m_num_feats = 2;
+ this->m_feature_names.reserve(2);
+ this->m_feature_names.push_back("lexfwd");
+ this->m_feature_names.push_back("lexbwd");
+ m_alpha = atof(alpaspec.c_str());
+ scorer.open(lexfile);
+ }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
+ // parse_pid(pp.p1, sid1, off1, len1);
+ // parse_pid(pp.p2, sid2, off2, len2);
+#if 0
+ cout << len1 << " " << len2 << endl;
+ Token const* t1 = bt.T1->sntStart(sid1);
+ for (size_t i = off1; i < off1 + len1; ++i)
+ cout << (*bt.V1)[t1[i].id()] << " ";
+ cout << __FILE__ << ":" << __LINE__ << endl;
+
+ Token const* t2 = bt.T2->sntStart(sid2);
+ for (size_t i = off2; i < off2 + len2; ++i)
+ cout << (*bt.V2)[t2[i].id()] << " ";
+ cout << __FILE__ << ":" << __LINE__ << endl;
+
+ BOOST_FOREACH (int a, pp.aln)
+ cout << a << " " ;
+ cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
+
+ scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
+ bt.T2->sntStart(sid2)+off2,0,len2,
+ pp.aln, m_alpha,
+ (*dest)[this->m_index],
+ (*dest)[this->m_index+1]);
+#endif
+ scorer.score(pp.start1,0, pp.len1,
+ pp.start2,0, pp.len2, pp.aln, m_alpha,
+ (*dest)[this->m_index],
+ (*dest)[this->m_index+1]);
+ }
+ };
+ } //namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
new file mode 100644
index 000000000..2790323ed
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
@@ -0,0 +1,65 @@
+// -*- c++ -*-
+// Phrase scorer that provides the logs of observed counts as features:
+// raw source phrase counts (r1), L1 sample size (s1), coherent phrases (g1),
+// joint counts (j) and raw target phrase counts (r2), as selected by the spec
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+ namespace bitext {
+
+ template<typename Token>
+ class
+ PScoreLogCnt : public PhraseScorer<Token>
+ {
+ string m_specs;
+ public:
+ PScoreLogCnt(string const specs)
+ {
+ this->m_index = -1;
+ this->m_specs = specs;
+ if (specs.find("r1") != string::npos) // raw source phrase counts
+ this->m_feature_names.push_back("log-r1");
+ if (specs.find("s1") != string::npos)
+ this->m_feature_names.push_back("log-s1"); // L1 sample size
+ if (specs.find("g1") != string::npos) // coherent phrases
+ this->m_feature_names.push_back("log-g1");
+ if (specs.find("j") != string::npos) // joint counts
+ this->m_feature_names.push_back("log-j");
+ if (specs.find("r2") != string::npos) // raw target phrase counts
+ this->m_feature_names.push_back("log-r2");
+ this->m_num_feats = this->m_feature_names.size();
+ }
+
+ bool
+ isIntegerValued(int i) const { return true; }
+
+ // Store log(count) for each count named in m_specs in consecutive
+ // slots of dest (pp.fvals when dest is NULL), starting at m_index.
+ void
+ operator()(Bitext<Token> const& bt,
+            PhrasePair<Token>& pp,
+            vector<float> * dest = NULL) const
+ {
+   if (!dest) dest = &pp.fvals;
+   assert(pp.raw1); assert(pp.sample1); assert(pp.good1);
+   assert(pp.joint); assert(pp.raw2);
+   size_t i = this->m_index;
+   if (m_specs.find("r1") != string::npos)
+     (*dest)[i++] = log(pp.raw1);
+   if (m_specs.find("s1") != string::npos)
+     (*dest)[i++] = log(pp.sample1);
+   if (m_specs.find("g1") != string::npos)
+     (*dest)[i++] = log(pp.good1);
+   if (m_specs.find("j") != string::npos)
+     (*dest)[i++] = log(pp.joint);
+   // post-increment like the slots above; ++i skipped a slot and wrote one past it
+   if (m_specs.find("r2") != string::npos)
+     (*dest)[i++] = log(pp.raw2);
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
new file mode 100644
index 000000000..f7b4686d7
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
@@ -0,0 +1,58 @@
+//-*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "boost/foreach.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScorePbwd : public PhraseScorer<Token>
+ {
+ float conf;
+ string denom;
+
+ public:
+ PScorePbwd(float const c, string d)
+ {
+ this->m_index = -1;
+ conf = c;
+ denom = d;
+ size_t checksum = d.size();
+ BOOST_FOREACH(char const& x, denom)
+ {
+ if (x == '+') { --checksum; continue; }
+ if (x != 'g' && x != 's' && x != 'r') continue;
+ string s = (format("pbwd-%c%.3f") % x % c).str();
+ this->m_feature_names.push_back(s);
+ }
+ this->m_num_feats = this->m_feature_names.size();
+ UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
+ "Unknown parameter in specification '"
+ << d << "' for Pbwd phrase scorer at " << HERE);
+ }
+
+ // Store one pbwd score per denominator code in denom, from dest[m_index] on.
+ void
+ operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp,
+            vector<float> * dest = NULL) const
+ {
+   if (!dest) dest = &pp.fvals; // default target: the pair's own feature vector
+   // denom scales the raw target-side count; the clean way would be to counter-sample
+   size_t i = this->m_index;
+   BOOST_FOREACH(char const& x, denom)
+     {
+       if (x == '+') continue; // separator only; the constructor registers no feature for it
+       uint32_t m2 = pp.raw2;
+       if (x == 'g') m2 = round(m2 * float(pp.good1) / pp.raw1);
+       else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1);
+       (*dest)[i++] = log(lbop(max(m2, pp.joint),pp.joint,conf));
+     }
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
new file mode 100644
index 000000000..ed48a93d2
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
@@ -0,0 +1,70 @@
+// -*- c++ -*-
+// written by Ulrich Germann
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "boost/foreach.hpp"
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScorePfwd : public PhraseScorer<Token>
+ {
+ float conf;
+ string denom;
+
+ public:
+
+ PScorePfwd(float const c, string d)
+ {
+ this->m_index = -1;
+ conf = c;
+ denom = d;
+ size_t checksum = d.size();
+ BOOST_FOREACH(char const& x, denom)
+ {
+ if (x == '+') { --checksum; continue; }
+ if (x != 'g' && x != 's' && x != 'r') continue;
+ string s = (format("pfwd-%c%.3f") % x % c).str();
+ this->m_feature_names.push_back(s);
+ }
+ this->m_num_feats = this->m_feature_names.size();
+ UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
+ "Unknown parameter in specification '"
+ << d << "' for Pfwd phrase scorer at " << HERE);
+ }
+
+ void
+ operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ if (pp.joint > pp.good1)
+ {
+ pp.joint = pp.good1;
+ // cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
+ // cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
+ }
+ size_t i = this->m_index;
+ BOOST_FOREACH(char const& c, this->denom)
+ {
+ switch (c)
+ {
+ case 'g':
+ (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf));
+ break;
+ case 's':
+ (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf));
+ break;
+ case 'r':
+ (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf));
+ }
+ }
+ }
+ };
+ }
+}
+
diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h
new file mode 100644
index 000000000..c33b98fe7
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h
@@ -0,0 +1,47 @@
+// -*- c++ -*-
+// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
+// with the asymptotic function j/(j+x) where x > 0 is a function
+// parameter that determines the steepness of the rewards curve
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+ namespace bitext {
+
+ // asymptotic provenance feature n/(n+x)
+ template<typename Token>
+ class
+ PScoreProvenance : public SingleRealValuedParameterPhraseScorerFamily<Token>
+ {
+ public:
+
+ PScoreProvenance(string const& spec)
+ {
+ this->m_tag = "prov";
+ this->init(spec);
+ }
+
+ bool
+ isLogVal(int i) const { return false; }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ size_t i = this->m_index;
+ BOOST_FOREACH(float const x, this->m_x)
+ (*dest).at(i++) = pp.joint/(x + pp.joint);
+ }
+
+ bool
+ allowPooling() const
+ { return false; }
+
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h
new file mode 100644
index 000000000..58f204c88
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h
@@ -0,0 +1,41 @@
+// -*- c++ -*-
+// Phrase scorer that penalizes rare phrase pairs in a bitext
+// with the asymptotic function x/(j+x) where x > 0 is a function
+// parameter that determines the steepness of the penalty curve
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+ namespace bitext {
+
+ // rareness penalty: x/(n+x)
+ template<typename Token>
+ class
+ PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily<Token>
+ {
+ public:
+ PScoreRareness(string const spec)
+ {
+ this->m_tag = "rare";
+ this->init(spec);
+ }
+
+ bool
+ isLogVal(int i) const { return false; }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ size_t i = this->m_index;
+ BOOST_FOREACH(float const x, this->m_x)
+ (*dest).at(i++) = x/(x + pp.joint);
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
new file mode 100644
index 000000000..bdd2919b4
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
@@ -0,0 +1,67 @@
+// -*- c++ -*-
+// Phrase scorer that counts the number of unaligned words in the phrase
+// written by Ulrich Germann
+
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+namespace Moses {
+ namespace bitext
+ {
+ template<typename Token>
+ class
+ PScoreUnaligned : public PhraseScorer<Token>
+ {
+ typedef boost::dynamic_bitset<uint64_t> bitvector;
+ public:
+ PScoreUnaligned(string const spec)
+ {
+ this->m_index = -1;
+ int f = this->m_num_feats = atoi(spec.c_str());
+ UTIL_THROW_IF2(f != 1 && f != 2,"unal parameter must be 1 or 2 at "<<HERE);
+ this->m_feature_names.resize(f);
+ if (f == 1)
+ this->m_feature_names[0] = "unal";
+ else
+ {
+ this->m_feature_names[0] = "unal-s";
+ this->m_feature_names[1] = "unal-t";
+ }
+ }
+
+ bool
+ isLogVal(int i) const { return false; }
+
+ bool
+ isIntegerValued(int i) const { return true; }
+
+ void
+ operator()(Bitext<Token> const& bt,
+ PhrasePair<Token>& pp,
+ vector<float> * dest = NULL) const
+ {
+ if (!dest) dest = &pp.fvals;
+ // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
+ // parse_pid(pp.p1, sid1, off1, len1);
+ // parse_pid(pp.p2, sid2, off2, len2);
+ bitvector check1(pp.len1),check2(pp.len2);
+ for (size_t i = 0; i < pp.aln.size(); )
+ {
+ check1.set(pp.aln[i++]);
+ check2.set(pp.aln.at(i++));
+ }
+
+ if (this->m_num_feats == 1)
+ {
+ (*dest)[this->m_index] = pp.len1 - check1.count();
+ (*dest)[this->m_index] += pp.len2 - check2.count();
+ }
+ else
+ {
+ (*dest)[this->m_index] = pp.len1 - check1.count();
+ (*dest)[this->m_index+1] = pp.len2 - check2.count();
+ }
+ }
+ };
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc
new file mode 100644
index 000000000..58a70cab4
--- /dev/null
+++ b/moses/TranslationModel/UG/sim-pe.cc
@@ -0,0 +1,83 @@
+#include "mmsapt.h"
+#include "moses/Manager.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+vector<FactorType> fo(1,FactorType(0)); // one-element factor list holding factor 0; not referenced elsewhere in this file
+
+// Writes the target words of a hypothesis chain to out.
+// The chain is collected by following GetPrevHypo() back from x and is
+// then printed starting with the earliest hypothesis, using factor 0 of
+// every word in each hypothesis' current target phrase.  Words are
+// separated by exactly one space; nothing is printed for a NULL x.
+ostream&
+operator<<(ostream& out, Hypothesis const* x)
+{
+  vector<const Hypothesis*> H;
+  for (const Hypothesis* h = x; h; h = h->GetPrevHypo())
+    H.push_back(h);
+
+  // The previous separator test (H.size() ? " " : "") could never pick
+  // the empty string inside the loop, so every line ended in a trailing
+  // space.  Track the first word explicitly instead.
+  bool firstWord = true;
+  for (; H.size(); H.pop_back())
+    {
+      Phrase const& p = H.back()->GetCurrTargetPhrase();
+      for (size_t pos = 0 ; pos < p.GetSize() ; pos++)
+        {
+          if (!firstWord) out << " ";
+          out << *p.GetFactor(pos, 0);
+          firstWord = false;
+        }
+    }
+  return out;
+}
+
+vector<FactorType> ifo; // input factor order: assigned in main(), passed to Sentence::Read() in translate()
+size_t lineNumber; // first constructor argument of Manager in translate(); main() sets it to 0 and never changes it
+
+// Decodes a single input line with the globally loaded configuration and
+// returns the best hypothesis rendered through the
+// operator<<(ostream&, Hypothesis const*) defined in this file.
+string
+translate(string const& source)
+{
+  StaticData const& config = StaticData::Instance();
+
+  // The line is handed to Sentence::Read() as a stream with a newline
+  // appended.
+  Sentence input;
+  istringstream lineStream(source+"\n");
+  input.Read(lineStream,ifo);
+
+  Manager decoder(lineNumber, input, config.GetSearchAlgorithm());
+  decoder.ProcessSentence();
+
+  const Hypothesis* best = decoder.GetBestHypothesis();
+  ostringstream rendered;
+  rendered << best;
+  return rendered.str();
+}
+
+// Simulated post-editing driver.
+// Loads the Moses configuration from the command line, then reads records
+// of three lines each from stdin: source sentence, reference translation
+// and word alignment.  For every record it prints the source ([S]), the
+// translation before the update ([H]) and the reference ([T]), adds the
+// sentence pair to the first phrase dictionary (which must be an Mmsapt),
+// and prints the translation after the update ([X]).
+// Exits with status 1 on configuration errors, when the first phrase
+// dictionary is missing or not an Mmsapt, or when a record is incomplete.
+int main(int argc, char* argv[])
+{
+  Parameter params;
+  if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0]))
+    exit(1);
+
+  StaticData const& global = StaticData::Instance();
+  global.SetVerboseLevel(0);
+  ifo = global.GetInputFactorOrder();
+
+  // Resolve the updatable phrase table once, with a checked cast: a plain
+  // reinterpret_cast would silently accept any other dictionary type and
+  // indexing an empty collection is undefined behaviour.
+  if (PhraseDictionary::GetColl().empty())
+    {
+      cerr << "sim-pe: no phrase dictionary configured." << endl;
+      exit(1);
+    }
+  Mmsapt* pdsa = dynamic_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
+  if (!pdsa)
+    {
+      cerr << "sim-pe: the first phrase dictionary is not an Mmsapt." << endl;
+      exit(1);
+    }
+
+  lineNumber = 0; // TODO: Include sentence request number here?
+  string source, target, alignment;
+  while (getline(cin,source))
+    {
+      // A failed getline() leaves its string untouched, so a truncated
+      // record would otherwise be added with data from the previous one.
+      if (!getline(cin,target) || !getline(cin,alignment))
+        {
+          cerr << "sim-pe: incomplete input record (expected source, target "
+               << "and alignment lines) for source: " << source << endl;
+          exit(1);
+        }
+      cout << "[S] " << source << endl;
+      cout << "[H] " << translate(source) << endl;
+      cout << "[T] " << target << endl;
+      pdsa->add(source,target,alignment);
+      cout << "[X] " << translate(source) << endl;
+      cout << endl;
+    }
+  exit(0);
+}
+
+
+
diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc
index 30c87ccab..483ad2c34 100644
--- a/moses/TranslationModel/UG/try-align.cc
+++ b/moses/TranslationModel/UG/try-align.cc
@@ -2,32 +2,33 @@
using namespace std;
using namespace Moses;
+// currently broken
Mmsapt* PT;
int main(int argc, char* argv[])
{
- string base = argv[1];
- string L1 = argv[2];
- string L2 = argv[3];
- ostringstream buf;
- buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
- << base << " L1=" << L1 << " L2=" << L2;
- string configline = buf.str();
- PT = new Mmsapt(configline);
- PT->Load();
- float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 };
- vector<float> weights(w,w+5);
- PT->setWeights(weights);
- // these values are taken from a moses.ini file;
- // is there a convenient way of accessing them from within mmsapt ???
- string eline,fline;
- // TokenIndex V; V.open("crp/trn/mm/de.tdx");
- while (getline(cin,eline) && getline(cin,fline))
- {
- cout << eline << endl;
- cout << fline << endl;
- PT->align(eline,fline);
- }
- delete PT;
+ // string base = argv[1];
+ // string L1 = argv[2];
+ // string L2 = argv[3];
+ // ostringstream buf;
+ // buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
+ // << base << " L1=" << L1 << " L2=" << L2;
+ // string configline = buf.str();
+ // PT = new Mmsapt(configline);
+ // PT->Load();
+ // float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 };
+ // vector<float> weights(w,w+5);
+ // PT->setWeights(weights);
+ // // these values are taken from a moses.ini file;
+ // // is there a convenient way of accessing them from within mmsapt ???
+ // string eline,fline;
+ // // TokenIndex V; V.open("crp/trn/mm/de.tdx");
+ // while (getline(cin,eline) && getline(cin,fline))
+ // {
+ // cout << eline << endl;
+ // cout << fline << endl;
+ // PT->align(eline,fline);
+ // }
+ // delete PT;
}
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index 8766743b3..a91c58343 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -345,10 +345,10 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
// find the best matches according to letter sed
string best_path = "";
int best_match = -1;
- int best_letter_cost;
+ unsigned int best_letter_cost;
if (lsed_flag) {
best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
- for(int si=0; si<best_tm.size(); si++) {
+ for(size_t si=0; si<best_tm.size(); si++) {
int s = best_tm[si];
string path;
unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index fb9fd56cb..7852d130d 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -59,7 +59,11 @@ const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200;
const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000;
const size_t DEFAULT_MAX_TRANS_OPT_SIZE = 5000;
const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000;
-const size_t DEFAULT_MAX_PHRASE_LENGTH = 20;
+#ifdef PT_UG
+ const size_t DEFAULT_MAX_PHRASE_LENGTH = -1;
+#else
+ const size_t DEFAULT_MAX_PHRASE_LENGTH = 20;
+#endif
const size_t DEFAULT_MAX_CHART_SPAN = 10;
const size_t ARRAY_SIZE_INCR = 10; //amount by which a phrase gets resized when necessary
const float LOWEST_SCORE = -100.0f;
diff --git a/moses/Util.h b/moses/Util.h
index 3bba71332..24a4e2c28 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -56,8 +56,12 @@ namespace Moses
/** verbose macros
* */
+
#define VERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR(str); } }
#define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level)
+#define XVERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR("[" << __FILE__ << ":" << __LINE__ << "] ");TRACE_ERR(str); } }
+#define HERE __FILE__ << ":" << __LINE__
+
#if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
// gcc nth_element() bug
diff --git a/scripts/server/moses.py b/scripts/server/moses.py
index 155458b9b..a176c473a 100644
--- a/scripts/server/moses.py
+++ b/scripts/server/moses.py
@@ -152,7 +152,7 @@ def find_free_port(p):
class MosesServer(ProcessWrapper):
- def __init__(self,args=["-fd", "\n"]):
+ def __init__(self,args=[]):
self.process = None
mserver_cmd = moses_root+"/bin/mosesserver"
self.cmd = [mserver_cmd] + args
@@ -175,7 +175,10 @@ class MosesServer(ProcessWrapper):
self.cmd.extend(["--server-port", "%d"%self.port])
if debug:
print >>sys.stderr,self.cmd
- self.process = Popen(self.cmd,stderr = sys.stderr)
+ # self.stderr = open("mserver.%d.stderr"%self.port,'w')
+ # self.stdout = open("mserver.%d.stdout"%self.port,'w')
+ # self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
+ self.process = Popen(self.cmd)
else:
devnull = open(os.devnull,"w")
self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
@@ -216,10 +219,13 @@ class MosesServer(ProcessWrapper):
elif type(input) is list:
return [self.translate(x) for x in input]
+
elif type(input) is dict:
return self.proxy.translate(input)
+
else:
raise Exception("Can't handle input of this type!")
+
except:
attempts += 1
print >>sys.stderr, "WAITING", attempts
diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py
index 340695a56..52d1e314a 100755
--- a/scripts/server/sim-pe.py
+++ b/scripts/server/sim-pe.py
@@ -127,13 +127,40 @@ def translate(proxy, args, line):
param['nbest-distinct'] = True
pass
attempts = 0
- while attempts < 120:
+ while attempts < 20:
+ t1 = time.time()
try:
- return proxy.translate(param)
- except:
- print >>sys.stderr, "Waiting", proxy
- attempts += 1
+ return proxy.translate(param)
+
+ # except xmlrpclib.Fault as e:
+ # except xmlrpclib.ProtocolError as e:
+ # except xmlrpclib.ResponseError as e:
+ except xmlrpclib.Error as e:
+ time.sleep(2) # give all the stderr stuff a chance to be flushed
+ print >>sys.stderr," XMLRPC error:",e
+ print >>sys.stderr, "Input was"
+ print >>sys.stderr, param
+ sys.exit(1)
+
+ except IOError as e:
+ print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
time.sleep(5)
+
+ except:
+ serverstatus = mserver.process.poll()
+ if serverstatus == None:
+ print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
+ attempts += 1
+ if attempts > 10:
+ time.sleep(10)
+ else:
+ time.sleep(5)
+ pass
+ else:
+
+ print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
+ %(serverstatus/256,serverstatus%256)
+ pass
pass
pass
raise Exception("Exception: could not reach translation server.")
@@ -210,17 +237,25 @@ if __name__ == "__main__":
pass
pass
- if args.url:
- mserver.connect(args.url)
- else:
- mserver.start(args=mo_args,port=args.port,debug=args.debug)
- pass
-
ref = None
aln = None
if args.ref: ref = read_data(args.ref)
if args.aln: aln = read_data(args.aln)
+ if ref and aln:
+ try:
+ mo_args.index("--serial")
+ except:
+ mo_args.append("--serial")
+ pass
+ pass
+
+ if args.url:
+ mserver.connect(args.url)
+ else:
+ mserver.start(args=mo_args, port=args.port, debug=args.debug)
+ pass
+
if (args.input == "-"):
line = sys.stdin.readline()
idx = 0