diff options
42 files changed, 2365 insertions, 1303 deletions
diff --git a/.gitignore b/.gitignore index f870bed03..e7c37d86c 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,4 @@ nbproject/ mingw/MosesGUI/MosesGUI.e4p mingw/MosesGUI/_eric4project/ +contrib/m4m/merge-sorted @@ -152,13 +152,15 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses if [ option.get "with-mm" : : "yes" ] { alias mm : + moses/TranslationModel/UG//spe-check-coverage2 moses/TranslationModel/UG//ptable-lookup + moses/TranslationModel/UG//sim-pe + moses/TranslationModel/UG//spe-check-coverage moses/TranslationModel/UG/mm//mtt-build moses/TranslationModel/UG/mm//mtt-dump moses/TranslationModel/UG/mm//symal2mam moses/TranslationModel/UG/mm//mam2symal moses/TranslationModel/UG/mm//mam_verify - moses/TranslationModel/UG/mm//custom-pt moses/TranslationModel/UG/mm//mmlex-build moses/TranslationModel/UG/mm//mmlex-lookup moses/TranslationModel/UG/mm//mtt-count-words diff --git a/OnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp index a38fc5435..77576d956 100644 --- a/OnDiskPt/queryOnDiskPt.cpp +++ b/OnDiskPt/queryOnDiskPt.cpp @@ -22,7 +22,7 @@ int main(int argc, char **argv) { int tableLimit = 20; std::string ttable = ""; - bool useAlignments = false; + // bool useAlignments = false; for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-tlimit")) { diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 1ff11f0ae..f14111f33 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -4,6 +4,7 @@ #include <algorithm> +#include "moses/Util.h" #include "moses/ChartManager.h" #include "moses/Hypothesis.h" #include "moses/Manager.h" @@ -59,7 +60,7 @@ public: if(add2ORLM_) { //updateORLM(); } - cerr << "Done inserting\n"; + XVERBOSE(1,"Done inserting\n"); //PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy); map<string, xmlrpc_c::value> retData; //*retvalP = xmlrpc_c::value_struct(retData); @@ -120,17 +121,17 @@ public: if(si == params.end()) throw 
xmlrpc_c::fault("Missing source sentence", xmlrpc_c::fault::CODE_PARSE); source_ = xmlrpc_c::value_string(si->second); - cerr << "source = " << source_ << endl; + XVERBOSE(1,"source = " << source_ << endl); si = params.find("target"); if(si == params.end()) throw xmlrpc_c::fault("Missing target sentence", xmlrpc_c::fault::CODE_PARSE); target_ = xmlrpc_c::value_string(si->second); - cerr << "target = " << target_ << endl; + XVERBOSE(1,"target = " << target_ << endl); si = params.find("alignment"); if(si == params.end()) throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE); alignment_ = xmlrpc_c::value_string(si->second); - cerr << "alignment = " << alignment_ << endl; + XVERBOSE(1,"alignment = " << alignment_ << endl); si = params.find("bounded"); bounded_ = (si != params.end()); si = params.find("updateORLM"); @@ -224,7 +225,7 @@ public: } const string source((xmlrpc_c::value_string(si->second))); - cerr << "Input: " << source << endl; + XVERBOSE(1,"Input: " << source << endl); si = params.find("align"); bool addAlignInfo = (si != params.end()); si = params.find("word-align"); @@ -287,13 +288,13 @@ public: } } else { Sentence sentence; - const vector<FactorType> &inputFactorOrder = - staticData.GetInputFactorOrder(); + const vector<FactorType> & + inputFactorOrder = staticData.GetInputFactorOrder(); stringstream in(source + "\n"); sentence.Read(in,inputFactorOrder); size_t lineNumber = 0; // TODO: Include sentence request number here? 
Manager manager(lineNumber, sentence, staticData.GetSearchAlgorithm()); - manager.ProcessSentence(); + manager.ProcessSentence(); const Hypothesis* hypo = manager.GetBestHypothesis(); vector<xmlrpc_c::value> alignInfo; @@ -331,7 +332,7 @@ public: pair<string, xmlrpc_c::value> text("text", xmlrpc_c::value_string(out.str())); retData.insert(text); - cerr << "Output: " << out.str() << endl; + XVERBOSE(1,"Output: " << out.str() << endl); *retvalP = xmlrpc_c::value_struct(retData); } @@ -574,7 +575,7 @@ int main(int argc, char** argv) { //Extract port and log, send other args to moses - char** mosesargv = new char*[argc+2]; + char** mosesargv = new char*[argc+2]; // why "+2" [UG] int mosesargc = 0; int port = 8080; const char* logfile = "/dev/null"; @@ -634,11 +635,11 @@ int main(int argc, char** argv) myRegistry.addMethod("updater", updater); myRegistry.addMethod("optimize", optimizer); - xmlrpc_c::serverAbyss myAbyssServer( - myRegistry, - port, // TCP port on which to listen - logfile - ); + xmlrpc_c::serverAbyss myAbyssServer( + myRegistry, + port, // TCP port on which to listen + logfile + ); /* doesn't work with xmlrpc-c v. 
1.16.33 - ie very old lib on Ubuntu 12.04 xmlrpc_c::serverAbyss myAbyssServer( xmlrpc_c::serverAbyss::constrOpt() @@ -648,12 +649,10 @@ int main(int argc, char** argv) .allowOrigin("*") ); */ - - cerr << "Listening on port " << port << endl; + + XVERBOSE(1,"Listening on port " << port << endl); if (isSerial) { - while(1) { - myAbyssServer.runOnce(); - } + while(1) myAbyssServer.runOnce(); } else { myAbyssServer.run(); } diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile index bddc10911..d257cd26c 100644 --- a/moses-cmd/Jamfile +++ b/moses-cmd/Jamfile @@ -3,4 +3,11 @@ alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z exe moses : Main.cpp deps ; exe lmbrgrid : LatticeMBRGrid.cpp deps ; -alias programs : moses lmbrgrid ; +exe simulate-pe : +simulate-pe.cc +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)//boost_program_options +deps +; + +alias programs : moses lmbrgrid simulate-pe ; diff --git a/moses/BitmapContainer.cpp b/moses/BitmapContainer.cpp index 981b04895..ee2d55fc8 100644 --- a/moses/BitmapContainer.cpp +++ b/moses/BitmapContainer.cpp @@ -161,13 +161,17 @@ BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer } if (m_translations.size() > 1) { - UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(), - "Non-monotonic future score"); + UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(), + "Non-monotonic future score: " + << m_translations.Get(0)->GetFutureScore() << " vs. " + << m_translations.Get(1)->GetFutureScore()); } if (m_hypotheses.size() > 1) { UTIL_THROW_IF2(m_hypotheses[0]->GetTotalScore() < m_hypotheses[1]->GetTotalScore(), - "Non-monotonic total score"); + "Non-monotonic total score" + << m_hypotheses[0]->GetTotalScore() << " vs. 
" + << m_hypotheses[1]->GetTotalScore()); } HypothesisScoreOrdererWithDistortion orderer (&transOptRange); @@ -442,7 +446,9 @@ BitmapContainer::ProcessBestHypothesis() if (!Empty()) { HypothesisQueueItem *check = Dequeue(true); UTIL_THROW_IF2(item->GetHypothesis()->GetTotalScore() < check->GetHypothesis()->GetTotalScore(), - "Non-monotonic total score"); + "Non-monotonic total score: " + << item->GetHypothesis()->GetTotalScore() << " vs. " + << check->GetHypothesis()->GetTotalScore()); } // Logging for the criminally insane diff --git a/moses/Manager.cpp b/moses/Manager.cpp index 6bc82378e..196f4d997 100644 --- a/moses/Manager.cpp +++ b/moses/Manager.cpp @@ -105,7 +105,9 @@ void Manager::ProcessSentence() // some reporting on how long this took IFVERBOSE(1) { GetSentenceStats().StopTimeCollectOpts(); - TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took " << GetSentenceStats().GetTimeCollectOpts() << " seconds" << endl); + TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took " + << GetSentenceStats().GetTimeCollectOpts() << " seconds at " + << __FILE__ << ":" << __LINE__ << endl); } // search for best translation with the specified algorithm diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile index ecd175a65..c36d4a072 100644 --- a/moses/TranslationModel/UG/Jamfile +++ b/moses/TranslationModel/UG/Jamfile @@ -20,6 +20,39 @@ $(TOP)/moses/TranslationModel/UG//mmsapt $(TOP)/util//kenutil ; +exe sim-pe : +sim-pe.cc +$(TOP)/moses//moses +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)//boost_iostreams +$(TOP)//boost_program_options +$(TOP)/moses/TranslationModel/UG/mm//mm +$(TOP)/moses/TranslationModel/UG//mmsapt +$(TOP)/util//kenutil +; + +exe spe-check-coverage : +spe-check-coverage.cc +$(TOP)/moses//moses +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)//boost_iostreams +$(TOP)//boost_program_options +$(TOP)/moses/TranslationModel/UG/mm//mm +$(TOP)/moses/TranslationModel/UG//mmsapt 
+$(TOP)/util//kenutil +; + +exe spe-check-coverage2 : +spe-check-coverage2.cc +$(TOP)/moses//moses +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)//boost_iostreams +$(TOP)//boost_program_options +$(TOP)/moses/TranslationModel/UG/mm//mm +$(TOP)/moses/TranslationModel/UG//mmsapt +$(TOP)/util//kenutil +; + install $(PREFIX)/bin : try-align ; -fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ; +fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ; diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc new file mode 100644 index 000000000..7dc2cd18f --- /dev/null +++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc @@ -0,0 +1,50 @@ +//-*- c++ -*- +#include "ug_splice_arglist.h" +#include "moses/Util.h" +#include "util/exception.hh" +#include <boost/foreach.hpp> + +namespace Moses { + + void + filter_arguments(int const argc_in, char const* const* const argv_in, + int & argc_moses, char*** argv_moses, + int & argc_other, char*** argv_other, + vector<pair<string,int> > const& filter) + { + *argv_moses = new char*[argc_in]; + *argv_other = new char*[argc_in]; + (*argv_moses)[0] = new char[strlen(argv_in[0])+1]; + strcpy((*argv_moses)[0], argv_in[0]); + argc_moses = 1; + argc_other = 0; + typedef pair<string,int> option; + int i = 1; + while (i < argc_in) + { + BOOST_FOREACH(option const& o, filter) + { + if (o.first == argv_in[i]) + { + (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1]; + strcpy((*argv_other)[argc_other++],argv_in[i]); + for (int k = 0; k < o.second; ++k) + { + UTIL_THROW_IF2(++i >= argc_in || argv_in[i][0] == '-', + "[" << HERE << "] Missing argument for " + << "parameter " << o.first << "!"); + (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1]; + strcpy((*argv_other)[argc_other++],argv_in[i]); + } + if (++i >= argc_in) break; + } + } + if (i >= argc_in) break; + (*argv_moses)[argc_moses] = new 
char[strlen(argv_in[i])+1]; + strcpy((*argv_moses)[argc_moses++], argv_in[i++]); + } + } + +} // namespace Moses + + diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h new file mode 100644 index 000000000..e56585e8a --- /dev/null +++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h @@ -0,0 +1,18 @@ +//-*- c++ -*- +#pragma once +#include <vector> +#include <string> +namespace Moses { + using namespace std; + + // Function to splice the argument list (e.g. before handing it over to + // Moses LoadParam() function. /filter/ is a vector of argument names + // and the number of arguments after each of them + void + filter_arguments(int const argc_in, char const* const* const argv_in, + int & argc_moses, char*** argv_moses, + int & argc_other, char*** argv_other, + vector<pair<string,int> > const& filter); + + +} // namespace Moses diff --git a/moses/TranslationModel/UG/mm/Jamfile b/moses/TranslationModel/UG/mm/Jamfile index 2cc923581..8d8af050a 100644 --- a/moses/TranslationModel/UG/mm/Jamfile +++ b/moses/TranslationModel/UG/mm/Jamfile @@ -72,15 +72,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm $(TOP)/util//kenutil ; -exe custom-pt : -custom-pt.cc -$(TOP)/moses//moses -$(TOP)//boost_iostreams -$(TOP)//boost_program_options -$(TOP)/moses/TranslationModel/UG/mm//mm -$(TOP)/moses/TranslationModel/UG/generic//generic -$(TOP)/util//kenutil -; +# exe custom-pt : +# custom-pt.cc +# $(TOP)/moses//moses +# $(TOP)//boost_iostreams +# $(TOP)//boost_program_options +# $(TOP)/moses/TranslationModel/UG/mm//mm +# $(TOP)/moses/TranslationModel/UG/generic//generic +# $(TOP)/util//kenutil +# ; exe calc-coverage : @@ -98,7 +98,6 @@ mtt-dump mtt-count-words symal2mam mam2symal -custom-pt mmlex-build mmlex-lookup mam_verify diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc index 1c1e0893c..e52772b48 100644 --- 
a/moses/TranslationModel/UG/mm/custom-pt.cc +++ b/moses/TranslationModel/UG/mm/custom-pt.cc @@ -1,6 +1,6 @@ // build a phrase table for the given input // #include "ug_lexical_phrase_scorer2.h" - +#if 0 #include <stdint.h> #include <string> #include <vector> @@ -25,7 +25,7 @@ #include "ug_bitext.h" #include "../mmsapt_phrase_scorers.h" #include "ug_lexical_phrase_scorer2.h" - +#include "../sapt_phrase_scorers.h" using namespace std; using namespace ugdiss; using namespace Moses; @@ -110,6 +110,7 @@ int main(int argc, char* argv[]) { // assert(argc == 4); #if 0 +#if 0 string base = argv[1]; string L1 = argv[2]; string L2 = argv[3]; @@ -182,7 +183,7 @@ int main(int argc, char* argv[]) } } } - +#endif exit(0); } - +#endif diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 8dbbdcb92..a1a6dff7b 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -158,99 +158,25 @@ namespace Moses jstats:: invalidate() { - my_rcnt = 0; + if (my_wcnt > 0) + my_wcnt *= -1; } - bool + void jstats:: - valid() - { - return my_rcnt != 0; - } - - bool - PhrasePair:: - operator<=(PhrasePair const& other) const + validate() { - return this->score <= other.score; + if (my_wcnt < 0) + my_wcnt *= -1; } bool - PhrasePair:: - operator>=(PhrasePair const& other) const - { - return this->score >= other.score; - } - - bool - PhrasePair:: - operator<(PhrasePair const& other) const - { - return this->score < other.score; - } - - bool - PhrasePair:: - operator>(PhrasePair const& other) const - { - return this->score > other.score; - } - - PhrasePair:: - PhrasePair() {} - - PhrasePair:: - PhrasePair(PhrasePair const& o) - : p1(o.p1), - p2(o.p2), - raw1(o.raw1), - raw2(o.raw2), - sample1(o.sample1), - sample2(o.sample2), - good1(o.good1), - good2(o.good2), - joint(o.joint), - fvals(o.fvals), - aln(o.aln), - score(o.score) - { - for (size_t i = 0; i <= po_other; ++i) - { - dfwd[i] = o.dfwd[i]; - dbwd[i] 
= o.dbwd[i]; - } - } - - void - PhrasePair:: - init(uint64_t const pid1, pstats const& ps, size_t const numfeats) + jstats:: + valid() { - p1 = pid1; - p2 = 0; - raw1 = ps.raw_cnt; - sample1 = ps.sample_cnt; - sample2 = 0; - good1 = ps.good; - good2 = 0; - raw2 = 0; - fvals.resize(numfeats); + return my_wcnt >= 0; } - void - PhrasePair:: - init(uint64_t const pid1, - pstats const& ps1, - pstats const& ps2, - size_t const numfeats) - { - p1 = pid1; - raw1 = ps1.raw_cnt + ps2.raw_cnt; - sample1 = ps1.sample_cnt + ps2.sample_cnt; - sample2 = 0; - good1 = ps1.good + ps2.good; - good2 = 0; - fvals.resize(numfeats); - } float lbop(size_t const tries, size_t const succ, float const confidence) @@ -261,85 +187,6 @@ namespace Moses find_lower_bound_on_p(tries, succ, confidence))); } - PhrasePair const& - PhrasePair:: - update(uint64_t const pid2, jstats const& js) - { - p2 = pid2; - raw2 = js.cnt2(); - joint = js.rcnt(); - assert(js.aln().size()); - if (js.aln().size()) - aln = js.aln()[0].second; - float total_fwd = 0, total_bwd = 0; - for (int i = po_first; i <= po_other; i++) - { - PhraseOrientation po = static_cast<PhraseOrientation>(i); - total_fwd += js.dcnt_fwd(po)+1; - total_bwd += js.dcnt_bwd(po)+1; - } - for (int i = po_first; i <= po_other; i++) - { - PhraseOrientation po = static_cast<PhraseOrientation>(i); - dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; - dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; - } - return *this; - } - - PhrasePair const& - PhrasePair:: - update(uint64_t const pid2, jstats const& js1, jstats const& js2) - { - p2 = pid2; - raw2 = js1.cnt2() + js2.cnt2(); - joint = js1.rcnt() + js2.rcnt(); - assert(js1.aln().size() || js2.aln().size()); - if (js1.aln().size()) - aln = js1.aln()[0].second; - else if (js2.aln().size()) - aln = js2.aln()[0].second; - for (int i = po_first; i < po_other; i++) - { - PhraseOrientation po = static_cast<PhraseOrientation>(i); - dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other); - dbwd[i] 
= float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other); - } - return *this; - } - - PhrasePair const& - PhrasePair:: - update(uint64_t const pid2, - size_t const raw2extra, - jstats const& js) - { - p2 = pid2; - raw2 = js.cnt2() + raw2extra; - joint = js.rcnt(); - assert(js.aln().size()); - if (js.aln().size()) - aln = js.aln()[0].second; - for (int i = po_first; i <= po_other; i++) - { - PhraseOrientation po = static_cast<PhraseOrientation>(i); - dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other); - dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other); - } - return *this; - } - - float - PhrasePair:: - eval(vector<float> const& w) - { - assert(w.size() == this->fvals.size()); - this->score = 0; - for (size_t i = 0; i < w.size(); ++i) - this->score += w[i] * this->fvals[i]; - return this->score; - } - template<> sptr<imBitext<L2R_Token<SimpleWordId> > > imBitext<L2R_Token<SimpleWordId> >:: @@ -371,7 +218,8 @@ namespace Moses uint32_t row,col; char c; while (ibuf >> row >> c >> col) { - assert(c == '-'); + UTIL_THROW_IF2(c != '-', "[" << HERE << "] " + << "Error in alignment information:\n" << a); binwrite(obuf,row); binwrite(obuf,col); } @@ -639,7 +487,6 @@ namespace Moses cout << string(90,'-') << endl; } - PhraseOrientation find_po_fwd(vector<vector<ushort> >& a1, vector<vector<ushort> >& a2, @@ -654,13 +501,13 @@ namespace Moses ushort ns1,ne1,ne2; if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2)) - { - return po_other; - } + return po_other; + if (ns1 >= e1) { for (ushort j = e1; j < ns1; ++j) - if (a1[j].size()) return po_jfwd; + if (a1[j].size()) + return po_jfwd; return po_mono; } else diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 397253973..4cb34c02d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -56,6 +56,7 @@ namespace Moses { class Mmsapt; namespace bitext { + template<typename TKN> class Bitext; using namespace ugdiss; 
template<typename TKN> class Bitext; @@ -120,6 +121,7 @@ namespace Moses { void add(float w, vector<uchar> const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient); void invalidate(); + void validate(); bool valid(); uint32_t dcnt_fwd(PhraseOrientation const idx) const; uint32_t dcnt_bwd(PhraseOrientation const idx) const; @@ -157,43 +159,6 @@ namespace Moses { uint32_t fwd_o, uint32_t bwd_o); }; - class - PhrasePair - { - public: - uint64_t p1, p2; - uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; - vector<float> fvals; - float dfwd[po_other+1]; - float dbwd[po_other+1]; - vector<uchar> aln; - // float avlex12,avlex21; // average lexical probs (Moses std) - // float znlex1,znlex2; // zens-ney lexical smoothing - // float colex1,colex2; // based on raw lexical occurrences - float score; - PhrasePair(); - PhrasePair(PhrasePair const& o); - bool operator<(PhrasePair const& other) const; - bool operator>(PhrasePair const& other) const; - bool operator<=(PhrasePair const& other) const; - bool operator>=(PhrasePair const& other) const; - - void init(uint64_t const pid1, pstats const& ps, size_t const numfeats); - void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, - size_t const numfeats); - - PhrasePair const& - update(uint64_t const pid2, jstats const& js); - - PhrasePair const& - update(uint64_t const pid2, jstats const& js1, jstats const& js2); - - PhrasePair const& - update(uint64_t const pid2, size_t const raw2extra, jstats const& js); - - float eval(vector<float> const& w); - }; - template<typename TKN> class Bitext diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h index 05066c922..0c6e4afbf 100644 --- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h +++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h @@ -16,6 +16,9 @@ #include "tpt_tokenindex.h" #include "ug_ttrack_base.h" #include "tpt_tokenindex.h" +#include "util/exception.hh" +#include "moses/Util.h" + // #include 
"ug_vocab.h" // define the corpus buffer size (in sentences) and the @@ -49,6 +52,8 @@ namespace ugdiss typename boost::shared_ptr<imTtrack<Token> > append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt); + void m_check_token_count(); // debugging function + public: imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d); @@ -70,6 +75,22 @@ namespace ugdiss }; template<typename Token> + void + imTtrack<Token>:: + m_check_token_count() + { // sanity check + size_t check = 0; + BOOST_FOREACH(vector<Token> const& s, *myData) + check += s.size(); + UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]" + << " Wrong token count after appending sentence!" + << " Counted " << check << " but expected " + << this->numToks << " in a total of " << myData->size() + << " sentences."); + + } + + template<typename Token> Token const* imTtrack<Token>:: sntStart(size_t sid) const // return pointer to beginning of sentence @@ -111,9 +132,9 @@ namespace ugdiss template<typename Token> imTtrack<Token>:: imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL) + : numToks(0) { myData.reset(new vector<vector<Token> >()); - numToks = 0; string line,w; size_t linectr=0; boost::unordered_map<string,id_type> H; @@ -135,6 +156,7 @@ namespace ugdiss template<typename Token> imTtrack<Token>:: imTtrack(size_t reserve) + : numToks(0) { myData.reset(new vector<vector<Token> >()); if (reserve) myData->reserve(reserve); @@ -143,9 +165,9 @@ namespace ugdiss template<typename Token> imTtrack<Token>:: imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d) + : numToks(0) { myData = d; - numToks = 0; BOOST_FOREACH(vector<Token> const& v, *d) numToks += v.size(); } @@ -171,6 +193,9 @@ namespace ugdiss shared_ptr<imTtrack<TOKEN> > append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt) { +#if 1 + if (crp) crp->m_check_token_count(); +#endif shared_ptr<imTtrack<TOKEN> > ret; if (crp == NULL) { @@ -185,6 +210,11 @@ 
namespace ugdiss } else ret = crp; ret->myData->push_back(snt); + ret->numToks += snt.size(); + +#if 1 + ret->m_check_token_count(); +#endif return ret; } diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h index 558b5a7fa..b7e359223 100644 --- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h +++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h @@ -27,7 +27,6 @@ namespace ugdiss typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t; table_t COOC; void open(string const& fname); - template<typename someint> void score(TKN const* snt1, size_t const s1, size_t const e1, @@ -104,7 +103,19 @@ namespace ugdiss if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0; UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__ << ": alpha parameter must be >= 0"); - return float(COOC[s][t]+alpha)/(COOC.m1(s)+alpha); + float ret = COOC[s][t]+alpha; + ret = (ret?ret:1.)/(COOC.m1(s)+alpha); + UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ + << ": result not > 0 and <= 1. 
alpha = " << alpha << "; " + << COOC[s][t] << "/" << COOC.m1(s)); + +#if 0 + cerr << "[" << s << "," << t << "] " + << COOC.m1(s) << "/" + << COOC[s][t] << "/" + << COOC.m2(t) << endl; +#endif + return ret; } template<typename TKN> @@ -115,7 +126,11 @@ namespace ugdiss if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0; UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__ << ": alpha parameter must be >= 0"); - return float(COOC[s][t]+alpha)/(COOC.m2(t)+alpha); + float ret = float(COOC[s][t]+alpha); + ret = (ret?ret:1.)/(COOC.m2(t)+alpha); + UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ + << ": result not > 0 and <= 1."); + return ret; } template<typename TKN> diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc new file mode 100644 index 000000000..6373f8468 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc @@ -0,0 +1,97 @@ +#include "ug_phrasepair.h" +namespace Moses { + namespace bitext + { + +#if 0 + void + PhrasePair:: + init() + { + p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; + } + + void + PhrasePair:: + init(uint64_t const pid1, + pstats const& ps1, + pstats const& ps2, + size_t const numfeats) + { + p1 = pid1; + raw1 = ps1.raw_cnt + ps2.raw_cnt; + sample1 = ps1.sample_cnt + ps2.sample_cnt; + sample2 = 0; + good1 = ps1.good + ps2.good; + good2 = 0; + joint = 0; + fvals.resize(numfeats); + } + + PhrasePair const& + PhrasePair:: + update(uint64_t const pid2, jstats const& js1, jstats const& js2) + { + p2 = pid2; + raw2 = js1.cnt2() + js2.cnt2(); + joint = js1.rcnt() + js2.rcnt(); + assert(js1.aln().size() || js2.aln().size()); + if (js1.aln().size()) + aln = js1.aln()[0].second; + else if (js2.aln().size()) + aln = js2.aln()[0].second; + for (int i = po_first; i < po_other; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other); + dbwd[i] 
= float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other); + } + return *this; + } + + PhrasePair const& + PhrasePair:: + update(uint64_t const pid2, size_t r2) + { + p2 = pid2; + raw2 = r2; + joint = 0; + return *this; + } + + + PhrasePair const& + PhrasePair:: + update(uint64_t const pid2, + size_t const raw2extra, + jstats const& js) + { + p2 = pid2; + raw2 = js.cnt2() + raw2extra; + joint = js.rcnt(); + assert(js.aln().size()); + if (js.aln().size()) + aln = js.aln()[0].second; + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other); + dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other); + } + return *this; + } + + float + PhrasePair:: + eval(vector<float> const& w) + { + assert(w.size() == this->fvals.size()); + this->score = 0; + for (size_t i = 0; i < w.size(); ++i) + this->score += w[i] * this->fvals[i]; + return this->score; + } +#endif + } // namespace bitext +} // namespace Moses + diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h new file mode 100644 index 000000000..8cd43dc18 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -0,0 +1,243 @@ +//-*- c++ -*- +#pragma once +#include "ug_bitext.h" + +using namespace ugdiss; +using namespace std; + +namespace Moses { + namespace bitext + { + + template<typename Token> + string + toString(TokenIndex const& V, Token const* x, size_t const len) + { + if (!len) return ""; + UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); + ostringstream buf; + buf << V[x->id()]; + size_t i = 1; + for (x = x->next(); x && i < len; ++i, x = x->next()) + buf << " " << V[x->id()]; + UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); + return buf.str(); + } + + template<typename Token> + class + PhrasePair + { + public: + Token const* start1; + Token const* start2; + uint32_t len1; + uint32_t len2; + // uint64_t p1, p2; 
+ uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; + vector<float> fvals; + float dfwd[po_other+1]; // distortion counts // counts or probs? + float dbwd[po_other+1]; // distortion counts + vector<uchar> aln; + float score; + PhrasePair() { }; + PhrasePair(PhrasePair const& o); + + PhrasePair const& operator+=(PhrasePair const& other); + + bool operator<(PhrasePair const& other) const; + bool operator>(PhrasePair const& other) const; + bool operator<=(PhrasePair const& other) const; + bool operator>=(PhrasePair const& other) const; + + void init(); + void init(Token const* x, uint32_t const len, + pstats const* ps = NULL, size_t const numfeats=0); + + // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats); + // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, + // size_t const numfeats); + + // PhrasePair const& + // update(uint64_t const pid2, size_t r2 = 0); + + PhrasePair const& + update(Token const* x, uint32_t const len, jstats const& js); + + // PhrasePair const& + // update(uint64_t const pid2, jstats const& js1, jstats const& js2); + + // PhrasePair const& + // update(uint64_t const pid2, size_t const raw2extra, jstats const& js); + + // float + // eval(vector<float> const& w); + + class SortByTargetIdSeq + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + }; + + template<typename Token> + void + PhrasePair<Token>:: + init(Token const* x, uint32_t const len, + pstats const* ps, size_t const numfeats) + { + start1 = x; len1 = len; + // p1 = pid1; + // p2 = 0; + if (ps) + { + raw1 = ps->raw_cnt; + sample1 = ps->sample_cnt; + good1 = ps->good; + } + else raw1 = sample1 = good1 = 0; + joint = 0; + good2 = 0; + sample2 = 0; + raw2 = 0; + fvals.resize(numfeats); + } + + template<typename Token> + PhrasePair<Token> const& + PhrasePair<Token>:: + update(Token const* x, uint32_t const len, jstats const& js) + { + // p2 = pid2; + 
start2 = x; len2 = len; + raw2 = js.cnt2(); + joint = js.rcnt(); + assert(js.aln().size()); + if (js.aln().size()) + aln = js.aln()[0].second; + float total_fwd = 0, total_bwd = 0; + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + total_fwd += js.dcnt_fwd(po)+1; + total_bwd += js.dcnt_bwd(po)+1; + } + + // should we do that here or leave the raw counts? + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; + dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; + } + + return *this; + } + + template<typename Token> + bool + PhrasePair<Token>:: + operator<(PhrasePair const& other) const + { return this->score < other.score; } + + template<typename Token> + bool + PhrasePair<Token>:: + operator>(PhrasePair const& other) const + { return this->score > other.score; } + + template<typename Token> + bool + PhrasePair<Token>:: + operator<=(PhrasePair const& other) const + { return this->score <= other.score; } + + template<typename Token> + bool + PhrasePair<Token>:: + operator>=(PhrasePair const& other) const + { return this->score >= other.score; } + + template<typename Token> + PhrasePair<Token> const& + PhrasePair<Token>:: + operator+=(PhrasePair const& o) + { + raw1 += o.raw1; + raw2 += o.raw2; + sample1 += o.sample1; + sample2 += o.sample2; + good1 += o.good1; + good2 += o.good2; + joint += o.joint; + return *this; + } + + template<typename Token> + PhrasePair<Token>:: + PhrasePair(PhrasePair<Token> const& o) + : start1(o.start1) + , start2(o.start2) + , len1(o.len1) + , len2(o.len2) + , raw1(o.raw1) + , raw2(o.raw2) + , sample1(o.sample1) + , sample2(o.sample2) + , good1(o.good1) + , good2(o.good2) + , joint(o.joint) + , fvals(o.fvals) + , aln(o.aln) + , score(o.score) + { + for (size_t i = 0; i <= po_other; ++i) + { + dfwd[i] = o.dfwd[i]; + dbwd[i] = o.dbwd[i]; + } + } + + template<typename Token> + int + 
PhrasePair<Token>:: + SortByTargetIdSeq:: + cmp(PhrasePair const& a, PhrasePair const& b) const + { + size_t i = 0; + Token const* x = a.start2; + Token const* y = b.start2; + while (i < a.len2 && i < b.len2 && x->id() == y->id()) + { + x = x->next(); + y = y->next(); + ++i; + } + if (i == a.len2 && i == b.len2) return 0; + if (i == a.len2) return -1; + if (i == b.len2) return 1; + return x->id() < y->id() ? -1 : 1; + } + + template<typename Token> + bool + PhrasePair<Token>:: + SortByTargetIdSeq:: + operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template<typename Token> + void + PhrasePair<Token>:: + init() + { + len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; + start1 = start2 = NULL; + } + + + } // namespace bitext +} // namespace Moses diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h index 14bf6cdad..ab7f96bf0 100644 --- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h +++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h @@ -7,6 +7,8 @@ #include "ug_typedefs.h" #include "tpt_tokenindex.h" #include <iostream> +#include "util/exception.hh" +#include "moses/Util.h" //#include <cassert> // #include "ug_bv_iter.h" @@ -60,10 +62,15 @@ namespace ugdiss // TSA_tree_iterator(TSA_tree_iterator const& other); TSA_tree_iterator(TSA<Token> const* s); + TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other); TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len); // TSA_tree_iterator(TSA<Token> const* s, Token const& t); TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, + size_t const len, + bool full_match_only=true); + TSA_tree_iterator(TSA<Token> const* s, + Token const* kstart, Token const* kend, bool full_match_only=true); // TSA_tree_iterator(TSA<Token> const* s, @@ -150,9 +157,12 @@ namespace ugdiss double approxOccurrenceCount(int p=-1) const { assert(root); 
+ if (p < 0) p += lower.size(); double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize(); - assert(ret < root->corpus->numTokens()); if (ret < 25) ret = rawCnt(p); + UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] " + << "Word count mismatch."); + assert(ret <= root->corpus->numTokens()); return ret; } @@ -320,6 +330,18 @@ namespace ugdiss template<typename Token> TSA_tree_iterator<Token>:: + TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other) + : root(s) + { + Token const* x = other.getToken(0); + for (size_t i = 0; i < other.size() && this->extend(x->id()); ++i) + x = x->next(); + }; + + + + template<typename Token> + TSA_tree_iterator<Token>:: TSA_tree_iterator (TSA<Token> const* r, id_type const* s, @@ -385,6 +407,25 @@ namespace ugdiss template<typename Token> TSA_tree_iterator<Token>:: TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, + size_t const len, bool full_match_only) + : root(s) + { + if (!root) return; + size_t i = 0; + for (; i < len && kstart && extend(*kstart); ++i) + kstart = kstart->next(); + if (full_match_only && i != len) + { + lower.clear(); + upper.clear(); + } + }; + + // DEPRECATED: DO NOT USE. Use the one that takes the length + // instead of kend. 
+ template<typename Token> + TSA_tree_iterator<Token>:: + TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, Token const* kend, bool full_match_only) : root(s) { @@ -561,8 +602,7 @@ namespace ugdiss TSA_tree_iterator<Token>:: rawCnt(int p) const { - if (p < 0) - p = lower.size()+p; + if (p < 0) p += lower.size(); assert(p>=0); if (lower.size() == 0) return root->getCorpusSize(); return root->rawCnt(lower[p],upper[p]); diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index dc9945472..596fec4e6 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -1,13 +1,38 @@ #include "mmsapt.h" #include <boost/foreach.hpp> +#include <boost/scoped_ptr.hpp> #include <boost/tokenizer.hpp> #include <algorithm> +#include "moses/TranslationModel/UG/mm/ug_phrasepair.h" +#include "util/exception.hh" +#include <set> namespace Moses { using namespace bitext; using namespace std; using namespace boost; + + + // uint64_t + // pack_phrasekey(uint64_t const shard_id, uint64_t const snt_id, + // uint64_t const offset, uint64_t const len) + // { + // uint64_t one = 1; + // // 8 bits - 256 shards + // // 13 bits - max offset + // // 11 bits - max len + // // 32 bits - max sentence id + // UTIL_TRHOW_IF2(shard_id >= 256, "[" << HERE << "] " + // << "Sentence ID exceeds limit."); + // UTIL_THROW_IF2(snt_id >= 4294967296, "[" << HERE << "] " + // << "Sentence ID exceeds limit."); + // UTIL_TRHOW_IF2(offset >= 8192, "[" << HERE << "]" + // << "Phrase offset exceeds limit."); + // UTIL_TRHOW_IF2(offset >= 2048, "[" << HERE << "]" + // << "Phrase length exceeds limit."); + // return ((shard_id<<56)+(snt_id<<24)+(offset<<11)+len); + // } void fillIdSeq(Phrase const& mophrase, size_t const ifactor, @@ -23,7 +48,7 @@ namespace Moses void - parseLine(string const& line, map<string,string> & params) + parseLine(string const& line, map<string,string> & param) { char_separator<char> sep("; "); 
tokenizer<char_separator<char> > tokens(line,sep); @@ -32,9 +57,14 @@ namespace Moses size_t i = t.find_first_not_of(" ="); size_t j = t.find_first_of(" =",i+1); size_t k = t.find_first_not_of(" =",j+1); + UTIL_THROW_IF2(i == string::npos || k == string::npos, + "[" << HERE << "] " + << "Parameter specification error near '" + << t << "' in moses ini line\n" + << line); assert(i != string::npos); assert(k != string::npos); - params[t.substr(i,j)] = t.substr(k); + param[t.substr(i,j)] = t.substr(k); } } @@ -57,13 +87,13 @@ namespace Moses Mmsapt:: Mmsapt(string const& line) : PhraseDictionary(line) - , m_lex_alpha(1.0) - , withLogCountFeatures(false) - , withCoherence(true) - , m_pfwd_features("g") - , m_pbwd_features("g") - , withPbwd(true) - , poolCounts(true) + // , m_lex_alpha(1.0) + // , withLogCountFeatures(false) + // , withCoherence(true) + // , m_pfwd_features("g") + // , m_pbwd_features("g") + // , withPbwd(true) + // , poolCounts(true) , ofactor(1,0) , m_tpc_ctr(0) { @@ -94,81 +124,125 @@ namespace Moses void Mmsapt:: + register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry) + { + registry.push_back(ff); + ff->setIndex(m_feature_names.size()); + for (int i = 0; i < ff->fcnt(); ++i) + { + m_feature_names.push_back(ff->fname(i)); + m_is_logval.push_back(ff->isLogVal(i)); + m_is_integer.push_back(ff->isIntegerValued(i)); + } + } + + bool + Mmsapt:: + isLogVal(int i) const { return m_is_logval.at(i); } + + bool + Mmsapt:: + isInteger(int i) const { return m_is_integer.at(i); } + + void + Mmsapt:: init(string const& line) { map<string,string>::const_iterator m; - map<string,string> param; - parseLine(line,param); + parseLine(line,this->param); + + this->m_numScoreComponents = atoi(param["num-features"].c_str()); m = param.find("config"); if (m != param.end()) read_config_file(m->second,param); - - bname = param["base"]; + + bname = param["base"]; L1 = param["L1"]; L2 = param["L2"]; - assert(bname.size()); - assert(L1.size()); - 
assert(L2.size()); - - m = param.find("pfwd-denom"); - m_pfwd_denom = m != param.end() ? m->second[0] : 's'; - - m = param.find("smooth"); - m_lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05; - m = param.find("max-samples"); - m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000; + UTIL_THROW_IF2(bname.size() == 0, "Missing corpus base name at " << HERE); + UTIL_THROW_IF2(L1.size() == 0, "Missing L1 tag at " << HERE); + UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE); - if ((m = param.find("logcnt-features")) != param.end()) - withLogCountFeatures = m->second != "0"; - - if ((m = param.find("coh")) != param.end()) - withCoherence = m->second != "0"; - - if ((m = param.find("pfwd")) != param.end()) - m_pfwd_features = (m->second == "0" ? "" : m->second); - - if (m_pfwd_features == "1") // legacy; deprecated - m_pfwd_features[0] = m_pfwd_denom; + // set defaults for all parameters if not specified so far + pair<string,string> dflt("input-factor","0"); + input_factor = atoi(param.insert(dflt).first->second.c_str()); + // shouldn't that be a string? - if ((m = param.find("pbwd")) != param.end()) - m_pbwd_features = (m->second == "0" ? "" : m->second); + dflt = pair<string,string> ("smooth",".01"); + m_lbop_conf = atof(param.insert(dflt).first->second.c_str()); - if (m_pbwd_features == "1") - m_pbwd_features = "r"; // lecagy; deprecated + dflt = pair<string,string> ("lexalpha","0"); + m_lex_alpha = atof(param.insert(dflt).first->second.c_str()); - if ((m = param.find("lexalpha")) != param.end()) - m_lex_alpha = atof(m->second.c_str()); + dflt = pair<string,string> ("sample","1000"); + m_default_sample_size = atoi(param.insert(dflt).first->second.c_str()); - m = param.find("workers"); - m_workers = m != param.end() ? 
atoi(m->second.c_str()) : 8; + dflt = pair<string,string>("workers","8"); + m_workers = atoi(param.insert(dflt).first->second.c_str()); m_workers = min(m_workers,24UL); - if ((m = param.find("limit")) != param.end()) - m_tableLimit = atoi(m->second.c_str()); + dflt = pair<string,string>("limit","20"); + m_tableLimit = atoi(param.insert(dflt).first->second.c_str()); - m = param.find("cache-size"); - m_history.reserve(m != param.end()?max(1000,atoi(m->second.c_str())):10000); + dflt = pair<string,string>("cache","10000"); + size_t hsize = max(1000,atoi(param.insert(dflt).first->second.c_str())); + m_history.reserve(hsize); // in plain language: cache size is at least 1000, and 10,000 by default // this cache keeps track of the most frequently used target phrase collections // even when not actively in use - - this->m_numScoreComponents = atoi(param["num-features"].c_str()); - m = param.find("ifactor"); - input_factor = m != param.end() ? atoi(m->second.c_str()) : 0; + // Feature functions are initialized in function Load(); + param.insert(pair<string,string>("pfwd", "g")); + param.insert(pair<string,string>("pbwd", "g")); + param.insert(pair<string,string>("logcnt", "0")); + param.insert(pair<string,string>("coh", "0")); + param.insert(pair<string,string>("rare", "1")); + param.insert(pair<string,string>("prov", "1")); poolCounts = true; if ((m = param.find("extra")) != param.end()) extra_data = m->second; + // check for unknown parameters + vector<string> known_parameters; known_parameters.reserve(50); + known_parameters.push_back("L1"); + known_parameters.push_back("L2"); + known_parameters.push_back("Mmsapt"); + known_parameters.push_back("base"); + known_parameters.push_back("cache"); + known_parameters.push_back("coh"); + known_parameters.push_back("config"); + known_parameters.push_back("extra"); + known_parameters.push_back("input-factor"); + known_parameters.push_back("lexalpha"); + known_parameters.push_back("limit"); + known_parameters.push_back("logcnt"); 
+ known_parameters.push_back("name"); + known_parameters.push_back("num-features"); + known_parameters.push_back("output-factor"); + known_parameters.push_back("pbwd"); + known_parameters.push_back("pfwd"); + known_parameters.push_back("prov"); + known_parameters.push_back("rare"); + known_parameters.push_back("sample"); + known_parameters.push_back("smooth"); + known_parameters.push_back("unal"); + known_parameters.push_back("workers"); + for (map<string,string>::iterator m = param.begin(); m != param.end(); ++m) + { + UTIL_THROW_IF2(!binary_search(known_parameters.begin(), + known_parameters.end(), m->first), + HERE << ": Unknown parameter specification for Mmsapt: " + << m->first); + } } void Mmsapt:: - load_extra_data(string bname) + load_extra_data(string bname, bool locking = true) { // TO DO: ADD CHECKS FOR ROBUSTNESS // - file existence? @@ -186,122 +260,120 @@ namespace Moses while(getline(in2,line)) text2.push_back(line); while(getline(ina,line)) symal.push_back(line); - lock_guard<mutex> guard(this->lock); + boost::scoped_ptr<lock_guard<mutex> > guard; + if (locking) guard.reset(new lock_guard<mutex>(this->lock)); btdyn = btdyn->add(text1,text2,symal); assert(btdyn); // cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl; } - size_t + template<typename fftype> + void Mmsapt:: - add_corpus_specific_features - (vector<sptr<pscorer > >& ffvec, size_t num_feats) + check_ff(string const ffname, vector<sptr<pscorer> >* registry) { - float const lbop = m_lbop_parameter; // just for code readability below - // for the time being, we assume that all phrase probability features - // use the same confidence parameter for lower-bound-estimation - for (size_t i = 0; i < m_pfwd_features.size(); ++i) - { - UTIL_THROW_IF2(m_pfwd_features[i] != 'g' && - m_pfwd_features[i] != 'r' && - m_pfwd_features[i] != 's', - "Can't handle pfwd feature type '" - << m_pfwd_features[i] << "'."); - sptr<PScorePfwd<Token> > ff(new PScorePfwd<Token>()); - size_t k = 
num_feats; - num_feats = ff->init(num_feats,lbop,m_pfwd_features[i]); - for (;k < num_feats; ++k) m_feature_names.push_back(ff->fname(k)); - ffvec.push_back(ff); + string const& spec = param[ffname]; + if (spec == "" || spec == "0") return; + if (registry) + { + sptr<fftype> ff(new fftype(spec)); + register_ff(ff, *registry); } - - for (size_t i = 0; i < m_pbwd_features.size(); ++i) - { - UTIL_THROW_IF2(m_pbwd_features[i] != 'g' && - m_pbwd_features[i] != 'r' && - m_pbwd_features[i] != 's', - "Can't handle pbwd feature type '" - << m_pbwd_features[i] << "'."); - sptr<PScorePbwd<Token> > ff(new PScorePbwd<Token>()); - size_t k = num_feats; - num_feats = ff->init(num_feats,lbop,m_pbwd_features[i]); - for (;k < num_feats; ++k) m_feature_names.push_back(ff->fname(k)); - ffvec.push_back(ff); + else if (spec[spec.size()-1] == '+') // corpus specific + { + sptr<fftype> ff(new fftype(spec)); + register_ff(ff, m_active_ff_fix); + ff.reset(new fftype(spec)); + register_ff(ff, m_active_ff_dyn); } - - // if (withPbwd) - // { - // sptr<PScorePbwd<Token> > ff(new PScorePbwd<Token>()); - // size_t k = num_feats; - // num_feats = ff->init(num_feats,lbop); - // for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k)); - // ffvec.push_back(ff); - // } - - if (withLogCountFeatures) + else { - sptr<PScoreLogCounts<Token> > ff(new PScoreLogCounts<Token>()); - size_t k = num_feats; - num_feats = ff->init(num_feats); - for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k)); - ffvec.push_back(ff); + sptr<fftype> ff(new fftype(spec)); + register_ff(ff, m_active_ff_common); } + } - return num_feats; + template<typename fftype> + void + Mmsapt:: + check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry) + { + string const& spec = param[ffname]; + if (spec == "" || spec == "0") return; + if (registry) + { + sptr<fftype> ff(new fftype(xtra,spec)); + register_ff(ff, *registry); + } + else if (spec[spec.size()-1] == '+') // corpus specific + { + 
sptr<fftype> ff(new fftype(xtra,spec)); + register_ff(ff, m_active_ff_fix); + ff.reset(new fftype(xtra,spec)); + register_ff(ff, m_active_ff_dyn); + } + else + { + sptr<fftype> ff(new fftype(xtra,spec)); + register_ff(ff, m_active_ff_common); + } } + // void + // Mmsapt:: + // add_corpus_specific_features(vector<sptr<pscorer > >& registry) + // { + // check_ff<PScorePbwd<Token> >("pbwd",m_lbop_conf,registry); + // check_ff<PScoreLogCnt<Token> >("logcnt",registry); + // } + void Mmsapt:: Load() { + lock_guard<mutex> guard(this->lock); + + // can load only once + // UTIL_THROW_IF2(shards.size(),"Mmsapt is already loaded at " << HERE); + + // lexical scores + string lexfile = bname + L1 + "-" + L2 + ".lex"; + sptr<PScoreLex1<Token> > ff(new PScoreLex1<Token>(param["lex_alpha"],lexfile)); + register_ff(ff,m_active_ff_common); + + // these are always computed on pooled data + check_ff<PScoreRareness<Token> > ("rare", &m_active_ff_common); + check_ff<PScoreUnaligned<Token> >("unal", &m_active_ff_common); + check_ff<PScoreCoherence<Token> >("coh", &m_active_ff_common); + + // for these ones either way is possible (specification ends with '+' + // if corpus-specific + check_ff<PScorePfwd<Token> >("pfwd", m_lbop_conf); + check_ff<PScorePbwd<Token> >("pbwd", m_lbop_conf); + check_ff<PScoreLogCnt<Token> >("logcnt"); + + // These are always corpus-specific + check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_fix); + check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_dyn); + + UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents, + "At " << HERE << ": number of feature values provided by " + << "Phrase table (" << this->m_feature_names.size() + << ") does not match number specified in Moses config file (" + << this->m_numScoreComponents << ")!\n";); + + // Load corpora. 
For the time being, we can have one memory-mapped static + // corpus and one in-memory dynamic corpus + // sptr<mmbitext> btfix(new mmbitext()); btfix.num_workers = this->m_workers; btfix.open(bname, L1, L2); btfix.setDefaultSampleSize(m_default_sample_size); + // shards.push_back(btfix); - size_t num_feats = 0; - - // lexical scores are currently always active - sptr<PScoreLex<Token> > ff(new PScoreLex<Token>(m_lex_alpha)); - size_t k = num_feats; - num_feats = ff->init(num_feats, bname + L1 + "-" + L2 + ".lex"); - for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k)); - m_active_ff_common.push_back(ff); - - if (withCoherence) - { - sptr<PScoreCoherence<Token> > ff(new PScoreCoherence<Token>()); - size_t k = num_feats; - num_feats = ff->init(num_feats); - for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k)); - m_active_ff_common.push_back(ff); - } - - num_feats = add_corpus_specific_features(m_active_ff_fix,num_feats); - // cerr << num_feats << "/" << this->m_numScoreComponents - // << " at " << __FILE__ << ":" << __LINE__ << endl; - poolCounts = poolCounts && num_feats == this->m_numScoreComponents; - if (!poolCounts) - num_feats = add_corpus_specific_features(m_active_ff_dyn, num_feats); - -#if 0 - cerr << "MMSAPT provides " << num_feats << " features at " - << __FILE__ << ":" << __LINE__ << endl; - BOOST_FOREACH(string const& fname, m_feature_names) - cerr << fname << endl; -#endif - UTIL_THROW_IF2(num_feats != this->m_numScoreComponents, - "At " << __FILE__ << ":" << __LINE__ - << ": number of feature values provided by Phrase table (" - << num_feats << ") does not match number specified in " - << "Moses config file (" << this->m_numScoreComponents - << ")!\n";); - - - btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2,m_default_sample_size)); + btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size)); btdyn->num_workers = this->m_workers; if (extra_data.size()) - { - load_extra_data(extra_data); - } + 
load_extra_data(extra_data,false); #if 0 // currently not used @@ -330,258 +402,345 @@ namespace Moses TargetPhrase* Mmsapt:: - createTargetPhrase(Phrase const& src, - Bitext<Token> const& bt, - PhrasePair const& pp) const + mkTPhrase(Phrase const& src, + PhrasePair<Token>* fix, + PhrasePair<Token>* dyn, + sptr<Bitext<Token> > const& dynbt) const { - Word w; uint32_t sid,off,len; + UTIL_THROW_IF2(!fix && !dyn, HERE << + ": Can't create target phrase from nothing."); + vector<float> fvals(this->m_numScoreComponents); + PhrasePair<Token> pool = fix ? *fix : *dyn; + if (fix) + { + BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) + (*ff)(btfix, *fix, &fvals); + } + if (dyn) + { + BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn) + (*ff)(*dynbt, *dyn, &fvals); + } + + if (fix && dyn) { pool += *dyn; } + else if (fix) + { + PhrasePair<Token> zilch; zilch.init(); + TSA<Token>::tree_iterator m(dynbt->I2.get(), fix->start2, fix->len2); + if (m.size() == fix->len2) + zilch.raw2 = m.approxOccurrenceCount(); + pool += zilch; + BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn) + (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); + } + else if (dyn) + { + PhrasePair<Token> zilch; zilch.init(); + TSA<Token>::tree_iterator m(btfix.I2.get(), dyn->start2, dyn->len2); + if (m.size() == dyn->len2) + zilch.raw2 = m.approxOccurrenceCount(); + pool += zilch; + BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) + (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals); + } + if (fix) + { + BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) + (*ff)(btfix, pool, &fvals); + } + else + { + BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) + (*ff)(*dynbt, pool, &fvals); + } TargetPhrase* tp = new TargetPhrase(); - parse_pid(pp.p2, sid, off, len); - Token const* x = bt.T2->sntStart(sid) + off; - for (uint32_t k = 0; k < len; ++k) + Token const* x = fix ? fix->start2 : dyn->start2; + uint32_t len = fix ? 
fix->len2 : dyn->len2; + for (uint32_t k = 0; k < len; ++k, x = x->next()) { - // cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl; - StringPiece wrd = (*bt.V2)[x[k].id()]; - // if ((off+len) > bt.T2->sntLen(sid)) - // cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl; - assert(off+len <= bt.T2->sntLen(sid)); - w.CreateFromString(Output,ofactor,wrd,false); + StringPiece wrd = (*(btfix.V2))[x->id()]; + Word w; w.CreateFromString(Output,ofactor,wrd,false); tp->AddWord(w); } - tp->GetScoreBreakdown().Assign(this, pp.fvals); + tp->GetScoreBreakdown().Assign(this, fvals); tp->Evaluate(src); return tp; } - // process phrase stats from a single parallel corpus - void - Mmsapt:: - process_pstats - (Phrase const& src, - uint64_t const pid1, - pstats const& stats, - Bitext<Token> const & bt, - TargetPhraseCollection* tpcoll - ) const - { - PhrasePair pp; - pp.init(pid1, stats, this->m_numScoreComponents); - pstats::trg_map_t::const_iterator t; - for (t = stats.trg.begin(); t != stats.trg.end(); ++t) - { - pp.update(t->first,t->second); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) - (*ff)(bt,pp); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) - (*ff)(bt,pp); - tpcoll->Add(createTargetPhrase(src,bt,pp)); - } - } + // TargetPhrase* + // Mmsapt:: + // mkTPhrase(Phrase const& src, + // Bitext<Token> const& bt, + // PhrasePair const& pp) const + // { + // Word w; uint32_t sid,off,len; + // TargetPhrase* tp = new TargetPhrase(); + // parse_pid(pp.p2, sid, off, len); + // Token const* x = bt.T2->sntStart(sid) + off; + // for (uint32_t k = 0; k < len; ++k) + // { + // // cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl; + // StringPiece wrd = (*bt.V2)[x[k].id()]; + // // if ((off+len) > bt.T2->sntLen(sid)) + // // cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl; + // assert(off+len <= bt.T2->sntLen(sid)); + // w.CreateFromString(Output,ofactor,wrd,false); + // 
tp->AddWord(w); + // } + // tp->GetScoreBreakdown().Assign(this, pp.fvals); + // tp->Evaluate(src); + // return tp; + // } + + // // process phrase stats from a single parallel corpus + // void + // Mmsapt:: + // process_pstats + // (Phrase const& src, + // uint64_t const pid1, + // pstats const& stats, + // Bitext<Token> const & bt, + // TargetPhraseCollection* tpcoll + // ) const + // { + // PhrasePair pp; + // pp.init(pid1, stats, this->m_numScoreComponents); + // pstats::trg_map_t::const_iterator t; + // for (t = stats.trg.begin(); t != stats.trg.end(); ++t) + // { + // pp.update(t->first,t->second); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) + // (*ff)(bt,pp); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) + // (*ff)(bt,pp); + // tpcoll->Add(mkTPhrase(src,bt,pp)); + // } + // } + + // void + // Mmsapt:: + // ScorePPfix(PhrasePair& pp) const + // { + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) + // (*ff)(btfix,pp); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) + // (*ff)(btfix,pp); + // } + +// // process phrase stats from a single parallel corpus +// bool +// Mmsapt:: +// pool_pstats(Phrase const& src, +// uint64_t const pid1a, +// pstats * statsa, +// Bitext<Token> const & bta, +// uint64_t const pid1b, +// pstats const* statsb, +// Bitext<Token> const & btb, +// TargetPhraseCollection* tpcoll) const +// { +// PhrasePair pp; +// if (statsa && statsb) +// pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents); +// else if (statsa) +// pp.init(pid1a, *statsa, this->m_numScoreComponents); +// else if (statsb) +// pp.init(pid1b, *statsb, this->m_numScoreComponents); +// else return false; // throw "no stats for pooling available!"; + +// pstats::trg_map_t::const_iterator b; +// pstats::trg_map_t::iterator a; +// if (statsb) +// { +// for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b) +// { +// uint32_t sid,off,len; +// parse_pid(b->first, sid, off, len); +// Token const* x = 
btb.T2->sntStart(sid) + off; +// TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len); +// if (m.size() == len) +// { +// ; +// if (statsa && ((a = statsa->trg.find(m.getPid())) +// != statsa->trg.end())) +// { +// pp.update(b->first,a->second,b->second); +// a->second.invalidate(); +// } +// else +// pp.update(b->first,m.approxOccurrenceCount(), +// b->second); +// } +// else pp.update(b->first,b->second); +// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) +// (*ff)(btb,pp); +// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) +// (*ff)(btb,pp); +// tpcoll->Add(mkTPhrase(src,btb,pp)); +// } +// } +// if (!statsa) return statsb != NULL; +// for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) +// { +// uint32_t sid,off,len; +// if (!a->second.valid()) continue; +// parse_pid(a->first, sid, off, len); +// if (btb.T2) +// { +// Token const* x = bta.T2->sntStart(sid) + off; +// TSA<Token>::tree_iterator m(btb.I2.get(), x, len); +// if (m.size() == len) +// pp.update(a->first,m.approxOccurrenceCount(),a->second); +// else +// pp.update(a->first,a->second); +// } +// else pp.update(a->first,a->second); +// #if 0 +// // jstats const& j = a->second; +// cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " +// << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl; +// cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " +// << pp.joint << " " << pp.raw2 << endl; +// #endif + +// UTIL_THROW_IF2(pp.raw2 == 0, +// "OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " +// << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": " +// << pp.raw1 << " " << pp.sample1 << " " +// << pp.good1 << " " << pp.joint << " " +// << pp.raw2); +// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) +// (*ff)(bta,pp); +// BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) +// (*ff)(bta,pp); +// tpcoll->Add(mkTPhrase(src,bta,pp)); +// } +// return true; +// } - void - Mmsapt:: - ScorePPfix(bitext::PhrasePair& pp) const - { - BOOST_FOREACH(sptr<pscorer> const& ff, 
m_active_ff_fix) - (*ff)(btfix,pp); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) - (*ff)(btfix,pp); - } - // process phrase stats from a single parallel corpus - bool - Mmsapt:: - pool_pstats(Phrase const& src, - uint64_t const pid1a, - pstats * statsa, - Bitext<Token> const & bta, - uint64_t const pid1b, - pstats const* statsb, - Bitext<Token> const & btb, - TargetPhraseCollection* tpcoll) const - { - PhrasePair pp; - if (statsa && statsb) - pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents); - else if (statsa) - pp.init(pid1a, *statsa, this->m_numScoreComponents); - else if (statsb) - pp.init(pid1b, *statsb, this->m_numScoreComponents); - else return false; // throw "no stats for pooling available!"; - - pstats::trg_map_t::const_iterator b; - pstats::trg_map_t::iterator a; - if (statsb) - { - for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b) - { - uint32_t sid,off,len; - parse_pid(b->first, sid, off, len); - Token const* x = bta.T2->sntStart(sid) + off; - TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len); - if (m.size() == len) - { - ; - if (statsa && ((a = statsa->trg.find(m.getPid())) - != statsa->trg.end())) - { - pp.update(b->first,a->second,b->second); - a->second.invalidate(); - } - else - pp.update(b->first,m.approxOccurrenceCount(), - b->second); - } - else pp.update(b->first,b->second); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) - (*ff)(btb,pp); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) - (*ff)(btb,pp); - tpcoll->Add(createTargetPhrase(src,btb,pp)); - } - } - if (!statsa) return statsb != NULL; - for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) - { - uint32_t sid,off,len; - if (!a->second.valid()) continue; - parse_pid(a->first, sid, off, len); - if (btb.T2) - { - Token const* x = bta.T2->sntStart(sid) + off; - TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len); - if (m.size() == len) - pp.update(a->first,m.approxOccurrenceCount(),a->second); - else - 
pp.update(a->first,a->second); - } - else - pp.update(a->first,a->second); -#if 0 - // jstats const& j = a->second; - cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " - << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl; - cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " - << pp.joint << " " << pp.raw2 << endl; -#endif - UTIL_THROW_IF2(pp.raw2 == 0, - "OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " - << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": " - << pp.raw1 << " " << pp.sample1 << " " - << pp.good1 << " " << pp.joint << " " - << pp.raw2); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) - (*ff)(bta,pp); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) - (*ff)(bta,pp); - tpcoll->Add(createTargetPhrase(src,bta,pp)); - } - return true; - } - - // process phrase stats from a single parallel corpus - bool - Mmsapt:: - combine_pstats - (Phrase const& src, - uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta, - uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb, - TargetPhraseCollection* tpcoll) const - { - PhrasePair ppfix,ppdyn,pool; - // ppfix: counts from btfix - // ppdyn: counts from btdyn - // pool: pooled counts from both - Word w; - if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents); - if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents); - pstats::trg_map_t::const_iterator b; - pstats::trg_map_t::iterator a; - - if (statsb) - { - pool.init(pid1b,*statsb,0); - for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b) - { - ppdyn.update(b->first,b->second); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn) - (*ff)(btb,ppdyn); + // // process phrase stats from a single parallel corpus + // bool + // Mmsapt:: + // combine_pstats + // (Phrase const& src, + // uint64_t const pid1a, pstats * statsa, Bitext<Token> const & bta, + // uint64_t const pid1b, pstats const* statsb, Bitext<Token> const & btb, + // TargetPhraseCollection* tpcoll) const + // { + // 
if (!statsa && !statsb) return false; + + // PhrasePair ppfix,ppdyn,pool; Word w; + // // ppfix: counts from btfix + // // ppdyn: counts from btdyn + // // pool: pooled counts from both + + // pstats::trg_map_t::const_iterator b; + // pstats::trg_map_t::iterator a; + + + // set<uint64_t> check; + // if (statsb) + // { + // ppdyn.init(pid1b,*statsb,this->m_numScoreComponents); + // if (statsa) + // { + // pool.init(pid1b, *statsa, *statsb, 0); + // ppfix.init(pid1a,*statsa, 0); + // } + // else + // { + // pool.init(pid1b, *statsb,0); + // ppfix.init(); + // } + + // for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b) + // { + // ppdyn.update(b->first,b->second); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn) + // (*ff)(btb,ppdyn); - uint32_t sid,off,len; - parse_pid(b->first, sid, off, len); - Token const* x = bta.T2->sntStart(sid) + off; - TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len); + // uint32_t sid,off,len; + // parse_pid(b->first, sid, off, len); + // Token const* x = btb.T2->sntStart(sid) + off; + // TSA<Token>::tree_iterator m(bta.I2.get(),x,len); - if (m.size() && statsa && - ((a = statsa->trg.find(m.getPid())) != statsa->trg.end())) - { - // phrase pair found also in btfix - ppfix.update(a->first,a->second); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) - (*ff)(bta,ppfix,&ppdyn.fvals); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) - (*ff)(bta,ppfix,&ppdyn.fvals); - a->second.invalidate(); - } - else - { - // phrase pair was not found in btfix - - // ... but the source phrase was - if (m.size()) - pool.update(b->first,m.approxOccurrenceCount(), b->second); - - // ... 
and not even the source phrase - else - pool.update(b->first,b->second); - - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) - (*ff)(btb,pool,&ppdyn.fvals); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) - (*ff)(btb,pool,&ppdyn.fvals); - - } - - tpcoll->Add(createTargetPhrase(src,btb,ppdyn)); - } - } - - // now deal with all phraise pairs that are ONLY in btfix - // (the ones that are in both were dealt with above) - if (statsa) - { - pool.init(pid1a,*statsa,0); - for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) - { - if (!a->second.valid()) continue; // done above - ppfix.update(a->first,a->second); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) - (*ff)(bta,ppfix); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) - (*ff)(bta,ppfix); + // Token const* y = m.getToken(0); + // for (size_t i = 0; i < len; ++i) + // cout << x[i].id() << " " << endl; + // for (size_t i = 0; i < m.size(); ++i) + // cout << y[i].id() << " " << endl; - if (btb.I2) - { - uint32_t sid,off,len; - parse_pid(a->first, sid, off, len); - Token const* x = bta.T2->sntStart(sid) + off; - TSA<Token>::tree_iterator m(btb.I2.get(),x,x+len); - if (m.size()) - pool.update(a->first,m.approxOccurrenceCount(),a->second); - else - pool.update(a->first,a->second); - } - else pool.update(a->first,a->second); - BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn) - (*ff)(btb,pool,&ppfix.fvals); - if (ppfix.p2) - tpcoll->Add(createTargetPhrase(src,bta,ppfix)); - } - } - return (statsa || statsb); - } + // if (statsa && m.size() && + // ((a = statsa->trg.find(m.getPid())) != statsa->trg.end())) + // { // i.e., phrase pair found also in btfix + // ppfix.update(a->first,a->second); + // pool.update(b->first, b->second, a->second); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) + // (*ff)(bta, ppfix, &ppdyn.fvals); + // check.insert(a->first); + // } + // else // phrase pair was not found in btfix + // { + // if (m.size()) // ... 
but the source phrase was + // { + // pool.update(b->first, m.approxOccurrenceCount(), b->second); + // ppfix.update(b->first,m.approxOccurrenceCount()); + // } + // else // ... and not even the source phrase + // { + // pool.update(b->first, b->second); + // ppfix.update(b->first,0); + // } + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) + // (*ff)(btb, ff->allowPooling() ? pool : ppfix, &ppdyn.fvals); + // } + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) + // (*ff)(btb, pool, &ppdyn.fvals); + // tpcoll->Add(mkTPhrase(src,btb,ppdyn)); + // } + // } + + // // now deal with all phraise pairs that are ONLY in btfix + // // (the ones that are in both were dealt with above) + // if (statsa) + // { + // ppfix.init(pid1a, *statsa, this->m_numScoreComponents); + // pool.init(pid1a, *statsa, 0); + // ppdyn.init(); + // for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) + // { + // if (check.find(a->first) != check.end()) + // continue; + + // ppfix.update(a->first, a->second); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix) + // (*ff)(bta, ppfix); + + // if (btb.I2) + // { + // uint32_t sid,off,len; + // parse_pid(a->first, sid, off, len); + // Token const* x = bta.T2->sntStart(sid) + off; + // TSA<Token>::tree_iterator m(btb.I2.get(), x, len); + // if (m.size()) + // pool.update(a->first, m.approxOccurrenceCount(), a->second); + // else + // pool.update(a->first, a->second); + // } + // else pool.update(a->first, a->second); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn) + // (*ff)(btb, ff->allowPooling() ? 
pool : ppdyn, &ppfix.fvals); + // BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common) + // (*ff)(bta, pool, &ppfix.fvals); + // if (ppfix.p2) + // tpcoll->Add(mkTPhrase(src, bta, ppfix)); + // } + // } + // return true; + // } Mmsapt:: TargetPhraseCollectionWrapper:: @@ -595,8 +754,34 @@ namespace Moses { assert(this->refCount == 0); } - + template<typename Token> + void + expand(typename Bitext<Token>::iter const& m, + Bitext<Token> const& bt, + pstats const& ps, vector<PhrasePair<Token> >& dest) + { + dest.reserve(ps.trg.size()); + PhrasePair<Token> pp; + pp.init(m.getToken(0), m.size(), &ps, 0); + // cout << HERE << " " << toString(*(bt.V1),pp.start1,pp.len1) << endl; + pstats::trg_map_t::const_iterator a; + for (a = ps.trg.begin(); a != ps.trg.end(); ++a) + { + uint32_t sid,off,len; + parse_pid(a->first, sid, off, len); + pp.update(bt.T2->sntStart(sid)+off, len, a->second); + dest.push_back(pp); + } + typename PhrasePair<Token>::SortByTargetIdSeq sorter; + sort(dest.begin(), dest.end(),sorter); +#if 0 + BOOST_FOREACH(PhrasePair<Token> const& p, dest) + cout << toString (*bt.V1,p.start1,p.len1) << " ::: " + << toString (*bt.V2,p.start2,p.len2) << " " + << p.joint << endl; +#endif + } // This is not the most efficient way of phrase lookup! TargetPhraseCollection const* @@ -605,13 +790,9 @@ namespace Moses { // map from Moses Phrase to internal id sequence vector<id_type> sphrase; - fillIdSeq(src,input_factor,*btfix.V1,sphrase); + fillIdSeq(src,input_factor,*(btfix.V1),sphrase); if (sphrase.size() == 0) return NULL; - // lookup in static bitext - TSA<Token>::tree_iterator mfix(btfix.I1.get(),&sphrase[0],sphrase.size()); - - // lookup in dynamic bitext // Reserve a local copy of the dynamic bitext in its current form. /btdyn/ // is set to a new copy of the dynamic bitext every time a sentence pair // is added. /dyn/ keeps the old bitext around as long as we need it. 
@@ -621,12 +802,13 @@ namespace Moses dyn = btdyn; } assert(dyn); + + // lookup phrases in both bitexts + TSA<Token>::tree_iterator mfix(btfix.I1.get(), &sphrase[0], sphrase.size()); TSA<Token>::tree_iterator mdyn(dyn->I1.get()); if (dyn->I1.get()) - { - for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i) - mdyn.extend(sphrase[i]); - } + for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i) + mdyn.extend(sphrase[i]); #if 0 cerr << src << endl; @@ -634,43 +816,62 @@ namespace Moses << mdyn.size() << " " << mdyn.getPid() << endl; #endif - // phrase not found in either - if (mdyn.size() != sphrase.size() && - mfix.size() != sphrase.size()) - return NULL; // not found + if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size()) + return NULL; // phrase not found in either bitext // cache lookup: - - uint64_t phrasekey; - if (mfix.size() == sphrase.size()) - phrasekey = (mfix.getPid()<<1); - else - phrasekey = (mdyn.getPid()<<1)+1; - + uint64_t phrasekey = (mfix.size() == sphrase.size() ? (mfix.getPid()<<1) + : (mdyn.getPid()<<1)+1); size_t revision = dyn->revision(); { boost::lock_guard<boost::mutex> guard(this->lock); tpc_cache_t::iterator c = m_cache.find(phrasekey); + // TO DO: we should revise the revision mechanism: we take the length + // of the dynamic bitext (in sentences) at the time the PT entry + // was stored as the time stamp. For each word in the + // vocabulary, we also store its most recent occurrence in the + // bitext. Only if the timestamp of each word in the phrase is + // newer than the timestamp of the phrase itself we must update + // the entry. 
if (c != m_cache.end() && c->second->revision == revision) return encache(c->second); } - // not found or not up to date + // OK: pt entry not found or not up to date + // lookup and expansion could be done in parallel threds, + // but ppdyn is probably small anyway + // TO DO: have Bitexts return lists of PhrasePairs instead of pstats + // no need to expand pstats at every single lookup again, especially + // for btfix. sptr<pstats> sfix,sdyn; - if (mfix.size() == sphrase.size()) - sfix = btfix.lookup(mfix); - if (mdyn.size() == sphrase.size()) - sdyn = dyn->lookup(mdyn); + if (mfix.size() == sphrase.size()) sfix = btfix.lookup(mfix); + if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn); + + vector<PhrasePair<Token> > ppfix,ppdyn; + if (sfix) expand(mfix, btfix, *sfix, ppfix); + if (sdyn) expand(mdyn, *dyn, *sdyn, ppdyn); - TargetPhraseCollectionWrapper* - ret = new TargetPhraseCollectionWrapper(revision,phrasekey); - if ((poolCounts && - pool_pstats(src, mfix.getPid(),sfix.get(),btfix, - mdyn.getPid(),sdyn.get(),*dyn,ret)) - || combine_pstats(src, mfix.getPid(),sfix.get(),btfix, - mdyn.getPid(),sdyn.get(),*dyn,ret)) + // now we have two lists of Phrase Pairs, let's merge them + TargetPhraseCollectionWrapper* ret; + ret = new TargetPhraseCollectionWrapper(revision,phrasekey); + PhrasePair<Token>::SortByTargetIdSeq sorter; + size_t i = 0; size_t k = 0; + while (i < ppfix.size() && k < ppdyn.size()) + { + int cmp = sorter.cmp(ppfix[i], ppdyn[k]); + if (cmp < 0) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); + else if (cmp == 0) ret->Add(mkTPhrase(src,&ppfix[i++],&ppdyn[k++],dyn)); + else ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); + } + while (i < ppfix.size()) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn)); + while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn)); + if (m_tableLimit) ret->Prune(true, m_tableLimit); + else ret->Prune(true,ret->GetSize()); +#if 0 + if (combine_pstats(src, + mfix.getPid(), sfix.get(), btfix, + mdyn.getPid(), 
sdyn.get(), *dyn, ret)) { - if (m_tableLimit) ret->Prune(true,m_tableLimit); #if 0 sort(ret->begin(), ret->end(), CompareTargetPhrase()); cout << "SOURCE PHRASE: " << src << endl; @@ -686,6 +887,9 @@ namespace Moses } #endif } +#endif + + // put the result in the cache and return boost::lock_guard<boost::mutex> guard(this->lock); m_cache[phrasekey] = ret; return encache(ret); @@ -839,6 +1043,7 @@ namespace Moses TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size()); if (mfix.size() == myphrase.size()) { + btfix.prep(mfix); // cerr << phrase << " " << mfix.approxOccurrenceCount() << endl; return true; } @@ -854,6 +1059,7 @@ namespace Moses { for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i) mdyn.extend(myphrase[i]); + if (mdyn.size() == myphrase.size()) dyn->prep(mdyn); } return mdyn.size() == myphrase.size(); } diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index b6be36131..a7ece8fdb 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -19,6 +19,7 @@ #include "moses/TranslationModel/UG/mm/ug_typedefs.h" #include "moses/TranslationModel/UG/mm/tpt_pickler.h" #include "moses/TranslationModel/UG/mm/ug_bitext.h" +#include "moses/TranslationModel/UG/mm/ug_phrasepair.h" #include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h" #include "moses/InputFileStream.h" @@ -29,7 +30,8 @@ #include <map> #include "moses/TranslationModel/PhraseDictionary.h" -#include "mmsapt_phrase_scorers.h" +#include "mmsapt_phrase_scorers.h" // deprecated +#include "sapt_phrase_scorers.h" // TO DO: // - make lexical phrase scorer take addition to the "dynamic overlay" into account @@ -47,47 +49,68 @@ namespace Moses #endif { friend class Alignment; + map<string,string> param; public: typedef L2R_Token<SimpleWordId> Token; typedef mmBitext<Token> mmbitext; typedef imBitext<Token> imbitext; + typedef Bitext<Token> bitext; typedef TSA<Token> tsa; typedef PhraseScorer<Token> 
pscorer; + private: + // vector<sptr<bitext> > shards; mmbitext btfix; - sptr<imbitext> btdyn; + sptr<imbitext> btdyn; string bname,extra_data; string L1; string L2; - float m_lbop_parameter; - float m_lex_alpha; + float m_lbop_conf; // confidence level for lbop smoothing + float m_lex_alpha; // alpha paramter (j+a)/(m+a) for lexical smoothing // alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha) // must be > 0 if dynamic size_t m_default_sample_size; size_t m_workers; // number of worker threads for sampling the bitexts - // deprecated! - char m_pfwd_denom; // denominator for computation of fwd phrase score: - // 'r' - divide by raw count - // 's' - divide by sample count - // 'g' - devide by number of "good" (i.e. coherent) samples - // size_t num_features; + // // deprecated! + // char m_pfwd_denom; // denominator for computation of fwd phrase score: + // // 'r' - divide by raw count + // // 's' - divide by sample count + // // 'g' - devide by number of "good" (i.e. coherent) samples + // // size_t num_features; size_t input_factor; size_t output_factor; // we can actually return entire Tokens! - bool withLogCountFeatures; // add logs of counts as features? - bool withCoherence; - string m_pfwd_features; // which pfwd functions to use - string m_pbwd_features; // which pbwd functions to use + // bool withLogCountFeatures; // add logs of counts as features? 
+ // bool withCoherence; + // string m_pfwd_features; // which pfwd functions to use + // string m_pbwd_features; // which pbwd functions to use + + // for display for human inspection (ttable dumps): vector<string> m_feature_names; // names of features activated + vector<bool> m_is_logval; // keeps track of which features are log valued + vector<bool> m_is_integer; // keeps track of which features are integer valued + vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix) vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn) vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn) - size_t - add_corpus_specific_features - (vector<sptr<pscorer > >& ffvec, size_t num_feats); + void + register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry); + + template<typename fftype> + void + check_ff(string const ffname,vector<sptr<pscorer> >* registry = NULL); + // add feature function if specified + + template<typename fftype> + void + check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry = NULL); + // add feature function if specified + + void + add_corpus_specific_features(vector<sptr<pscorer > >& ffvec); // built-in feature functions // PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn; @@ -140,12 +163,24 @@ namespace Moses mm2dtable_t COOCraw; TargetPhrase* - createTargetPhrase + mkTPhrase(Phrase const& src, + Moses::bitext::PhrasePair<Token>* fix, + Moses::bitext::PhrasePair<Token>* dyn, + sptr<Bitext<Token> > const& dynbt) const; + + // template<typename Token> + // void + // expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt, + // pstats const& pstats, vector<PhrasePair<Token> >& dest); + +#if 0 + TargetPhrase* + mkTPhrase (Phrase const& src, Bitext<Token> const& bt, - bitext::PhrasePair const& pp + Moses::bitext::PhrasePair const& pp ) const; - +#endif void process_pstats (Phrase const& src, @@ -180,7 +215,7 @@ namespace Moses ) const; void - 
load_extra_data(string bname); + load_extra_data(string bname, bool locking); mutable size_t m_tpc_ctr; public: @@ -231,8 +266,14 @@ namespace Moses vector<string> const& GetFeatureNames() const; - void - ScorePPfix(bitext::PhrasePair& pp) const; + // void + // ScorePPfix(bitext::PhrasePair& pp) const; + + bool + isLogVal(int i) const; + + bool + isInteger(int i) const; private: }; diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc index 407df648d..8b6bf1eb2 100644 --- a/moses/TranslationModel/UG/mmsapt_align.cc +++ b/moses/TranslationModel/UG/mmsapt_align.cc @@ -1,335 +1,336 @@ #include "mmsapt.h" +// currently broken -namespace Moses -{ - using namespace bitext; - using namespace std; - using namespace boost; +// namespace Moses +// { +// using namespace bitext; +// using namespace std; +// using namespace boost; - struct PPgreater - { - bool operator()(PhrasePair const& a, PhrasePair const& b) - { - return a.score > b.score; - } - }; +// struct PPgreater +// { +// bool operator()(PhrasePair const& a, PhrasePair const& b) +// { +// return a.score > b.score; +// } +// }; - void - Mmsapt:: - setWeights(vector<float> const & w) - { - assert(w.size() == this->m_numScoreComponents); - this->feature_weights = w; - } +// void +// Mmsapt:: +// setWeights(vector<float> const & w) +// { +// assert(w.size() == this->m_numScoreComponents); +// this->feature_weights = w; +// } - struct PhraseAlnHyp - { - PhrasePair pp; - ushort s1,e1,s2,e2; // start and end positions - int prev; // preceding alignment hypothesis - float score; - bitvector scov; // source coverage - PhraseAlnHyp(PhrasePair const& ppx, int slen, - pair<uint32_t,uint32_t> const& sspan, - pair<uint32_t,uint32_t> const& tspan) - : pp(ppx), prev(-1), score(ppx.score), scov(slen) - { - s1 = sspan.first; e1 = sspan.second; - s2 = tspan.first; e2 = tspan.second; - for (size_t i = s1; i < e1; ++i) - scov.set(i); - } +// struct PhraseAlnHyp +// { +// PhrasePair pp; +// 
ushort s1,e1,s2,e2; // start and end positions +// int prev; // preceding alignment hypothesis +// float score; +// bitvector scov; // source coverage +// PhraseAlnHyp(PhrasePair const& ppx, int slen, +// pair<uint32_t,uint32_t> const& sspan, +// pair<uint32_t,uint32_t> const& tspan) +// : pp(ppx), prev(-1), score(ppx.score), scov(slen) +// { +// s1 = sspan.first; e1 = sspan.second; +// s2 = tspan.first; e2 = tspan.second; +// for (size_t i = s1; i < e1; ++i) +// scov.set(i); +// } - bool operator<(PhraseAlnHyp const& other) const - { - return this->score < other.score; - } +// bool operator<(PhraseAlnHyp const& other) const +// { +// return this->score < other.score; +// } - bool operator>(PhraseAlnHyp const& other) const - { - return this->score > other.score; - } +// bool operator>(PhraseAlnHyp const& other) const +// { +// return this->score > other.score; +// } - PhraseOrientation - po_bwd(PhraseAlnHyp const* prev) const - { - if (s2 == 0) return po_first; - assert(prev); - assert(prev->e2 <= s2); - if (prev->e2 < s2) return po_other; - if (prev->e1 == s1) return po_mono; - if (prev->e1 < s1) return po_jfwd; - if (prev->s1 == e1) return po_swap; - if (prev->s1 > e1) return po_jbwd; - return po_other; - } +// PhraseOrientation +// po_bwd(PhraseAlnHyp const* prev) const +// { +// if (s2 == 0) return po_first; +// assert(prev); +// assert(prev->e2 <= s2); +// if (prev->e2 < s2) return po_other; +// if (prev->e1 == s1) return po_mono; +// if (prev->e1 < s1) return po_jfwd; +// if (prev->s1 == e1) return po_swap; +// if (prev->s1 > e1) return po_jbwd; +// return po_other; +// } - PhraseOrientation - po_fwd(PhraseAlnHyp const* next) const - { - if (!next) return po_last; - assert(next->s2 >= e2); - if (next->s2 < e2) return po_other; - if (next->e1 == s1) return po_swap; - if (next->e1 < s1) return po_jbwd; - if (next->s1 == e1) return po_mono; - if (next->s1 > e1) return po_jfwd; - return po_other; - } +// PhraseOrientation +// po_fwd(PhraseAlnHyp const* next) 
const +// { +// if (!next) return po_last; +// assert(next->s2 >= e2); +// if (next->s2 < e2) return po_other; +// if (next->e1 == s1) return po_swap; +// if (next->e1 < s1) return po_jbwd; +// if (next->s1 == e1) return po_mono; +// if (next->s1 > e1) return po_jfwd; +// return po_other; +// } - float - dprob_fwd(PhraseAlnHyp const& next) - { - return pp.dfwd[po_fwd(&next)]; - } +// float +// dprob_fwd(PhraseAlnHyp const& next) +// { +// return pp.dfwd[po_fwd(&next)]; +// } - float - dprob_bwd(PhraseAlnHyp const& prev) - { - return pp.dbwd[po_bwd(&prev)]; - } +// float +// dprob_bwd(PhraseAlnHyp const& prev) +// { +// return pp.dbwd[po_bwd(&prev)]; +// } - }; +// }; - class Alignment - { - typedef L2R_Token<SimpleWordId> Token; - typedef TSA<Token> tsa; - typedef pair<uint32_t, uint32_t> span; - typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID - typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t; - typedef pstats::trg_map_t jStatsTable; +// class Alignment +// { +// typedef L2R_Token<SimpleWordId> Token; +// typedef TSA<Token> tsa; +// typedef pair<uint32_t, uint32_t> span; +// typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID +// typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t; +// typedef pstats::trg_map_t jStatsTable; - Mmsapt const& PT; - vector<id_type> s,t; - pidmap_t sspan2pid, tspan2pid; // span -> phrase ID - pid2span_t spid2span,tpid2span; - vector<vector<sptr<pstats> > > spstats; +// Mmsapt const& PT; +// vector<id_type> s,t; +// pidmap_t sspan2pid, tspan2pid; // span -> phrase ID +// pid2span_t spid2span,tpid2span; +// vector<vector<sptr<pstats> > > spstats; - vector<PhrasePair> PP; - // position-independent phrase pair info - public: - vector<PhraseAlnHyp> PAH; - vector<vector<int> > tpos2ahyp; - // maps from target start positions to PhraseAlnHyps starting at - // that position +// vector<PhrasePair> PP; +// // position-independent phrase pair info +// public: +// vector<PhraseAlnHyp> PAH; 
+// vector<vector<int> > tpos2ahyp; +// // maps from target start positions to PhraseAlnHyps starting at +// // that position - sptr<pstats> getPstats(span const& sspan); - void fill_tspan_maps(); - void fill_sspan_maps(); - public: - Alignment(Mmsapt const& pt, string const& src, string const& trg); - void show(ostream& out); - void show(ostream& out, PhraseAlnHyp const& ah); - }; +// sptr<pstats> getPstats(span const& sspan); +// void fill_tspan_maps(); +// void fill_sspan_maps(); +// public: +// Alignment(Mmsapt const& pt, string const& src, string const& trg); +// void show(ostream& out); +// void show(ostream& out, PhraseAlnHyp const& ah); +// }; - void - Alignment:: - show(ostream& out, PhraseAlnHyp const& ah) - { -#if 0 - LexicalPhraseScorer2<Token>::table_t const& - COOCjnt = PT.calc_lex.scorer.COOC; +// void +// Alignment:: +// show(ostream& out, PhraseAlnHyp const& ah) +// { +// #if 0 +// LexicalPhraseScorer2<Token>::table_t const& +// COOCjnt = PT.calc_lex.scorer.COOC; - out << setw(10) << exp(ah.score) << " " - << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2) - << " <=> " - << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1); - vector<uchar> const& a = ah.pp.aln; - // BOOST_FOREACH(int x,a) cout << "[" << x << "] "; - for (size_t u = 0; u+1 < a.size(); u += 2) - out << " " << int(a[u+1]) << "-" << int(a[u]); +// out << setw(10) << exp(ah.score) << " " +// << PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2) +// << " <=> " +// << PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1); +// vector<uchar> const& a = ah.pp.aln; +// // BOOST_FOREACH(int x,a) cout << "[" << x << "] "; +// for (size_t u = 0; u+1 < a.size(); u += 2) +// out << " " << int(a[u+1]) << "-" << int(a[u]); - if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1) - out << " " << COOCjnt[s[ah.s1]][t[ah.s2]] - << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]] - << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]]; - out << endl; - // float const* ofwdj = ah.pp.dfwd; - // float const* 
obwdj = ah.pp.dbwd; - // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd; - // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd; - // out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first] - // << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last] - // << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono] - // << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd] - // << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap] - // << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd] - // << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other] - // << "]" << endl - // << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first] - // << " last: " << obwdj[po_last]<<"/"<<obwdm[po_last] - // << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono] - // << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd] - // << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap] - // << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd] - // << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other] - // << "]" << endl; -#endif - } +// if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1) +// out << " " << COOCjnt[s[ah.s1]][t[ah.s2]] +// << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]] +// << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]]; +// out << endl; +// // float const* ofwdj = ah.pp.dfwd; +// // float const* obwdj = ah.pp.dbwd; +// // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd; +// // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd; +// // out << " [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first] +// // << " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last] +// // << " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono] +// // << " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd] +// // << " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap] +// // << " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd] +// // << " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other] +// // << "]" << endl +// // << " [first: " << obwdj[po_first]<<"/"<<obwdm[po_first] +// // << " last: 
" << obwdj[po_last]<<"/"<<obwdm[po_last] +// // << " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono] +// // << " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd] +// // << " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap] +// // << " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd] +// // << " other: " << obwdj[po_other]<<"/"<<obwdm[po_other] +// // << "]" << endl; +// #endif +// } - void - Alignment:: - show(ostream& out) - { - // show what we have so far ... - for (size_t s2 = 0; s2 < t.size(); ++s2) - { - VectorIndexSorter<PhraseAlnHyp> foo(PAH); - sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo); - for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h) - show(out,PAH[tpos2ahyp[s2][h]]); - } - } +// void +// Alignment:: +// show(ostream& out) +// { +// // show what we have so far ... +// for (size_t s2 = 0; s2 < t.size(); ++s2) +// { +// VectorIndexSorter<PhraseAlnHyp> foo(PAH); +// sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo); +// for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h) +// show(out,PAH[tpos2ahyp[s2][h]]); +// } +// } - sptr<pstats> - Alignment:: - getPstats(span const& sspan) - { - size_t k = sspan.second - sspan.first - 1; - if (k < spstats[sspan.first].size()) - return spstats[sspan.first][k]; - else return sptr<pstats>(); - } +// sptr<pstats> +// Alignment:: +// getPstats(span const& sspan) +// { +// size_t k = sspan.second - sspan.first - 1; +// if (k < spstats[sspan.first].size()) +// return spstats[sspan.first][k]; +// else return sptr<pstats>(); +// } - void - Alignment:: - fill_tspan_maps() - { - tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0)); - for (size_t i = 0; i < t.size(); ++i) - { - tsa::tree_iterator m(PT.btfix.I2.get()); - for (size_t k = i; k < t.size() && m.extend(t[k]); ++k) - { - uint64_t pid = m.getPid(); - tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1)); - tspan2pid[i][k] = pid; - } - } - } +// void +// Alignment:: +// fill_tspan_maps() +// { +// tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0)); +// 
for (size_t i = 0; i < t.size(); ++i) +// { +// tsa::tree_iterator m(PT.btfix.I2.get()); +// for (size_t k = i; k < t.size() && m.extend(t[k]); ++k) +// { +// uint64_t pid = m.getPid(); +// tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1)); +// tspan2pid[i][k] = pid; +// } +// } +// } - void - Alignment:: - fill_sspan_maps() - { - sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0)); - spstats.resize(s.size()); - for (size_t i = 0; i < s.size(); ++i) - { - tsa::tree_iterator m(PT.btfix.I1.get()); - for (size_t k = i; k < s.size() && m.extend(s[k]); ++k) - { - uint64_t pid = m.getPid(); - sspan2pid[i][k] = pid; - pid2span_t::iterator p = spid2span.find(pid); - if (p != spid2span.end()) - { - int x = p->second[0].first; - int y = p->second[0].second-1; - spstats[i].push_back(spstats[x][y-x]); - } - else - { - spstats[i].push_back(PT.btfix.lookup(m)); - cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " " - << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt - << endl; - } - spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1)); - } - } - } +// void +// Alignment:: +// fill_sspan_maps() +// { +// sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0)); +// spstats.resize(s.size()); +// for (size_t i = 0; i < s.size(); ++i) +// { +// tsa::tree_iterator m(PT.btfix.I1.get()); +// for (size_t k = i; k < s.size() && m.extend(s[k]); ++k) +// { +// uint64_t pid = m.getPid(); +// sspan2pid[i][k] = pid; +// pid2span_t::iterator p = spid2span.find(pid); +// if (p != spid2span.end()) +// { +// int x = p->second[0].first; +// int y = p->second[0].second-1; +// spstats[i].push_back(spstats[x][y-x]); +// } +// else +// { +// spstats[i].push_back(PT.btfix.lookup(m)); +// cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " " +// << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt +// << endl; +// } +// spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1)); +// } +// } +// } - Alignment:: - Alignment(Mmsapt const& pt, string 
const& src, string const& trg) - : PT(pt) - { - PT.btfix.V1->fillIdSeq(src,s); - PT.btfix.V2->fillIdSeq(trg,t); +// Alignment:: +// Alignment(Mmsapt const& pt, string const& src, string const& trg) +// : PT(pt) +// { +// PT.btfix.V1->fillIdSeq(src,s); +// PT.btfix.V2->fillIdSeq(trg,t); - // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC; - // BOOST_FOREACH(id_type i, t) - // { - // cout << (*PT.btfix.V2)[i]; - // if (i < PT.wlex21.size()) - // { - // BOOST_FOREACH(id_type k, PT.wlex21[i]) - // { - // size_t j = COOC[k][i]; - // size_t m1 = COOC.m1(k); - // size_t m2 = COOC.m2(i); - // if (j*1000 > m1 && j*1000 > m2) - // cout << " " << (*PT.btfix.V1)[k]; - // } - // } - // cout << endl; - // } +// // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC; +// // BOOST_FOREACH(id_type i, t) +// // { +// // cout << (*PT.btfix.V2)[i]; +// // if (i < PT.wlex21.size()) +// // { +// // BOOST_FOREACH(id_type k, PT.wlex21[i]) +// // { +// // size_t j = COOC[k][i]; +// // size_t m1 = COOC.m1(k); +// // size_t m2 = COOC.m2(i); +// // if (j*1000 > m1 && j*1000 > m2) +// // cout << " " << (*PT.btfix.V1)[k]; +// // } +// // } +// // cout << endl; +// // } - fill_tspan_maps(); - fill_sspan_maps(); - tpos2ahyp.resize(t.size()); - // now fill the association score table - PAH.reserve(1000000); - typedef pid2span_t::iterator psiter; - for (psiter L = spid2span.begin(); L != spid2span.end(); ++L) - { - if (!L->second.size()) continue; // should never happen anyway - int i = L->second[0].first; - int k = L->second[0].second - i -1; - sptr<pstats> ps = spstats[i][k]; - PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents); - jStatsTable & J = ps->trg; - for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y) - { - psiter R = tpid2span.find(y->first); - if (R == tpid2span.end()) continue; - pp.update(y->first, y->second); - PT.ScorePPfix(pp); - pp.eval(PT.feature_weights); - PP.push_back(pp); - BOOST_FOREACH(span const& 
sspan, L->second) - { - BOOST_FOREACH(span const& tspan, R->second) - { - tpos2ahyp[tspan.first].push_back(PAH.size()); - PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan)); - } - } - } - } - } +// fill_tspan_maps(); +// fill_sspan_maps(); +// tpos2ahyp.resize(t.size()); +// // now fill the association score table +// PAH.reserve(1000000); +// typedef pid2span_t::iterator psiter; +// for (psiter L = spid2span.begin(); L != spid2span.end(); ++L) +// { +// if (!L->second.size()) continue; // should never happen anyway +// int i = L->second[0].first; +// int k = L->second[0].second - i -1; +// sptr<pstats> ps = spstats[i][k]; +// PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents); +// jStatsTable & J = ps->trg; +// for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y) +// { +// psiter R = tpid2span.find(y->first); +// if (R == tpid2span.end()) continue; +// pp.update(y->first, y->second); +// PT.ScorePPfix(pp); +// pp.eval(PT.feature_weights); +// PP.push_back(pp); +// BOOST_FOREACH(span const& sspan, L->second) +// { +// BOOST_FOREACH(span const& tspan, R->second) +// { +// tpos2ahyp[tspan.first].push_back(PAH.size()); +// PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan)); +// } +// } +// } +// } +// } - int - extend(vector<PhraseAlnHyp> & PAH, int edge, int next) - { - if ((PAH[edge].scov & PAH[next].scov).count()) - return -1; - int ret = PAH.size(); - PAH.push_back(PAH[next]); - PhraseAlnHyp & h = PAH.back(); - h.prev = edge; - h.scov |= PAH[edge].scov; - h.score += log(PAH[edge].dprob_fwd(PAH[next])); - h.score += log(PAH[next].dprob_bwd(PAH[edge])); - return ret; - } +// int +// extend(vector<PhraseAlnHyp> & PAH, int edge, int next) +// { +// if ((PAH[edge].scov & PAH[next].scov).count()) +// return -1; +// int ret = PAH.size(); +// PAH.push_back(PAH[next]); +// PhraseAlnHyp & h = PAH.back(); +// h.prev = edge; +// h.scov |= PAH[edge].scov; +// h.score += log(PAH[edge].dprob_fwd(PAH[next])); +// h.score += 
log(PAH[next].dprob_bwd(PAH[edge])); +// return ret; +// } - sptr<vector<int> > - Mmsapt:: - align(string const& src, string const& trg) const - { - // For the time being, we consult only the fixed bitext. - // We might also consider the dynamic bitext. => TO DO. - Alignment A(*this,src,trg); - VectorIndexSorter<PhraseAlnHyp> foo(A.PAH); - vector<size_t> o; foo.GetOrder(o); - BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]); - sptr<vector<int> > aln; - return aln; -} -} +// sptr<vector<int> > +// Mmsapt:: +// align(string const& src, string const& trg) const +// { +// // For the time being, we consult only the fixed bitext. +// // We might also consider the dynamic bitext. => TO DO. +// Alignment A(*this,src,trg); +// VectorIndexSorter<PhraseAlnHyp> foo(A.PAH); +// vector<size_t> o; foo.GetOrder(o); +// BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]); +// sptr<vector<int> > aln; +// return aln; +// } +// } diff --git a/moses/TranslationModel/UG/mmsapt_phrase_scorers.h b/moses/TranslationModel/UG/mmsapt_phrase_scorers.h index 6e852b44b..083afb3a3 100644 --- a/moses/TranslationModel/UG/mmsapt_phrase_scorers.h +++ b/moses/TranslationModel/UG/mmsapt_phrase_scorers.h @@ -1,268 +1,17 @@ // -*- c++ -*- +// written by Ulrich Germann #pragma once #include "moses/TranslationModel/UG/mm/ug_bitext.h" #include "util/exception.hh" +#include "boost/format.hpp" +#include "sapt_pscore_base.h" + +// DEPRECATED CODE: Word and phrase penalties are now +// added by the decoder. 
namespace Moses { namespace bitext { - - template<typename Token> - class - PhraseScorer - { - protected: - int m_index; - int m_num_feats; - vector<string> m_feature_names; - public: - - virtual - void - operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest=NULL) - const = 0; - - int - fcnt() const - { return m_num_feats; } - - vector<string> const & - fnames() const - { return m_feature_names; } - - string const & - fname(int i) const - { - UTIL_THROW_IF2((i < m_index || i >= m_index + m_num_feats), - "Feature name index out of range at " - << __FILE__ << ":" << __LINE__); - return m_feature_names.at(i - m_index); - } - - int - getIndex() const - { return m_index; } - }; - - //////////////////////////////////////////////////////////////////////////////// - - template<typename Token> - class - PScorePfwd : public PhraseScorer<Token> - { - float conf; - char denom; - public: - PScorePfwd() - { - this->m_num_feats = 1; - } - - int - init(int const i, float const c, char d) - { - conf = c; - denom = d; - this->m_index = i; - ostringstream buf; - buf << format("pfwd-%c%.3f") % denom % c; - this->m_feature_names.push_back(buf.str()); - return i + this->m_num_feats; - } - - void - operator()(Bitext<Token> const& bt, PhrasePair & pp, - vector<float> * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - if (pp.joint > pp.good1) - { - cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl; - cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl; - } - switch (denom) - { - case 'g': - (*dest)[this->m_index] = log(lbop(pp.good1, pp.joint, conf)); - break; - case 's': - (*dest)[this->m_index] = log(lbop(pp.sample1, pp.joint, conf)); - break; - case 'r': - (*dest)[this->m_index] = log(lbop(pp.raw1, pp.joint, conf)); - } - } - }; - - //////////////////////////////////////////////////////////////////////////////// - - template<typename Token> - class - PScorePbwd : public PhraseScorer<Token> - { - float conf; - char denom; - public: - PScorePbwd() - { 
- this->m_num_feats = 1; - } - - int - init(int const i, float const c, char d) - { - conf = c; - denom = d; - this->m_index = i; - ostringstream buf; - buf << format("pbwd-%c%.3f") % denom % c; - this->m_feature_names.push_back(buf.str()); - return i + this->m_num_feats; - } - - void - operator()(Bitext<Token> const& bt, PhrasePair& pp, - vector<float> * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - // we use the denominator specification to scale the raw counts on the - // target side; the clean way would be to counter-sample - uint32_t r2 = pp.raw2; - if (denom == 'g') r2 = round(r2 * float(pp.good1) / pp.raw1); - else if (denom == 's') r2 = round(r2 * float(pp.sample1) / pp.raw1); - (*dest)[this->m_index] = log(lbop(max(r2, pp.joint),pp.joint,conf)); - } - }; - - //////////////////////////////////////////////////////////////////////////////// - - template<typename Token> - class - PScoreCoherence : public PhraseScorer<Token> - { - public: - PScoreCoherence() - { - this->m_num_feats = 1; - } - - int - init(int const i) - { - this->m_index = i; - this->m_feature_names.push_back(string("coherence")); - return i + this->m_num_feats; - } - - void - operator()(Bitext<Token> const& bt, PhrasePair& pp, - vector<float> * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - (*dest)[this->m_index] = log(pp.good1) - log(pp.sample1); - } - }; - - //////////////////////////////////////////////////////////////////////////////// - - template<typename Token> - class - PScoreLogCounts : public PhraseScorer<Token> - { - float conf; - public: - PScoreLogCounts() - { - this->m_num_feats = 5; - } - - int - init(int const i) - { - this->m_index = i; - this->m_feature_names.push_back("log-r1"); - this->m_feature_names.push_back("log-s1"); - this->m_feature_names.push_back("log-g1"); - this->m_feature_names.push_back("log-j"); - this->m_feature_names.push_back("log-r2"); - return i + this->m_num_feats; - } - - void - operator()(Bitext<Token> const& bt, PhrasePair& pp, - 
vector<float> * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - size_t i = this->m_index; - assert(pp.raw1); - assert(pp.sample1); - assert(pp.good1); - assert(pp.joint); - assert(pp.raw2); - (*dest)[i] = -log(pp.raw1); - (*dest)[++i] = -log(pp.sample1); - (*dest)[++i] = -log(pp.good1); - (*dest)[++i] = +log(pp.joint); - (*dest)[++i] = -log(pp.raw2); - } - }; - - template<typename Token> - class - PScoreLex : public PhraseScorer<Token> - { - float const m_alpha; - public: - LexicalPhraseScorer2<Token> scorer; - - PScoreLex(float const a) - : m_alpha(a) - { this->m_num_feats = 2; } - - int - init(int const i, string const& fname) - { - scorer.open(fname); - this->m_index = i; - this->m_feature_names.push_back("lexfwd"); - this->m_feature_names.push_back("lexbwd"); - return i + this->m_num_feats; - } - - void - operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const - { - if (!dest) dest = &pp.fvals; - uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; - parse_pid(pp.p1, sid1, off1, len1); - parse_pid(pp.p2, sid2, off2, len2); - -#if 0 - cout << len1 << " " << len2 << endl; - Token const* t1 = bt.T1->sntStart(sid1); - for (size_t i = off1; i < off1 + len1; ++i) - cout << (*bt.V1)[t1[i].id()] << " "; - cout << __FILE__ << ":" << __LINE__ << endl; - - Token const* t2 = bt.T2->sntStart(sid2); - for (size_t i = off2; i < off2 + len2; ++i) - cout << (*bt.V2)[t2[i].id()] << " "; - cout << __FILE__ << ":" << __LINE__ << endl; - - BOOST_FOREACH (int a, pp.aln) - cout << a << " " ; - cout << __FILE__ << ":" << __LINE__ << "\n" << endl; - -#endif - scorer.score(bt.T1->sntStart(sid1)+off1,0,len1, - bt.T2->sntStart(sid2)+off2,0,len2, - pp.aln, m_alpha, - (*dest)[this->m_index], - (*dest)[this->m_index+1]); - } - - }; - /// Word penalty template<typename Token> class @@ -280,7 +29,8 @@ namespace Moses { } void - operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const + operator()(Bitext<Token> const& bt, 
PhrasePair<Token>& pp, + vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; uint32_t sid2=0,off2=0,len2=0; @@ -307,7 +57,8 @@ namespace Moses { } void - operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const + operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp, + vector<float> * dest = NULL) const { if (!dest) dest = &pp.fvals; (*dest)[this->m_index] = 1; diff --git a/moses/TranslationModel/UG/ptable-lookup.cc b/moses/TranslationModel/UG/ptable-lookup.cc index 106505f05..2cbf89b16 100644 --- a/moses/TranslationModel/UG/ptable-lookup.cc +++ b/moses/TranslationModel/UG/ptable-lookup.cc @@ -106,15 +106,11 @@ int main(int argc, char* argv[]) cout << " "; for (size_t k = idx.first; k < idx.second; ++k) { - if (mmsapt && fname[k-idx.first].substr(0,3) == "log") - { - if(scores[k] < 0) - cout << " " << format("%10d") % round(exp(-scores[k])); - else - cout << " " << format("%10d") % round(exp(scores[k])); - } - else - cout << " " << format("%10.8f") % exp(scores[k]); + size_t j = k-idx.first; + float f = (mmsapt ? mmsapt->isLogVal(j) ? exp(scores[k]) : scores[k] + : scores[k] < 0 ? exp(scores[k]) : scores[k]); + string fmt = (mmsapt && mmsapt->isInteger(j)) ? 
"%10d" : "%10.8f"; + cout << " " << format(fmt) % f; } cout << endl; } diff --git a/moses/TranslationModel/UG/sapt_phrase_key.h b/moses/TranslationModel/UG/sapt_phrase_key.h new file mode 100644 index 000000000..e1ecf1573 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_phrase_key.h @@ -0,0 +1,13 @@ +//-*- c++ -*- +#pragma once +#include <stdint.h> + +using namespace std; +namespace sapt +{ + using namespace Moses; + using namespace std; + + + +} diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h new file mode 100644 index 000000000..37cfd26fd --- /dev/null +++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h @@ -0,0 +1,12 @@ +// -*- c++ -*- +// Phrase scoring functions for suffix array-based phrase tables +// written by Ulrich Germann +#pragma once +#include "sapt_pscore_unaligned.h" // count # of unaligned words +#include "sapt_pscore_provenance.h" // reward for joint phrase occ. per corpus +#include "sapt_pscore_rareness.h" // penalty for rare occurrences (global?) 
+#include "sapt_pscore_logcnt.h" // logs of observed counts +#include "sapt_pscore_lex1.h" // plain vanilla Moses lexical scores +#include "sapt_pscore_pfwd.h" // fwd phrase prob +#include "sapt_pscore_pbwd.h" // bwd phrase prob +#include "sapt_pscore_coherence.h" // coherence feature: good/sample-size diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h new file mode 100644 index 000000000..68a491145 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_base.h @@ -0,0 +1,103 @@ +// -*- c++ -*- +// Base classes for suffix array-based phrase scorers +// written by Ulrich Germann +#pragma once +#include "moses/TranslationModel/UG/mm/ug_bitext.h" +#include "moses/TranslationModel/UG/mm/ug_phrasepair.h" +#include "util/exception.hh" +#include "boost/format.hpp" + +namespace Moses { + namespace bitext + { + + // abstract base class that defines the common API for phrase scorers + template<typename Token> + class + PhraseScorer + { + protected: + int m_index; + int m_num_feats; + string m_tag; + vector<string> m_feature_names; + public: + + virtual + void + operator()(Bitext<Token> const& pt, + PhrasePair<Token>& pp, + vector<float> * dest=NULL) + const = 0; + + void + setIndex(int const i) { m_index = i; } + + int + getIndex() const { return m_index; } + + int + fcnt() const { return m_num_feats; } + + vector<string> const & + fnames() const { return m_feature_names; } + + string const & + fname(int i) const + { + if (i < 0) i += m_num_feats; + UTIL_THROW_IF2(i < 0 || i >= m_num_feats, + "Feature name index out of range at " << HERE); + return m_feature_names.at(i); + } + + virtual + bool + isLogVal(int i) const { return true; }; + // is this feature log valued? + + virtual + bool + isIntegerValued(int i) const { return false; }; + // is this feature integer valued (e.g., count features)? 
+ + virtual + bool + allowPooling() const { return true; } + // does this feature function allow pooling of counts if + // there are no occurrences in the respective corpus? + + }; + + // base class for 'families' of phrase scorers that have a single + template<typename Token> + class + SingleRealValuedParameterPhraseScorerFamily + : public PhraseScorer<Token> + { + protected: + vector<float> m_x; + + virtual + void + init(string const specs) + { + using namespace boost; + UTIL_THROW_IF2(this->m_tag.size() == 0, + "m_tag must be initialized in constructor"); + UTIL_THROW_IF2(specs.size() == 0,"empty specification string!"); + UTIL_THROW_IF2(this->m_feature_names.size(), + "PhraseScorer can only be initialized once!"); + this->m_index = -1; + float x; char c; + for (istringstream buf(specs); buf>>x; buf>>c) + { + this->m_x.push_back(x); + string fname = (format("%s-%.2f") % this->m_tag % x).str(); + this->m_feature_names.push_back(fname); + } + this->m_num_feats = this->m_x.size(); + } + }; + } // namespace bitext +} // namespace moses diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h new file mode 100644 index 000000000..a3211df54 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h @@ -0,0 +1,33 @@ +// -*- c++ -*- +// written by Ulrich Germann +#pragma once +#include "moses/TranslationModel/UG/mm/ug_bitext.h" +#include "util/exception.hh" +#include "boost/format.hpp" + +namespace Moses { + namespace bitext + { + template<typename Token> + class + PScoreCoherence : public PhraseScorer<Token> + { + public: + PScoreCoherence(string const dummy) + { + this->m_index = -1; + this->m_num_feats = 1; + this->m_feature_names.push_back(string("coherence")); + } + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + (*dest)[this->m_index] = log(pp.good1) - log(pp.sample1); + } + }; + } +} diff --git 
a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h new file mode 100644 index 000000000..be994b0d3 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h @@ -0,0 +1,70 @@ +// -*- c++ -*- +// Phrase scorer that counts the number of unaligend words in the phrase +// written by Ulrich Germann + +#include "moses/TranslationModel/UG/mm/ug_bitext.h" +#include "sapt_pscore_base.h" +#include <boost/dynamic_bitset.hpp> + +namespace Moses { + namespace bitext + { + template<typename Token> + class + PScoreLex1 : public PhraseScorer<Token> + { + float m_alpha; + public: + LexicalPhraseScorer2<Token> scorer; + + PScoreLex1(string const& alpaspec, string const& lexfile) + { + this->m_index = -1; + this->m_num_feats = 2; + this->m_feature_names.reserve(2); + this->m_feature_names.push_back("lexfwd"); + this->m_feature_names.push_back("lexbwd"); + m_alpha = atof(alpaspec.c_str()); + scorer.open(lexfile); + } + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; + // parse_pid(pp.p1, sid1, off1, len1); + // parse_pid(pp.p2, sid2, off2, len2); +#if 0 + cout << len1 << " " << len2 << endl; + Token const* t1 = bt.T1->sntStart(sid1); + for (size_t i = off1; i < off1 + len1; ++i) + cout << (*bt.V1)[t1[i].id()] << " "; + cout << __FILE__ << ":" << __LINE__ << endl; + + Token const* t2 = bt.T2->sntStart(sid2); + for (size_t i = off2; i < off2 + len2; ++i) + cout << (*bt.V2)[t2[i].id()] << " "; + cout << __FILE__ << ":" << __LINE__ << endl; + + BOOST_FOREACH (int a, pp.aln) + cout << a << " " ; + cout << __FILE__ << ":" << __LINE__ << "\n" << endl; + + scorer.score(bt.T1->sntStart(sid1)+off1,0,len1, + bt.T2->sntStart(sid2)+off2,0,len2, + pp.aln, m_alpha, + (*dest)[this->m_index], + (*dest)[this->m_index+1]); +#endif + scorer.score(pp.start1,0, pp.len1, + pp.start2,0, pp.len2, pp.aln, m_alpha, 
+ (*dest)[this->m_index], + (*dest)[this->m_index+1]); + } + }; + } //namespace bitext +} // namespace Moses + diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h new file mode 100644 index 000000000..2790323ed --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h @@ -0,0 +1,65 @@ +// -*- c++ -*- +// Phrase scorer that rewards the number of phrase pair occurrences in a bitext +// with the asymptotic function x/(j+x) where x > 0 is a function +// parameter that determines the steepness of the rewards curve +// written by Ulrich Germann + +#include "sapt_pscore_base.h" +#include <boost/dynamic_bitset.hpp> + +using namespace std; +namespace Moses { + namespace bitext { + + template<typename Token> + class + PScoreLogCnt : public PhraseScorer<Token> + { + string m_specs; + public: + PScoreLogCnt(string const specs) + { + this->m_index = -1; + this->m_specs = specs; + if (specs.find("r1") != string::npos) // raw source phrase counts + this->m_feature_names.push_back("log-r1"); + if (specs.find("s1") != string::npos) + this->m_feature_names.push_back("log-s1"); // L1 sample size + if (specs.find("g1") != string::npos) // coherent phrases + this->m_feature_names.push_back("log-g1"); + if (specs.find("j") != string::npos) // joint counts + this->m_feature_names.push_back("log-j"); + if (specs.find("r2") != string::npos) // raw target phrase counts + this->m_feature_names.push_back("log-r2"); + this->m_num_feats = this->m_feature_names.size(); + } + + bool + isIntegerValued(int i) const { return true; } + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + assert(pp.raw1); + assert(pp.sample1); + assert(pp.good1); + assert(pp.joint); + assert(pp.raw2); + size_t i = this->m_index; + if (m_specs.find("r1") != string::npos) + (*dest)[i++] = log(pp.raw1); + if (m_specs.find("s1") != string::npos) + (*dest)[i++] = 
log(pp.sample1); + if (m_specs.find("g1") != string::npos) + (*dest)[i++] = log(pp.good1); + if (m_specs.find("j") != string::npos) + (*dest)[i++] = log(pp.joint); + if (m_specs.find("r2") != string::npos) + (*dest)[++i] = log(pp.raw2); + } + }; + } // namespace bitext +} // namespace Moses diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h new file mode 100644 index 000000000..f7b4686d7 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h @@ -0,0 +1,58 @@ +//-*- c++ -*- +// written by Ulrich Germann +#pragma once +#include "moses/TranslationModel/UG/mm/ug_bitext.h" +#include "util/exception.hh" +#include "boost/format.hpp" +#include "boost/foreach.hpp" + +namespace Moses { + namespace bitext + { + template<typename Token> + class + PScorePbwd : public PhraseScorer<Token> + { + float conf; + string denom; + + public: + PScorePbwd(float const c, string d) + { + this->m_index = -1; + conf = c; + denom = d; + size_t checksum = d.size(); + BOOST_FOREACH(char const& x, denom) + { + if (x == '+') { --checksum; continue; } + if (x != 'g' && x != 's' && x != 'r') continue; + string s = (format("pbwd-%c%.3f") % x % c).str(); + this->m_feature_names.push_back(s); + } + this->m_num_feats = this->m_feature_names.size(); + UTIL_THROW_IF2(this->m_feature_names.size() != checksum, + "Unknown parameter in specification '" + << d << "' for Pbwd phrase scorer at " << HERE); + } + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + // we use the denominator specification to scale the raw counts on the + // target side; the clean way would be to counter-sample + size_t i = this->m_index; + BOOST_FOREACH(char const& x, denom) + { + uint32_t m2 = pp.raw2; + if (x == 'g') m2 = round(m2 * float(pp.good1) / pp.raw1); + else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1); + (*dest)[i++] = log(lbop(max(m2, 
pp.joint),pp.joint,conf)); + } + } + }; + } // namespace bitext +} // namespace Moses diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h new file mode 100644 index 000000000..ed48a93d2 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h @@ -0,0 +1,70 @@ +// -*- c++ -*- +// written by Ulrich Germann +#pragma once +#include "moses/TranslationModel/UG/mm/ug_bitext.h" +#include "util/exception.hh" +#include "boost/format.hpp" +#include "boost/foreach.hpp" + +namespace Moses { + namespace bitext + { + template<typename Token> + class + PScorePfwd : public PhraseScorer<Token> + { + float conf; + string denom; + + public: + + PScorePfwd(float const c, string d) + { + this->m_index = -1; + conf = c; + denom = d; + size_t checksum = d.size(); + BOOST_FOREACH(char const& x, denom) + { + if (x == '+') { --checksum; continue; } + if (x != 'g' && x != 's' && x != 'r') continue; + string s = (format("pfwd-%c%.3f") % x % c).str(); + this->m_feature_names.push_back(s); + } + this->m_num_feats = this->m_feature_names.size(); + UTIL_THROW_IF2(this->m_feature_names.size() != checksum, + "Unknown parameter in specification '" + << d << "' for Pfwd phrase scorer at " << HERE); + } + + void + operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + if (pp.joint > pp.good1) + { + pp.joint = pp.good1; + // cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl; + // cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl; + } + size_t i = this->m_index; + BOOST_FOREACH(char const& c, this->denom) + { + switch (c) + { + case 'g': + (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf)); + break; + case 's': + (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf)); + break; + case 'r': + (*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf)); + } + } + } + }; + } +} + diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h 
b/moses/TranslationModel/UG/sapt_pscore_provenance.h new file mode 100644 index 000000000..c33b98fe7 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h @@ -0,0 +1,47 @@ +// -*- c++ -*- +// Phrase scorer that rewards the number of phrase pair occurrences in a bitext +// with the asymptotic function j/(j+x) where x > 0 is a function +// parameter that determines the steepness of the rewards curve +// written by Ulrich Germann + +#include "sapt_pscore_base.h" +#include <boost/dynamic_bitset.hpp> + +using namespace std; +namespace Moses { + namespace bitext { + + // asymptotic provenance feature n/(n+x) + template<typename Token> + class + PScoreProvenance : public SingleRealValuedParameterPhraseScorerFamily<Token> + { + public: + + PScoreProvenance(string const& spec) + { + this->m_tag = "prov"; + this->init(spec); + } + + bool + isLogVal(int i) const { return false; } + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + size_t i = this->m_index; + BOOST_FOREACH(float const x, this->m_x) + (*dest).at(i++) = pp.joint/(x + pp.joint); + } + + bool + allowPooling() const + { return false; } + + }; + } // namespace bitext +} // namespace Moses diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h new file mode 100644 index 000000000..58f204c88 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h @@ -0,0 +1,41 @@ +// -*- c++ -*- +// Phrase scorer that rewards the number of phrase pair occurrences in a bitext +// with the asymptotic function x/(j+x) where x > 0 is a function +// parameter that determines the steepness of the rewards curve +// written by Ulrich Germann + +#include "sapt_pscore_base.h" +#include <boost/dynamic_bitset.hpp> + +using namespace std; +namespace Moses { + namespace bitext { + + // rareness penalty: x/(n+x) + template<typename Token> + class + PScoreRareness : 
public SingleRealValuedParameterPhraseScorerFamily<Token> + { + public: + PScoreRareness(string const spec) + { + this->m_tag = "rare"; + this->init(spec); + } + + bool + isLogVal(int i) const { return false; } + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + size_t i = this->m_index; + BOOST_FOREACH(float const x, this->m_x) + (*dest).at(i++) = x/(x + pp.joint); + } + }; + } // namespace bitext +} // namespace Moses diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h new file mode 100644 index 000000000..bdd2919b4 --- /dev/null +++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h @@ -0,0 +1,67 @@ +// -*- c++ -*- +// Phrase scorer that counts the number of unaligend words in the phrase +// written by Ulrich Germann + +#include "sapt_pscore_base.h" +#include <boost/dynamic_bitset.hpp> + +namespace Moses { + namespace bitext + { + template<typename Token> + class + PScoreUnaligned : public PhraseScorer<Token> + { + typedef boost::dynamic_bitset<uint64_t> bitvector; + public: + PScoreUnaligned(string const spec) + { + this->m_index = -1; + int f = this->m_num_feats = atoi(spec.c_str()); + UTIL_THROW_IF2(f != 1 && f != 2,"unal parameter must be 1 or 2 at "<<HERE); + this->m_feature_names.resize(f); + if (f == 1) + this->m_feature_names[0] = "unal"; + else + { + this->m_feature_names[0] = "unal-s"; + this->m_feature_names[1] = "unal-t"; + } + } + + bool + isLogVal(int i) const { return false; } + + bool + isIntegerValued(int i) const { return true; } + + void + operator()(Bitext<Token> const& bt, + PhrasePair<Token>& pp, + vector<float> * dest = NULL) const + { + if (!dest) dest = &pp.fvals; + // uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0; + // parse_pid(pp.p1, sid1, off1, len1); + // parse_pid(pp.p2, sid2, off2, len2); + bitvector check1(pp.len1),check2(pp.len2); + for (size_t i = 0; i < 
pp.aln.size(); ) + { + check1.set(pp.aln[i++]); + check2.set(pp.aln.at(i++)); + } + + if (this->m_num_feats == 1) + { + (*dest)[this->m_index] = pp.len1 - check1.count(); + (*dest)[this->m_index] += pp.len2 - check2.count(); + } + else + { + (*dest)[this->m_index] = pp.len1 - check1.count(); + (*dest)[this->m_index+1] = pp.len2 - check2.count(); + } + } + }; + } // namespace bitext +} // namespace Moses diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc new file mode 100644 index 000000000..58a70cab4 --- /dev/null +++ b/moses/TranslationModel/UG/sim-pe.cc @@ -0,0 +1,83 @@ +#include "mmsapt.h" +#include "moses/Manager.h" +#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" +#include <boost/foreach.hpp> +#include <boost/format.hpp> +#include <boost/tokenizer.hpp> +#include <boost/shared_ptr.hpp> +#include <algorithm> +#include <iostream> + +using namespace Moses; +using namespace bitext; +using namespace std; +using namespace boost; + +vector<FactorType> fo(1,FactorType(0)); + +ostream& +operator<<(ostream& out, Hypothesis const* x) +{ + vector<const Hypothesis*> H; + for (const Hypothesis* h = x; h; h = h->GetPrevHypo()) + H.push_back(h); + for (; H.size(); H.pop_back()) + { + Phrase const& p = H.back()->GetCurrTargetPhrase(); + for (size_t pos = 0 ; pos < p.GetSize() ; pos++) + out << *p.GetFactor(pos, 0) << (H.size() ? 
" " : ""); + } + return out; +} + +vector<FactorType> ifo; +size_t lineNumber; + +string +translate(string const& source) +{ + StaticData const& global = StaticData::Instance(); + + Sentence sentence; + istringstream ibuf(source+"\n"); + sentence.Read(ibuf,ifo); + + Manager manager(lineNumber, sentence, global.GetSearchAlgorithm()); + manager.ProcessSentence(); + + ostringstream obuf; + const Hypothesis* h = manager.GetBestHypothesis(); + obuf << h; + return obuf.str(); + +} + +int main(int argc, char* argv[]) +{ + Parameter params; + if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0])) + exit(1); + + StaticData const& global = StaticData::Instance(); + global.SetVerboseLevel(0); + ifo = global.GetInputFactorOrder(); + + lineNumber = 0; // TODO: Include sentence request number here? + string source, target, alignment; + while (getline(cin,source)) + { + getline(cin,target); + getline(cin,alignment); + cout << "[S] " << source << endl; + cout << "[H] " << translate(source) << endl; + cout << "[T] " << target << endl; + Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]); + pdsa->add(source,target,alignment); + cout << "[X] " << translate(source) << endl; + cout << endl; + } + exit(0); +} + + + diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc index 30c87ccab..483ad2c34 100644 --- a/moses/TranslationModel/UG/try-align.cc +++ b/moses/TranslationModel/UG/try-align.cc @@ -2,32 +2,33 @@ using namespace std; using namespace Moses; +// currently broken Mmsapt* PT; int main(int argc, char* argv[]) { - string base = argv[1]; - string L1 = argv[2]; - string L2 = argv[3]; - ostringstream buf; - buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base=" - << base << " L1=" << L1 << " L2=" << L2; - string configline = buf.str(); - PT = new Mmsapt(configline); - PT->Load(); - float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 }; - vector<float> weights(w,w+5); - 
PT->setWeights(weights); - // these values are taken from a moses.ini file; - // is there a convenient way of accessing them from within mmsapt ??? - string eline,fline; - // TokenIndex V; V.open("crp/trn/mm/de.tdx"); - while (getline(cin,eline) && getline(cin,fline)) - { - cout << eline << endl; - cout << fline << endl; - PT->align(eline,fline); - } - delete PT; + // string base = argv[1]; + // string L1 = argv[2]; + // string L2 = argv[3]; + // ostringstream buf; + // buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base=" + // << base << " L1=" << L1 << " L2=" << L2; + // string configline = buf.str(); + // PT = new Mmsapt(configline); + // PT->Load(); + // float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 }; + // vector<float> weights(w,w+5); + // PT->setWeights(weights); + // // these values are taken from a moses.ini file; + // // is there a convenient way of accessing them from within mmsapt ??? + // string eline,fline; + // // TokenIndex V; V.open("crp/trn/mm/de.tdx"); + // while (getline(cin,eline) && getline(cin,fline)) + // { + // cout << eline << endl; + // cout << fline << endl; + // PT->align(eline,fline); + // } + // delete PT; } diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp index 8766743b3..a91c58343 100644 --- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -345,10 +345,10 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co // find the best matches according to letter sed string best_path = ""; int best_match = -1; - int best_letter_cost; + unsigned int best_letter_cost; if (lsed_flag) { best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1; - for(int si=0; si<best_tm.size(); si++) { + for(size_t si=0; si<best_tm.size(); si++) { int s = best_tm[si]; string path; unsigned int letter_cost = sed( input[sentenceInd], 
source[s], path, true ); diff --git a/moses/TypeDef.h b/moses/TypeDef.h index fb9fd56cb..7852d130d 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -59,7 +59,11 @@ const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200; const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000; const size_t DEFAULT_MAX_TRANS_OPT_SIZE = 5000; const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000; -const size_t DEFAULT_MAX_PHRASE_LENGTH = 20; +#ifdef PT_UG + const size_t DEFAULT_MAX_PHRASE_LENGTH = -1; +#else + const size_t DEFAULT_MAX_PHRASE_LENGTH = 20; +#endif const size_t DEFAULT_MAX_CHART_SPAN = 10; const size_t ARRAY_SIZE_INCR = 10; //amount by which a phrase gets resized when necessary const float LOWEST_SCORE = -100.0f; diff --git a/moses/Util.h b/moses/Util.h index 3bba71332..24a4e2c28 100644 --- a/moses/Util.h +++ b/moses/Util.h @@ -56,8 +56,12 @@ namespace Moses /** verbose macros * */ + #define VERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR(str); } } #define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level) +#define XVERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR("[" << __FILE__ << ":" << __LINE__ << "] ");TRACE_ERR(str); } } +#define HERE __FILE__ << ":" << __LINE__ + #if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2) // gcc nth_element() bug diff --git a/scripts/server/moses.py b/scripts/server/moses.py index 155458b9b..a176c473a 100644 --- a/scripts/server/moses.py +++ b/scripts/server/moses.py @@ -152,7 +152,7 @@ def find_free_port(p): class MosesServer(ProcessWrapper): - def __init__(self,args=["-fd", "\n"]): + def __init__(self,args=[]): self.process = None mserver_cmd = moses_root+"/bin/mosesserver" self.cmd = [mserver_cmd] + args @@ -175,7 +175,10 @@ class MosesServer(ProcessWrapper): self.cmd.extend(["--server-port", "%d"%self.port]) if debug: print >>sys.stderr,self.cmd - self.process = Popen(self.cmd,stderr = sys.stderr) 
+ # self.stderr = open("mserver.%d.stderr"%self.port,'w') + # self.stdout = open("mserver.%d.stdout"%self.port,'w') + # self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout) + self.process = Popen(self.cmd) else: devnull = open(os.devnull,"w") self.process = Popen(self.cmd, stderr=devnull, stdout=devnull) @@ -216,10 +219,13 @@ class MosesServer(ProcessWrapper): elif type(input) is list: return [self.translate(x) for x in input] + elif type(input) is dict: return self.proxy.translate(input) + else: raise Exception("Can't handle input of this type!") + except: attempts += 1 print >>sys.stderr, "WAITING", attempts diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py index 340695a56..52d1e314a 100755 --- a/scripts/server/sim-pe.py +++ b/scripts/server/sim-pe.py @@ -127,13 +127,40 @@ def translate(proxy, args, line): param['nbest-distinct'] = True pass attempts = 0 - while attempts < 120: + while attempts < 20: + t1 = time.time() try: - return proxy.translate(param) - except: - print >>sys.stderr, "Waiting", proxy - attempts += 1 + return proxy.translate(param) + + # except xmlrpclib.Fault as e: + # except xmlrpclib.ProtocolError as e: + # except xmlrpclib.ResponseError as e: + except xmlrpclib.Error as e: + time.sleep(2) # give all the stderr stuff a chance to be flushed + print >>sys.stderr," XMLRPC error:",e + print >>sys.stderr, "Input was" + print >>sys.stderr, param + sys.exit(1) + + except IOError as e: + print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror) time.sleep(5) + + except: + serverstatus = mserver.process.poll() + if serverstatus == None: + print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1) + attempts += 1 + if attempts > 10: + time.sleep(10) + else: + time.sleep(5) + pass + else: + + print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\ + %(serverstatus/256,serverstatus%256) + pass pass pass raise Exception("Exception: could not reach translation server.") @@ 
-210,17 +237,25 @@ if __name__ == "__main__": pass pass - if args.url: - mserver.connect(args.url) - else: - mserver.start(args=mo_args,port=args.port,debug=args.debug) - pass - ref = None aln = None if args.ref: ref = read_data(args.ref) if args.aln: aln = read_data(args.aln) + if ref and aln: + try: + mo_args.index("--serial") + except: + mo_args.append("--serial") + pass + pass + + if args.url: + mserver.connect(args.url) + else: + mserver.start(args=mo_args, port=args.port, debug=args.debug) + pass + if (args.input == "-"): line = sys.stdin.readline() idx = 0 |